diff --git a/.github/actions/build-asset-unix.sh b/.github/actions/build-asset-unix.sh
index b7fc8920..d96f234f 100755
--- a/.github/actions/build-asset-unix.sh
+++ b/.github/actions/build-asset-unix.sh
@@ -1,12 +1,28 @@
#! /usr/bin/env bash
# NOTE: This is meant to be run from the repo root dir
-#
-# Expects env variables:
-# - BB_ARTIFACT_NAME
-# - BB_VERSION
-#
+
set -eo pipefail
+compile_cuda=0
+artifact_name=bladebit
+version=v1.0
+
+while true; do
+ case $1 in
+ --cuda)
+ compile_cuda=1
+ ;;
+ --artifact)
+ shift && artifact_name=$1 || exit 1
+ ;;
+ --version)
+ shift && version=$1 || exit 1
+ ;;
+ esac
+ shift || break
+done
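+# Usage (as invoked by the workflows):
+#   build-asset-unix.sh [--cuda] --artifact <artifact-name> --version <version>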
+
+
thread_count=2
if [[ $OSTYPE == 'darwin'* ]]; then
@@ -19,11 +35,19 @@ fi
echo "System: $(uname -s)"
gcc --version
-mkdir build && cd build
-cmake ..
+exe_name=bladebit
+target=bladebit
+if [[ $compile_cuda -eq 1 ]]; then
+ target=bladebit_cuda
+ exe_name=bladebit_cuda
+fi
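+# Build in a per-target directory (build-bladebit / build-bladebit_cuda) so the CPU and CUDA
+# builds can run back-to-back within the same job without clobbering each other.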
+
+set -x
+mkdir build-${target} && cd build-${target}
+cmake .. -DCMAKE_BUILD_TYPE=Release
bash -eo pipefail ../embed-version.sh
-cmake --build . --target bladebit --config Release -j $thread_count
-chmod +x ./bladebit
+cmake --build . --config Release --target $target -j $thread_count
+chmod +x ./${exe_name}
if [[ $OSTYPE == 'msys'* ]] || [[ $OSTYPE == 'cygwin'* ]]; then
ls -la Release
@@ -32,16 +56,16 @@ else
fi
# Ensure bladebit version matches expected version
-bb_version="$(./bladebit --version | xargs)"
+bb_version="$(./${exe_name} --version | xargs)"
-if [[ "$bb_version" != "$BB_VERSION" ]]; then
- >&2 echo "Incorrect bladebit version. Got '$bb_version' but expected '$BB_VERSION'."
+if [[ "$bb_version" != "$version" ]]; then
+ >&2 echo "Incorrect bladebit version. Got '$bb_version' but expected '$version'."
exit 1
fi
tar --version
-tar -czvf $BB_ARTIFACT_NAME bladebit
-mkdir ../bin
-mv $BB_ARTIFACT_NAME ../bin/
+tar -czvf $artifact_name $exe_name
+mkdir -p ../bin
+mv $artifact_name ../bin/
ls -la ../bin
diff --git a/.github/actions/build-harvester.sh b/.github/actions/build-harvester.sh
new file mode 100644
index 00000000..2460a279
--- /dev/null
+++ b/.github/actions/build-harvester.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+set -eo pipefail
+if [[ $RUNNER_DEBUG = 1 ]]; then
+ set -x
+fi
+
+host_os=$(uname -a)
+case "${host_os}" in
+ Linux*) host_os="linux";;
+ Darwin*) host_os="macos";;
+ CYGWIN*) host_os="windows";;
+ MINGW*) host_os="windows";;
+ *Msys) host_os="windows";;
+esac
+
+if [[ "$host_os" == "windows" ]]; then
+ ext="zip"
+else
+ ext="tar.gz"
+fi
+
+if [[ "$host_os" == "macos" ]]; then
+ procs=$(sysctl -n hw.logicalcpu)
+ sha_sum="shasum -a 256"
+else
+ procs=$(nproc --all)
+ sha_sum="sha256sum"
+fi
+
+artifact_name=green_reaper.$ext
+
+while true; do
+ case $1 in
+ --artifact)
+ shift && artifact_name=$1 || exit 1
+ ;;
+ esac
+ shift || break
+done
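+# Example invocation (as done by the build-harvester-* workflow jobs):
+#   bash .github/actions/build-harvester.sh --artifact "green_reaper-v<version>-<os>-<arch>.tar.gz"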
+
+echo "Harvester artifact: ${artifact_name}"
+echo 'cmake --version'
+cmake --version
+
+mkdir -p build-harvester
+pushd build-harvester
+cmake .. -DCMAKE_BUILD_TYPE=Release -DBB_HARVESTER_ONLY=ON
+
+cmake --build . --config Release --target bladebit_harvester
+
+if [[ "$host_os" == "windows" ]]; then
+ OBJDUMP=$("${CUDA_PATH}"\\bin\\cuobjdump Release\\bladebit_harvester.dll)
+elif [[ "$host_os" == "linux" ]]; then
+ OBJDUMP=$(/usr/local/cuda/bin/cuobjdump libbladebit_harvester.so)
+fi
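+# $OBJDUMP is parsed further below into the GitHub step summary table (Arch / Code Version / Host / Compile Size).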
+
+cmake --install . --prefix harvester_dist
+pushd harvester_dist/green_reaper
+
+if [[ "$host_os" == "windows" ]]; then
+ mkdir -p lib
+ cp -vn ../../*/*.dll lib/
+ cp -vn ../../*/*.lib lib/
+fi
+
+artifact_files=($(find . -type f -name '*.*' | cut -c3-))
+
+# shellcheck disable=SC2068
+$sha_sum ${artifact_files[@]} > sha256checksum
+
+artifact_files+=("sha256checksum")
+
+if [[ "$host_os" == "windows" ]]; then
+ 7z.exe a -tzip "${artifact_name}" "${artifact_files[@]}"
+else
+ # shellcheck disable=SC2068
+ tar -czvf "${artifact_name}" ${artifact_files[@]}
+fi
+
+popd
+mv "harvester_dist/green_reaper/${artifact_name}" ./
+$sha_sum "${artifact_name}" > "${artifact_name}.sha256.txt"
+ls -la
+cat "${artifact_name}.sha256.txt"
+
+if [[ "$CI" == "true" ]]; then
+ if [[ "$host_os" == "windows" ]] || [[ "$host_os" == "linux" ]]; then
+ while IFS= read -r line; do
+ echo -e "$(echo ${line#* } | tr -d '*')\n###### ${line%% *}\n"
+ done <"${artifact_name}.sha256.txt" >> "$GITHUB_STEP_SUMMARY"
+ echo "| Arch | Code Version | Host | Compile Size |" >> "$GITHUB_STEP_SUMMARY"
+ echo "| --- | --- | --- | --- |" >> "$GITHUB_STEP_SUMMARY"
+ echo "$OBJDUMP" | awk -v RS= -v FS='\n' -v OFS=' | ' '{
+ for (i=1; i<=NF; i++) {
+ if (index($i, "=")) {
+ gsub(/.* = /, "", $i);
+ }
+ }
+ print $3, $4, $5, $6;
+ }' | sed 's/^/| /; s/$/ |/; s/ | | / | /g' >> "$GITHUB_STEP_SUMMARY"
+ fi
+
+ if [[ "$host_os" == "windows" ]]; then
+ harvester_artifact_path="$(cygpath -m "$(pwd)/${artifact_name}")*"
+ else
+ harvester_artifact_path="$(pwd)/${artifact_name}*"
+ fi
+ echo "harvester_artifact_path=$harvester_artifact_path"
+ echo "harvester_artifact_path=$harvester_artifact_path" >> "$GITHUB_ENV"
+fi
+
+popd
+ls -la
diff --git a/.github/actions/get-version.sh b/.github/actions/get-version.sh
index 81dea115..16c51dda 100755
--- a/.github/actions/get-version.sh
+++ b/.github/actions/get-version.sh
@@ -29,6 +29,8 @@ if [[ "$os" == "windows" ]]; then
ext="zip"
fi
-echo "::set-output name=BB_VERSION::$version"
-echo "::set-output name=BB_ARTIFACT_NAME::bladebit-v${version}-${os}-${arch}.${ext}"
+echo "BB_VERSION=$version" >> $GITHUB_ENV
+echo "BB_ARTIFACT_NAME=bladebit-v${version}-${os}-${arch}.${ext}" >> $GITHUB_ENV
+echo "BB_ARTIFACT_NAME_CUDA=bladebit-cuda-v${version}-${os}-${arch}.${ext}" >> $GITHUB_ENV
+
diff --git a/.github/actions/install-cmake-linux.sh b/.github/actions/install-cmake-linux.sh
new file mode 100644
index 00000000..ec75913a
--- /dev/null
+++ b/.github/actions/install-cmake-linux.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -eo pipefail
+ref_cmake_sha256='39e1c2eccda989b0d000dc5f4ee2cb031bdda799163780d855acc0bd9eda9d92'
+cmake_name='cmake-3.23.3-linux-x86_64'
+
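+# Download the pinned CMake release and verify it against ref_cmake_sha256 before installing into /usr/local.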
+curl -L https://github.com/Kitware/CMake/releases/download/v3.23.3/cmake-3.23.3-linux-x86_64.tar.gz > cmake.tar.gz
+
+cmake_sh_sha256=$(sha256sum cmake.tar.gz | cut -f1 -d' ')
+if [[ "${ref_cmake_sha256}" != "${cmake_sh_sha256}" ]]; then
+    >&2 echo "sha256 mismatch!"
+    >&2 echo "Got     : '${cmake_sh_sha256}'"
+    >&2 echo "Expected: '${ref_cmake_sha256}'"
+ exit 1
+fi
+
+rm -f /usr/bin/cmake && rm -f /usr/local/bin/cmake
+mkdir -p /usr/local/bin
+mkdir -p /usr/local/share
+
+cmake_prefix=$(pwd)/${cmake_name}
+tar -xzvf cmake.tar.gz
+ls -la
+ls -la ${cmake_prefix}
+
+cp -r ${cmake_prefix}/bin/* /usr/local/bin/
+cp -r ${cmake_prefix}/share/* /usr/local/share/
+
+echo 'Cmake Info:'
+which cmake
+cmake --version
+
+echo 'Done.'
+exit 0
diff --git a/.github/workflows/attach-release-assets.yml b/.github/workflows/attach-release-assets.yml
index 50818edd..d0df509d 100644
--- a/.github/workflows/attach-release-assets.yml
+++ b/.github/workflows/attach-release-assets.yml
@@ -35,8 +35,15 @@ jobs:
bladebit-v${BB_VERSION}-ubuntu-arm64.tar.gz
bladebit-v${BB_VERSION}-centos-arm64.tar.gz
bladebit-v${BB_VERSION}-windows-x86-64.zip
- bladebit-v${BB_VERSION}-macos-arm64.tar.gz
- bladebit-v${BB_VERSION}-macos-x86-64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-ubuntu-x86-64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-centos-x86-64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-ubuntu-arm64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-windows-x86-64.zip
+ green_reaper-v${BB_VERSION}-linux-x86-64.tar.gz
+ green_reaper-v${BB_VERSION}-linux-ARM64.tar.gz
+ green_reaper-v${BB_VERSION}-macos-x86-64.tar.gz
+ green_reaper-v${BB_VERSION}-macos-arm64.tar.gz
+ green_reaper-v${BB_VERSION}-windows-x86-64.zip
)
mkdir -p bin
@@ -59,4 +66,4 @@ jobs:
echo "Uploading release asset '${artifact_name}'"
node .github/actions/artifacts.mjs upload-release-asset $BB_VERSION $artifact_name bin/$artifact_name
done
-
+
diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index a70fbacb..0cfc70cc 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -4,100 +4,339 @@ on:
branches: ['*']
workflow_dispatch:
+env:
+ CI_BLADEBIT: 1 # Our own CI, that is, not being built as a dependency
+
jobs:
- build-ubuntu-x86-64:
+ build-harvester-linux-x86-64:
runs-on: ubuntu-20.04
+ container:
+ image: ghcr.io/chia-network/build-images/manylinux2014_cuda_x86_64:sha-1caf046d5ff19b7c743de2a106dd86928794032b
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
id: version_number
- run: .github/actions/get-version.sh ubuntu x86-64
+ shell: bash
+ run: ./.github/actions/get-version.sh ubuntu x86-64
- name: Install Prerequisites
- run: sudo apt install -y libgmp-dev libnuma-dev
+ shell: bash
+ run: |
+ set -eo pipefail
+ yum group install -y "Development Tools"
+ yum install -y sudo make git wget subscription-manager
- - name: Build
+ - name: Build Harvester
+ shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-linux-x86-64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+
+ - name: Upload Harvester Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
+ if-no-files-found: error
+
+ build-harvester-windows-x86-64:
+ runs-on: windows-2022
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ shell: bash
+ run: ./.github/actions/get-version.sh windows x86-64
+
+ - name: Install Prerequisites
+ shell: powershell
+ run: |
+ choco install -y make
+ choco install -y wget
+ choco install -y sed
+
+ - name: Setup CUDA
+ uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
+ with:
+ cuda: '12.1.0'
+ method: network
- - name: Upload Artifact Ubuntu x86-64
+ - name: Verify CUDA
+ shell: bash
+ run: |
+ echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+ echo "Cuda install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}"
+ nvcc -V
+
+ - name: Build Harvester
+ shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-windows-x86-64.zip"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+ env:
+ CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+
+ - name: Upload Harvester Artifact
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
if-no-files-found: error
- build-centos-x86-64:
- runs-on: ubuntu-20.04
+ build-harvester-linux-arm64:
+ runs-on: [ARM64, Linux]
container:
- image: quay.io/centos/centos:stream8
+ image: ghcr.io/chia-network/build-images/manylinux2014_cuda_aarch64:sha-1caf046d5ff19b7c743de2a106dd86928794032b
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Cache DNF packages
+ uses: actions/cache@v3
+ with:
+ path: /var/cache/dnf
+ key: ${{ runner.os }}-dnf-${{ hashFiles('**/your-build-file') }}
+ restore-keys: |
+ ${{ runner.os }}-dnf-
+
- name: Get Version Number
id: version_number
- run: .github/actions/get-version.sh centos x86-64
+ shell: bash
+ run: ./.github/actions/get-version.sh centos arm64
- name: Install Prerequisites
+ shell: bash
run: |
- dnf install -y gcc-toolset-9-gcc gcc-toolset-9-gcc-c++ \
- cmake gmp-devel numactl-devel make git
+ set -eo pipefail
+ export module_platform_id=platform:el9
+ export MODULE_PLATFORM_ID=platform:el9
+ export PLATFORM_ID=platform:el9
+ uname -a
+ cat /etc/os-release
+ yum install -y dnf
+ dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+ dnf install -y dnf-plugins-core
+ dnf makecache
+ dnf install -y kernel-headers.aarch64 kernel-devel.aarch64 tar bzip2 make automake gcc gcc-c++ pciutils elfutils-libelf-devel libglvnd-opengl libglvnd-glx libglvnd-devel acpid pkgconfig dkms
+ dnf install -y cmake
+ dnf group install -y "Development Tools"
+ dnf install -y gmp-devel numactl-devel make git wget sed
+
+ - name: Build Harvester
+ shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-linux-ARM64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
- - name: Build
+ - name: Upload Harvester Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
+ if-no-files-found: error
+
+ build-harvester-macos-arm64:
+ runs-on: [macos, arm64]
+ steps:
+ - name: Cleanup Environment
+ uses: Chia-Network/actions/clean-workspace@main
+
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ run: bash -e .github/actions/get-version.sh macos arm64
+
+ - name: Build Harvester
+ shell: bash
run: |
- source /opt/rh/gcc-toolset-9/enable
- .github/actions/build-asset-unix.sh
-
- - name: Upload Artifact CentOS x86-64
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-macos-arm64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+
+ - name: Upload Harvester Artifact
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
if-no-files-found: error
- build-ubuntu-arm64:
- runs-on: [ARM64, Linux]
- container:
- image: chianetwork/ubuntu-20.04-builder:latest
- defaults:
- run:
+ build-harvester-macos-x86-64:
+ runs-on: macOS-11
+ steps:
+ - name: Cleanup Environment
+ uses: Chia-Network/actions/clean-workspace@main
+
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ run: bash -e .github/actions/get-version.sh macos x86-64
+
+ - name: Build Harvester
shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-macos-x86-64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+
+ - name: Upload Harvester Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
+ if-no-files-found: error
+
+
+ build-bladebit-ubuntu-x86-64:
+ runs-on: ubuntu-20.04
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
id: version_number
- run: .github/actions/get-version.sh ubuntu arm64
+ run: .github/actions/get-version.sh ubuntu x86-64
- name: Install Prerequisites
run: |
- export DEBIAN_FRONTEND=noninteractive
- apt update
- apt install -y build-essential git libgmp-dev libnuma-dev
+ sudo apt install -y libgmp-dev libnuma-dev
+ sudo bash .github/actions/install-cmake-linux.sh
- - name: Build
+ - name: Build Bladebit
+ run: .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{env.BB_VERSION}}
+
+ - name: Setup CUDA
+ uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
+ with:
+ cuda: '12.1.0'
+ method: network
+ linux-local-args: '["--toolkit"]'
+
+ - name: Build Bladebit CUDA
+ run: .github/actions/build-asset-unix.sh --cuda --artifact ${{ env.BB_ARTIFACT_NAME_CUDA }} --version ${{env.BB_VERSION}}
+
+ - name: Upload Bladebit Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
+ if-no-files-found: error
+
+ - name: Upload Bladebit CUDA Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
+ if-no-files-found: error
+
+ build-bladebit-centos-x86-64:
+ runs-on: ubuntu-20.04
+ container:
+ image: quay.io/centos/centos:stream8
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- - name: Upload Artifact Ubuntu ARM64
+ - name: Get Version Number
+ id: version_number
+ run: .github/actions/get-version.sh centos x86-64
+
+ - name: Install Prerequisites
+ run: |
+ set -eo pipefail
+ dnf install -y gcc-toolset-9-gcc gcc-toolset-9-gcc-c++ \
+ gmp-devel numactl-devel make git wget subscription-manager
+ bash .github/actions/install-cmake-linux.sh
+
+ - name: Build Bladebit
+ run: |
+ source /opt/rh/gcc-toolset-9/enable
+ .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{ env.BB_VERSION }}
+
+ - name: Install CUDA Prerequisites
+ run: |
+ distro=rhel8
+ arch=x86_64
+ dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-$distro.repo
+ dnf clean expire-cache
+ dnf module install -y nvidia-driver:latest-dkms
+ dnf install -y cuda
+ ls -la /usr/local/
+ ls -la /usr/local/cuda/
+
+ - name: Build Bladebit CUDA
+ run: |
+ source /opt/rh/gcc-toolset-9/enable
+ .github/actions/build-asset-unix.sh --cuda --artifact ${{ env.BB_ARTIFACT_NAME_CUDA }} --version ${{ env.BB_VERSION }}
+
+ - name: Upload Bladebit Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
+ if-no-files-found: error
+
+ - name: Upload Bladebit CUDA Artifact
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
if-no-files-found: error
- build-centos-arm64:
+ build-bladebit-centos-arm64:
runs-on: [ARM64, Linux]
container:
image: quay.io/centos/centos:stream8
@@ -105,6 +344,11 @@ jobs:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
id: version_number
run: .github/actions/get-version.sh centos arm64
@@ -112,44 +356,100 @@ jobs:
- name: Install Prerequisites
run: |
dnf install -y gcc-toolset-9-gcc gcc-toolset-9-gcc-c++ \
- cmake gmp-devel numactl-devel make git
+ cmake gmp-devel numactl-devel make git
- name: Build
- env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
run: |
source /opt/rh/gcc-toolset-9/enable
- .github/actions/build-asset-unix.sh
+ .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{ env.BB_VERSION }}
- name: Upload Artifact CentOS ARM64
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
if-no-files-found: error
- build-windows-x86-64:
+ build-bladebit-cuda-linux-arm64:
+ runs-on: [ARM64, Linux]
+ container:
+ image: chianetwork/ubuntu-20.04-builder:latest
+ defaults:
+ run:
+ shell: bash
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ run: .github/actions/get-version.sh ubuntu arm64
+
+ - name: Install Prerequisites
+ run: |
+ export DEBIAN_FRONTEND=noninteractive
+ apt update
+ apt install -y build-essential git libgmp-dev libnuma-dev
+ - name: Setup CUDA
+ run: |
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin
+ mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-ubuntu2004-12-1-local_12.1.1-530.30.02-1_arm64.deb
+ dpkg -i cuda-repo-ubuntu2004-12-1-local_12.1.1-530.30.02-1_arm64.deb
+ cp /var/cuda-repo-ubuntu2004-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
+ apt-get update
+ DEBIAN_FRONTEND=noninteractive apt-get -y install cuda
+
+ - name: Build
+ run: .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{env.BB_VERSION}}
+
+ - name: Upload Artifact Ubuntu ARM64
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
+ if-no-files-found: error
+
+ - name: Build Bladebit CUDA
+ run: |
+ .github/actions/build-asset-unix.sh --cuda --artifact ${{ env.BB_ARTIFACT_NAME_CUDA }} --version ${{ env.BB_VERSION }}
+ - name: Upload Bladebit CUDA Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
+ if-no-files-found: error
+
+ build-bladebit-windows-x86-64:
runs-on: windows-2019
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
shell: bash
id: version_number
run: .github/actions/get-version.sh windows x86-64
- - name: Build
+ - name: Build Bladebit
shell: bash
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
+ BB_ARTIFACT_NAME: ${{ env.BB_ARTIFACT_NAME }}
+ BB_VERSION: ${{env.BB_VERSION}}
run: |
-
mkdir build && cd build
cmake ..
- bash -e -o pipefail ../embed-version.sh
+ bash -eo pipefail ../embed-version.sh
cat ../src/Version.h
cmake --build . --target bladebit --config Release
@@ -160,74 +460,86 @@ jobs:
>&2 echo "Incorrect bladebit version. Got but '$bb_version' expected '$BB_VERSION'."
exit 1
fi
-
+
mkdir ../bin
cd Release
ls -la
7z.exe a -tzip ../../bin/${BB_ARTIFACT_NAME} bladebit.exe
ls -la ../../bin
- - name: Upload Artifact Windows x86-64
+ - name: Upload Bladebit Artifact Windows x86-64
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
if-no-files-found: error
- build-macos-arm64:
- runs-on: [macOS, ARM64]
+ build-bladebit-cuda-windows-x86-64:
+ runs-on: windows-2019
steps:
- - name: Cleanup Environment
- uses: Chia-Network/actions/clean-workspace@main
-
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
+ shell: bash
id: version_number
- run: bash -e .github/actions/get-version.sh macos arm64
+ run: .github/actions/get-version.sh windows x86-64
- name: Install Prerequisites
- run: brew install cmake
-
- - name: Build
- env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ shell: powershell
+ run: |
+ choco install -y make
+ choco install -y wget
+ choco install -y sed
- - name: Upload Artifact macOS arm64
- uses: actions/upload-artifact@v3
+ - name: Setup CUDA
+ uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- if-no-files-found: error
+ cuda: '12.1.0'
+ method: network
- build-macos-x86-64:
- runs-on: macOS-11
- steps:
- - name: Cleanup Environment
- uses: Chia-Network/actions/clean-workspace@main
+ - name: Verify CUDA
+ shell: bash
+ run: |
+ echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+ echo "Cuda install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}"
+ nvcc -V
- - name: Checkout Repo
- uses: actions/checkout@v3
+ - name: Build Bladebit CUDA
+ shell: bash
+ env:
+ BB_ARTIFACT_NAME_CUDA: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ BB_VERSION: ${{env.BB_VERSION}}
+ run: |
+ mkdir build_cuda && cd build_cuda
+ cmake ..
+ bash -eo pipefail ../embed-version.sh
+ cat ../src/Version.h
+ cmake --build . --target bladebit_cuda --config Release
- - name: Get Version Number
- id: version_number
- run: .github/actions/get-version.sh macos x86-64
+ # Ensure bladebit version matches expected version
+ bb_version="$(./Release/bladebit_cuda.exe --version | xargs)"
- - name: Install Prerequisites
- run: brew install cmake
+ if [[ "$bb_version" != "$BB_VERSION" ]]; then
+ >&2 echo "Incorrect bladebit version. Got but '$bb_version' expected '$BB_VERSION'."
+ exit 1
+ fi
- - name: Build
- env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ mkdir ../bin
+ cd Release
+ ls -la
+ 7z.exe a -tzip ../../bin/${BB_ARTIFACT_NAME_CUDA} bladebit_cuda.exe
+ ls -la ../../bin
- - name: Upload Artifact macOS x86-64
+ - name: Upload Bladebit CUDA Artifact Windows x86-64
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
if-no-files-found: error
diff --git a/.idea/.name b/.idea/.name
new file mode 100644
index 00000000..1e51b03a
--- /dev/null
+++ b/.idea/.name
@@ -0,0 +1 @@
+bladebit
\ No newline at end of file
diff --git a/.idea/qmlSettings.xml b/.idea/qmlSettings.xml
new file mode 100644
index 00000000..b08a4e58
--- /dev/null
+++ b/.idea/qmlSettings.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index e88cc20c..54a1aefd 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -3,39 +3,12 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 246a5532..6957af27 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -33,10 +33,12 @@
"args": [
"-f", "ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef",
"-p", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8",
- "-i", "c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835",
- // "-n", "1",
+ // "-i", "c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835",
+ "-i", "5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863",
+ // "-n", "2",
"-w",
"-v",
+ "--compress", "4",
"ramplot",
"~/plot/tmp"
]
@@ -86,11 +88,13 @@
"--show-memo",
+ // "--compress", "6",
+
"diskplot",
"-t1", "~/plot/tmp",
"--f1-threads", "24",
- // "--fp-threads", "62",
+ "--fp-threads", "62",
"--c-threads", "28",
"--p2-threads", "24",
@@ -101,14 +105,14 @@
// "--cache", "64G",
// "-s",
// "--k32-bounded",
- "-b", "64",
+ // "-b", "64",
// "--sizes",
- // "-b", "128",
+ "-b", "128",
// "-b", "256",
- "--c-threads", "26",
- "--p2-threads", "24",
- "--p3-threads", "48",
+ // "--c-threads", "26",
+ // "--p2-threads", "24",
+ // "--p3-threads", "48",
"~/plot/tmp"
],
@@ -117,6 +121,78 @@
"environment": []
},
+ {
+ "name" : "Bladebit CUDA",
+
+ "type" : "cuda-gdb",
+ "request" : "launch",
+ "stopAtEntry" : false,
+ "cwd" : "${workspaceFolder}",
+ "preLaunchTask" : "build_cuda_debug",
+
+ "program": "${workspaceFolder}/build/bladebit_cuda",
+
+ // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
+ // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
+ // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
+
+ "args":
+ // "-w --compress 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
+ "-w --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot",
+
+ "windows": {
+ "type": "cppvsdbg",
+ "program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
+ "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ }
+ },
+
+ {
+ "name" : "CUDA Harvest Test",
+
+ "type" : "cuda-gdb",
+ "request" : "launch",
+ "stopAtEntry" : false,
+ "cwd" : "${workspaceFolder}",
+ "preLaunchTask" : "build_cuda_debug",
+ "program": "${workspaceFolder}/build/bladebit_cuda",
+
+ // "preLaunchTask" : "build_debug",
+ // "program": "${workspaceFolder}/build/bladebit",
+
+ // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
+ // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
+ // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
+
+ // "args": "-t 1 validate --cuda --f7 0 ~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "validate --cuda --f7 0 ~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot",
+ // "args": "validate --cuda --f7 0 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ // "args": "-t 1 simulate -n 5000 -p 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+ // "args": "-t 16 simulate -n 10 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "-t 16 simulate -n 10 ~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "validate --cuda --quality 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+ // "args": "validate --cuda --f7 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "args": "validate --quality 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+ // "args": "validate --cuda --quality 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "args": "validate --quality 4 ~/plot/tmp/plot-k32-2023-04-26-20-24-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "args": "validate --quality 98 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ // "args": "validate --cuda --quality 6 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ "args": "validate --cuda --f7 6 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "validate --quality 6 ~/plot/tmp/plot-k32-2023-04-26-20-20-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ "windows": {
+ "type": "cppvsdbg",
+ "program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
+ // "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ "args": "validate --cuda --f7 0 D:/chia_test_plots/plot-k32-c01-2023-05-10-18-56-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ }
+ },
+
{
"name" : "IOTest",
@@ -162,6 +238,9 @@
"name" : "Tests",
"type" : "cppdbg",
+ "osx": {
+ "MIMode": "lldb",
+ },
"request" : "launch",
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
@@ -171,7 +250,16 @@
"program": "${workspaceRoot}/build/tests",
"environment": [
- // { "name": "bbtest_thread_count", "value": "2" }
+ // { "name": "bb_thread_count", "value": "60" }
+ { "name": "bb_iterations" , "value": "1" },
+ { "name": "bb_thread_count", "value": "4" },
+ { "name": "bb_f7" , "value": "0" },
+ { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
+ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c07-2023-02-08-17-35-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" }
+ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c04-2023-02-08-01-33-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" }
+ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c06-2023-02-14-21-43-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
+ { "name": "bb_clevel" , "value": "1" },
+ { "name": "bb_end_clevel" , "value": "1" },
],
"args": [
@@ -183,10 +271,12 @@
// "PairsAndMap"
// "bucket-slice-write"
// "line-point-deltas"
+ // "compressed-plot-proof"
+ // "compressed-plot-qualities"
+ "macos-threads"
]
}
-
,{
"name" : "Plot Tool",
@@ -208,23 +298,48 @@
"args": [
/// Validate
- // "-t", "32",
+ // "-t", "48",
// "-t", "1",
- "validate",
+
+ // "-t", "1", "validate", "--f7", "324", "~/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "validate", "--f7", "7", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "validate", "--cuda", "--f7", "4", "~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "--verify", "0x7d7ceb24ca25bac5f4c59b4400b23585bff254efa5b78f3085192e399fc74fdaab630f2cd74ea733eb9b82a5bc5582e8fd075c0591b2eef12adae264159a8eeeae5808202d1a10cffd1a0fcb64b1f43cd3941987cf606ba01434d43715cbe1773f01fe74288110606b2cd90063f01f0eca3ba515a2fb2a011ea73d7da3148895e046b09c3d393cad44411fe57671290e4f34ed7d2aafe6788effde2c965b814158a1fe1109b67cf2f9849dfa55568d68e3e5fa24605269499f30b61cb889b6256256e467de963c25d7fb47e6a4119f2f8719ec9acbd82f7d95b8196660fe43165490255730ddf870a4e48da1ea2050fef4608d7321d6a3eede07744d8847858d",
+ // "0x00000037ff04b8ee9355068689bd558eafe07cc7af47ad1574b074fc34d6913a", "c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835",
+
+ // // "--f7", "2534554965", "~/plot/tmp/plot-k32-2022-10-18-22-25-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "-t", "16", "validate", "--f7", "11", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"
- "--f7", "2534554965",
- "~/plot/tmp/plot-k32-2022-10-18-22-25-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/tmp/plot-k32-c07-2023-02-08-17-35-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "~/plot/tmp/plot-k32-2023-02-08-17-39-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // // "~/plot/tmp/plot-k32-c04-2023-01-29-03-29-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/ramplot-k32-2023-01-30-09-04-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/disk/plot-k32-c04-2023-01-30-23-07-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/plot-k32-c04-2023-01-30-23-55-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/plot-k32-c04-2023-01-31-01-00-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-22-57-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-23-15-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+
+ // Simulation
+ "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
+ "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "-t", "30", "simulate", "-p", "2", "-n", "600", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"
// "-m",
- // "-u",
- // "~/plot/tmp/plot-k32-2022-10-17-15-05-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "-u", "~/plot/tmp/plot-k32-2022-10-26-23-58-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
// "/mnt/p5510a/disk_tmp/plot.dat"
+
+ // "--f7", "3983284117", "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "--f7", "3983284117", "/home/harito/plot/tmp/gpu_1.plot",
/// Compare
// "plotcmp",
- // "/mnt/p5510a/disk_tmp/plot-k32-2022-04-12-13-53-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
- // "/mnt/p5510a/disk_tmp/plot-k32-2022-04-12-13-03-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harito/plot/tmp/gpu_1.plot.old",
+ // "/home/harold/plot-tmpfs/gpu_1.plot",
+ // "/home/harito/plot/tmp/gpu_1.plot",
+ // "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
]
},
diff --git a/.vscode/settings.json b/.vscode/settings.json
index be694e05..c6c5274d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -13,6 +13,7 @@
"*.ac": "shellscript",
"player": "json",
"*.userprefs": "xml",
+ "*.make": "makefile",
"memory": "cpp",
"cstddef": "cpp",
"string": "cpp",
@@ -101,11 +102,30 @@
"locale": "cpp",
"stack": "cpp",
"*.include": "cpp",
- "relic_core.h": "c"
+ "relic_core.h": "c",
+ "compare": "cpp",
+ "concepts": "cpp",
+ "numbers": "cpp",
+ "semaphore": "cpp",
+ "stop_token": "cpp",
+ "queue": "cpp",
+ "__memory": "cpp",
+ "filesystem": "cpp",
+ "__bits": "cpp",
+ "csignal": "cpp",
+ "cfenv": "cpp"
},
"cSpell.words": [
"Ryzen"
],
"C_Cpp.errorSquiggles": "Enabled",
- "cmake.configureOnOpen": true
+ "cmake.configureOnOpen": true,
+ "cmake.configureOnEdit": false,
+ "cmake.preferredGenerators": [
+ "Unix Makefiles",
+ "Visual Studio 17 2022"
+ ]
+ // "cmake.generator": "Unix Makefiles"
+ // "cmake.generator": "Visual Studio 17 2022"
+
}
\ No newline at end of file
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
index be6ce096..e98520f2 100644
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -35,6 +35,56 @@
}
},
+ {
+ "type" : "shell",
+ "label" : "build_cuda_debug",
+ "detail" : "Build CUDA Bladebit",
+ "command": "cmake",
+
+ "args": [
+ "--build", ".",
+ "--target", "bladebit_cuda",
+ "--config", "Debug",
+ "-j", "24"
+ ],
+
+ "problemMatcher": [ "$nvcc" ],
+
+ "options": {
+ "cwd": "${workspaceFolder}/build"
+ },
+
+ "group": {
+ "kind": "build",
+ "isDefault": true
+ }
+ },
+
+ {
+ "type" : "shell",
+ "label" : "build_harvester",
+ "detail" : "Build Bladebit Harvester",
+ "command": "cmake",
+
+ "args": [
+ "--build", ".",
+ "--target", "lib_bladebit_harvester",
+ "--config", "Debug",
+ "-j", "24"
+ ],
+
+ "problemMatcher": [ "$nvcc" ],
+
+ "options": {
+ "cwd": "${workspaceFolder}/build"
+ },
+
+ "group": {
+ "kind": "build",
+ "isDefault": false
+ }
+ },
+
{
"type" : "shell",
"label" : "rebuild_debug",
diff --git a/Bladebit.cmake b/Bladebit.cmake
new file mode 100644
index 00000000..6ce0ad97
--- /dev/null
+++ b/Bladebit.cmake
@@ -0,0 +1,305 @@
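+# bladebit_core: common core library linked by the bladebit and bladebit_cuda executables.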
+add_library(bladebit_core)
+target_link_libraries(bladebit_core PUBLIC bladebit_config)
+
+target_include_directories(bladebit_core PUBLIC
+ ${INCLUDE_DIRECTORIES}
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_compile_definitions(bladebit_core PUBLIC
+ GR_NO_IMPORT=1
+ BB_NUMA_ENABLED=1
+)
+
+target_compile_options(bladebit_core PUBLIC ${preinclude_pch})
+
+target_link_libraries(bladebit_core PUBLIC
+ Threads::Threads
+ bls
+
+    $<$<PLATFORM_ID:Linux>:
+ ${NUMA_LIBRARY}
+ >
+)
+
+add_executable(bladebit
+ src/main.cpp
+ cuda/harvesting/CudaThresherDummy.cpp)
+
+target_link_libraries(bladebit PRIVATE bladebit_core)
+
+
+# Sources
+set(src_uint128
+ src/uint128_t/endianness.h
+ src/uint128_t/uint128_t.cpp
+ src/uint128_t/uint128_t.h
+)
+
+set(src_chacha8
+ src/pos/chacha8.cpp
+ src/pos/chacha8.h
+)
+
+set(src_fse
+ src/fse/bitstream.h
+ src/fse/compiler.h
+ src/fse/debug.c
+ src/fse/debug.h
+ src/fse/entropy_common.c
+ src/fse/error_private.h
+ src/fse/error_public.h
+ src/fse/fse_compress.c
+ src/fse/fse_decompress.c
+ src/fse/fse.h
+ src/fse/hist.c
+ src/fse/hist.h
+ src/fse/huf.h
+ src/fse/mem.h
+)
+
+set(src_blake3
+ src/b3/blake3.c
+ src/b3/blake3_dispatch.c
+ src/b3/blake3.h
+ src/b3/blake3_impl.h
+ src/b3/blake3_portable.c
+
+ $<${is_x86}:
+
+        $<$<CXX_COMPILER_ID:MSVC>:
+ src/b3/blake3_sse41.c
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ >
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:
+ src/b3/blake3_avx2_x86-64_unix.S
+ src/b3/blake3_avx512_x86-64_unix.S
+ src/b3/blake3_sse41_x86-64_unix.S
+ >
+ >
+)
+
+set(src_bech32
+ src/bech32/segwit_addr.c
+ src/bech32/segwit_addr.h
+)
+
+set(src_bladebit
+
+ # third party
+    $<$<CXX_COMPILER_ID:MSVC>:
+ ${src_uint128}
+ >
+
+ ${src_chacha8}
+ ${src_fse}
+ ${src_blake3}
+ ${src_bech32}
+
+ # bladebit
+    $<$<PLATFORM_ID:Linux>:
+ src/platform/linux
+ src/platform/linux/SysHost_Linux.cpp
+ >
+
+    $<$<PLATFORM_ID:Darwin>:
+ src/platform/macos/SysHost_Macos.cpp
+ >
+
+    $<$<PLATFORM_ID:Linux,Darwin>:
+ src/platform/unix/FileStream_Unix.cpp
+ src/platform/unix/Thread_Unix.cpp
+ >
+
+    $<$<PLATFORM_ID:Windows>:
+ src/platform/win32/FileStream_Win32.cpp
+ src/platform/win32/SysHost_Win32.cpp
+ src/platform/win32/Thread_Win32.cpp
+ >
+
+ src/BLS.h
+ src/Config.h
+ src/ChiaConsts.h
+ src/Globals.h
+ src/Types.h
+ src/Platform.h
+ src/PlotContext.h
+ src/PlotContext.cpp
+ src/PlotWriter.h
+ src/PlotWriter.cpp
+ src/SysHost.cpp
+ src/SysHost.h
+ src/View.h
+ src/pch.cpp
+ src/pch.h
+ src/Version.h
+
+ src/algorithm/YSort.cpp
+ src/algorithm/YSort.h
+ src/algorithm/RadixSort.h
+
+ src/io/BucketStream.cpp
+ src/io/BucketStream.h
+ src/io/FileStream.cpp
+ src/io/FileStream.h
+ src/io/HybridStream.cpp
+ src/io/HybridStream.h
+ src/io/IOUtil.cpp
+ src/io/IOUtil.h
+ src/io/IStream.h
+ src/io/MemoryStream.h
+
+ src/plotdisk/BlockWriter.h
+ src/plotdisk/DiskFp.h
+ src/plotdisk/DiskPairReader.h
+ src/plotdisk/DiskPlotDebug.cpp
+ src/plotdisk/DiskPlotDebug.h
+ src/plotdisk/DiskPlotInfo.h
+ src/plotdisk/DiskPlotPhase2.h
+ src/plotdisk/DiskPlotPhase3.cpp.disabled
+ src/plotdisk/DiskPlotPhase3.h
+ src/plotdisk/FileId.h
+ src/plotdisk/FpFxGen.h
+ src/plotdisk/FpGroupMatcher.h
+ src/plotdisk/MapWriter.h
+
+ src/plotdisk/jobs/JobShared.h
+ src/plotdisk/jobs/LookupMapJob.h
+ src/plotdisk/jobs/UnpackMapJob.cpp
+ src/plotdisk/jobs/UnpackMapJob.h
+ src/plotdisk/jobs/IOJob.cpp
+ src/plotdisk/jobs/IOJob.h
+
+ src/plotdisk/k32/DiskPlotBounded.h
+ src/plotdisk/k32/FpMatchBounded.inl
+ src/plotdisk/k32/CTableWriterBounded.h
+ src/plotdisk/k32/DiskPlotBounded.cpp
+ src/plotdisk/k32/F1Bounded.inl
+ src/plotdisk/k32/FxBounded.inl
+
+ src/plotdisk/DiskF1.h
+ src/plotdisk/DiskPlotConfig.h
+ src/plotdisk/DiskPlotContext.h
+ src/plotdisk/DiskPlotPhase2.cpp
+ src/plotdisk/DiskPlotPhase3.cpp
+ src/plotdisk/DiskPlotter.h
+ src/plotdisk/DiskPlotter.cpp
+ src/plotdisk/DiskBufferQueue.cpp
+ src/plotdisk/DiskBufferQueue.h
+ src/plotdisk/BitBucketWriter.h
+
+
+ src/plotmem/DbgHelper.cpp
+ src/plotmem/FxSort.h
+ src/plotmem/MemPhase1.h
+ src/plotmem/MemPhase2.h
+ src/plotmem/MemPhase3.h
+ src/plotmem/MemPlotter.h
+ src/plotmem/ParkWriter.h
+ src/plotmem/DbgHelper.h
+ src/plotmem/LPGen.h
+ src/plotmem/MemPhase1.cpp
+ src/plotmem/MemPhase2.cpp
+ src/plotmem/MemPhase3.cpp
+ src/plotmem/MemPhase4.cpp
+ src/plotmem/MemPhase4.h
+ src/plotmem/MemPlotter.cpp
+
+
+ src/plotting/DTables.h
+ src/plotting/GenSortKey.h
+ src/plotting/PlotValidation.cpp
+ src/plotting/TableWriter.cpp
+ src/plotting/PlotTypes.h
+ src/plotting/TableWriter.h
+ src/plotting/WorkHeap.cpp
+ src/plotting/CTables.h
+ src/plotting/Compression.cpp
+ src/plotting/Compression.h
+ src/plotting/FSETableGenerator.cpp
+ src/plotting/GlobalPlotConfig.h
+ src/plotting/IPlotter.h
+ src/plotting/PlotHeader.h
+ src/plotting/PlotTools.cpp
+ src/plotting/PlotTools.h
+ src/plotting/PlotValidation.h
+ src/plotting/PlotWriter.cpp
+ src/plotting/PlotWriter.h
+ src/plotting/Tables.h
+
+ src/plotting/f1/F1Gen.h
+ src/plotting/f1/F1Gen.cpp
+
+ src/plotting/fx/PlotFx.inl
+
+ src/plotting/matching/GroupScan.cpp
+ src/plotting/matching/GroupScan.h
+ src/plotting/WorkHeap.h
+
+ src/threading/AutoResetSignal.h
+ src/threading/Semaphore.cpp
+ src/threading/Semaphore.h
+ src/threading/Fence.cpp
+ src/threading/Fence.h
+ src/threading/GenJob.h
+ src/threading/MTJob.h
+ src/threading/MonoJob.h
+ src/threading/Thread.h
+ src/threading/ThreadPool.cpp
+ src/threading/ThreadPool.h
+ src/threading/AutoResetSignal.cpp
+
+ # src/tools/FSETableGenerator.cpp
+ src/tools/MemTester.cpp
+ src/tools/IOTester.cpp
+ src/tools/PlotComparer.cpp
+ src/tools/PlotFile.cpp
+ src/tools/PlotReader.cpp
+ src/tools/PlotReader.h
+ src/tools/PlotValidator.cpp
+
+ src/util/Array.h
+ src/util/Array.inl
+ src/util/BitField.h
+ src/util/SPCQueue.h
+ src/util/SPCQueue.inl
+
+ src/util/jobs/MemJobs.h
+ src/util/jobs/SortKeyJob.h
+ src/util/BitView.h
+ src/util/CliParser.cpp
+ src/util/KeyTools.cpp
+ src/util/KeyTools.h
+ src/util/Log.h
+ src/util/CliParser.h
+ src/util/Log.cpp
+ src/util/Span.h
+ src/util/StackAllocator.h
+ src/util/Util.cpp
+ src/util/Util.h
+ src/util/VirtualAllocator.h
+
+ src/commands/Commands.h
+ src/commands/CmdPlotCheck.cpp
+ src/commands/CmdSimulator.cpp
+ src/commands/CmdCheckCUDA.cpp
+
+ src/harvesting/GreenReaper.cpp
+ src/harvesting/GreenReaper.h
+ src/harvesting/GreenReaperInternal.h
+ src/harvesting/Thresher.h
+)
+
+target_sources(bladebit_core PUBLIC ${src_bladebit})
+
+ # Disable blake3 conversion loss of data warnings
+ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
+ set_source_files_properties(
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ src/b3/blake3_sse41.c
+ PROPERTIES COMPILE_FLAGS
+ /wd4244
+ )
+ endif()
diff --git a/BladebitCUDA.cmake b/BladebitCUDA.cmake
new file mode 100644
index 00000000..1fc668fa
--- /dev/null
+++ b/BladebitCUDA.cmake
@@ -0,0 +1,59 @@
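+# bladebit_cuda: the CUDA plotter executable. It links against bladebit_core and adds the CUDA
+# kernels plus the CUDA harvester thresher sources.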
+add_executable(bladebit_cuda
+ src/main.cpp
+
+ cuda/CudaPlotter.cu
+ cuda/CudaPlotter.h
+ cuda/CudaPlotContext.h
+ cuda/CudaPlotPhase2.cu
+ cuda/CudaPlotPhase3.cu
+ cuda/CudaPlotPhase3Step2.cu
+ cuda/CudaPlotPhase3Step3.cu
+ cuda/CudaPlotPhase3Internal.h
+ cuda/CudaParkSerializer.h
+ cuda/CudaParkSerializer.cu
+ cuda/chacha8.cu
+ cuda/CudaF1.h
+ cuda/CudaF1.cu
+ cuda/CudaMatch.h
+ cuda/CudaMatch.cu
+ cuda/CudaFx.h
+ cuda/FxCuda.cu
+ cuda/CudaUtil.h
+ cuda/CudaPlotUtil.cu
+ cuda/GpuStreams.h
+ cuda/GpuStreams.cu
+
+ # Harvester
+ cuda/harvesting/CudaThresher.cu
+ cuda/harvesting/CudaThresherFactory.cu
+)
+
+target_include_directories(bladebit_cuda PRIVATE src cuda SYSTEM cuda)
+
+target_compile_definitions(bladebit_cuda PUBLIC
+ BB_CUDA_ENABLED=1
+ THRUST_IGNORE_CUB_VERSION_CHECK=1
+)
+
+target_compile_options(bladebit_cuda PRIVATE
+ ${cuda_archs}
+
+ $<${is_cuda_release}:
+ >
+
+ $<${is_cuda_debug}:
+ -G
+ >
+ )
+
+target_link_options(bladebit_cuda PRIVATE $)
+
+target_link_libraries(bladebit_cuda PRIVATE bladebit_core CUDA::cudart_static)# CUDA::cuda_driver)
+
+set_target_properties(bladebit_cuda PROPERTIES
+    MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>
+ CUDA_RUNTIME_LIBRARY Static
+ CUDA_SEPARABLE_COMPILATION ON
+ CUDA_RESOLVE_DEVICE_SYMBOLS ON
+ CUDA_ARCHITECTURES OFF
+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8022376..56595d7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -12,10 +12,39 @@ if(NOT CMAKE_BUILD_TYPE)
)
endif()
+# Allows for CMAKE_MSVC_RUNTIME_LIBRARY
+if(POLICY CMP0091)
+ cmake_policy(SET CMP0091 NEW)
+endif()
+
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>" CACHE STRING "MSVC Runtime Library")
-project(bladebit C CXX ASM)
+project(bladebit LANGUAGES C CXX ASM)
+
+# Ensure supported OS and Architecture
+if(NOT( (${CMAKE_SYSTEM_NAME} MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows") ))
+ message( FATAL_ERROR "Unsupported operating system '${CMAKE_SYSTEM_NAME}'" )
+endif()
+
+if(NOT (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "arm64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "AMD64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64"))
+ message( FATAL_ERROR "Unsupported architecture '${CMAKE_HOST_SYSTEM_PROCESSOR}'" )
+endif()
+
+if(NOT CMAKE_CUDA_COMPILER)
+ include(FindCUDAToolkit)
+
+ if(CUDAToolkit_FOUND)
+ message("Found CUDA: true")
+ message("NVCC : ${CUDAToolkit_NVCC_EXECUTABLE}")
+ set(CMAKE_CUDA_COMPILER ${CUDAToolkit_NVCC_EXECUTABLE})
+ endif()
+endif()
+
+if(CMAKE_CUDA_COMPILER)
+ enable_language(CUDA)
+endif()
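+# If no CUDA toolkit is detected, the CUDA language is simply not enabled and configuration continues.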
+
message("Config : ${CMAKE_BUILD_TYPE}")
message("Compiler : ${CMAKE_CXX_COMPILER_ID}")
@@ -30,53 +59,17 @@ set(CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules
)
-
-#
-# Grab Dependencies
-#
-set(platform_libs)
-
-# BLS
-include(FetchContent)
-
-FetchContent_Declare(
- bls
- GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
- GIT_TAG 1.0.10
-)
-
-set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
-set(BUILD_BLS_TESTS "0" CACHE STRING "")
-set(BUILD_BLS_BENCHMARKS "0" CACHE STRING "")
-FetchContent_MakeAvailable(bls)
-
-# Threads
-find_package(Threads REQUIRED)
-
-# NUMA
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
- find_package(NUMA REQUIRED)
- set(platform_libs ${NUMA_LIBRARY})
+# Is this project included as a dependency/FetchContent?
+if(NOT(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR))
+ set(BB_IS_DEPENDENCY ON)
+ set(BB_ENABLE_TESTS OFF)
+ set(BB_ENABLE_EXE OFF)
endif()
-# Catch
-# TODO: Add configuration var to disable this
-include(cmake_modules/FindCatch2.cmake)
-set_target_properties(Catch2 PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-set_target_properties(Catch2WithMain PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-
-
-# Config
-set(c_opts)
-set(link_opts)
-
-set(release_c_opts)
-set(debug_c_opts)
-set(dev_c_opts)
-
-set(release_link_opts)
-set(debug_link_opts)
+#
+# Options
+#
option(BENCHMARK_MODE "Enable benchmark mode for memplot. No final plot is written." OFF)
if(BENCHMARK_MODE)
add_compile_definitions("BB_BENCHMARK_MODE=1")
@@ -87,342 +80,85 @@ if(ENABLE_DISK_METRICS)
add_compile_definitions("BB_IO_METRICS_ON=1")
endif()
-# Embed version inline when in dev mode
-if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
- message("Embedding local build version")
-
- set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
-
- set(cmd_ver bash)
- if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
- set(cmd_ver bash.exe)
- endif()
-
- execute_process(COMMAND ${cmd_ver} extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-
- # Remove trailing whitespace incurred in windows gitbash
- string(STRIP "${bb_ver_maj}" bb_ver_maj)
- string(STRIP "${bb_ver_min}" bb_ver_min)
- string(STRIP "${bb_ver_rev}" bb_ver_rev)
- string(STRIP "${bb_ver_suffix}" bb_ver_suffix)
- string(STRIP "${bb_ver_commit}" bb_ver_commit)
-
- set(bb_ver_suffix ${bb_ver_suffix}-dev)
-
- # This is slow on windows, so let's cache them
- set(bb_ver_maj ${bb_ver_maj} CACHE STRING "")
- set(bb_ver_min ${bb_ver_min} CACHE STRING "")
- set(bb_ver_rev ${bb_ver_rev} CACHE STRING "")
- set(bb_ver_suffix ${bb_ver_suffix} CACHE STRING "")
- set(bb_ver_commit ${bb_ver_commit} CACHE STRING "")
-endif()
-
-if(NOT DEFINED ENV{CI})
- add_compile_definitions(BLADEBIT_VERSION_MAJ=${bb_ver_maj})
- add_compile_definitions(BLADEBIT_VERSION_MIN=${bb_ver_min})
- add_compile_definitions(BLADEBIT_VERSION_REV=${bb_ver_rev})
- add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
- add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
-endif()
-
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-
- # MSVC
- set(c_opts
- /std:c++17
- /Zc:__cplusplus
- /MP
- /Zi
- # /EHsc-
- # /Wall
- /W3
- /WX
- /FIpch.h
- /wd4068
- /wd4464
- /wd4668
- /wd4820
- /wd4514
- /wd4626
- /wd5027
- /DUNICODE=1
- /DWIN32_LEAN_AND_MEAN=1
- /DNOMINMAX=1
- /D_CRT_SECURE_NO_WARNINGS=1
- /D_HAS_EXCEPTIONS=0
- ${c_opts})
-
- set(tests_c_opts /DBB_TEST_MODE=1 ${tests_c_opts})
-
- set(link_opts
- /SUBSYSTEM:CONSOLE
- /STACK:33554432,1048576
- ${link_opts})
-
- set(release_c_opts
- /Oi
- /O2
- /Gy
- /GL
- /DNDEBUG=1
- /D_NDEBUG=1
- ${release_c_opts})
-
- set(debug_c_opts
- /Od
- /DDEBUG=1
- /D_DEBUG=1
- ${debug_c_opts})
-
- set(dev_c_opts
- ${dev_c_opts})
-
-
- set(release_link_opts
- /DEBUG:FULL
- /LTCG
- /OPT:REF,ICF,LBR
- ${release_link_opts})
-
- set(debug_link_opts
-# /DEBUG:FASTLINK
- /OPT:NOREF,NOICF,NOLBR
-# /INCREMENTAL
- ${debug_link_opts})
-
- # Dependency config
- target_compile_options(bls PRIVATE /MP)
- target_compile_options(relic_s PRIVATE /MP)
- target_compile_options(sodium PRIVATE /MP)
-
-    target_compile_options(bls PRIVATE $<$<CONFIG:Release>:/MT>)
-    target_compile_options(relic_s PRIVATE $<$<CONFIG:Release>:/MT>)
-    target_compile_options(sodium PRIVATE $<$<CONFIG:Release>:/MT>)
-
-    target_compile_options(bls PRIVATE $<$<CONFIG:Debug>:/MTd>)
-    target_compile_options(relic_s PRIVATE $<$<CONFIG:Debug>:/MTd>)
-    target_compile_options(sodium PRIVATE $<$<CONFIG:Debug>:/MTd>)
-
-else()
-
- # *Nix
- set(c_opts --include=pch.h -Wall -Wno-comment -Wno-unknown-pragmas -g ${c_opts})
-
- set(tests_c_opts -DBB_TEST_MODE=1 ${tests_c_opts})
+# NOTE: These are mostly a sandbox test environment, not proper tests
+option(BB_ENABLE_TESTS "Enable tests." OFF)
+option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
+option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
+option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
+option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)
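+# For example (a typical invocation, not prescribed by this file), a harvester-only
+# build could be configured with `cmake -B build -DBB_HARVESTER_ONLY=ON`, optionally
+# adding `-DNO_CUDA_HARVESTER=ON` to force a CPU-only harvester on a CUDA-capable machine.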
- set(release_c_opts
- -O3 #-flto
- -D_NDEBUG=1
- -DNDEBUG=1
- ${release_c_opts})
-
- set(debug_c_opts
- -O0
- -DDEBUG=1
- -D_DEBUG=1
- ${debug_c_opts})
-
- set(dev_c_opts
- ${dev_c_opts})
-
- set(link_opts -g -rdynamic #-flto
- ${link_opts})
-
- # GCC
- if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-
- set(c_opts -fmax-errors=5 ${c_opts})
-
- # Avoid ranlib error: "plugin needed to handle lto object" (would need CMAKE_AR "gcc-ar")
- # set(c_opts -ffat-lto-objects ${c_opts})
-
- # Build with native architecture when not building release packages
- if(NOT DEFINED ENV{CI})
- set(c_opts -march=native ${c_opts})
- endif()
-
- # Clang
- elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-
- set(c_opts -ferror-limit=5 -fdeclspec -Wno-empty-body ${c_opts})
-
- endif()
-
-endif()
#
-# Sources
+# Dependencies
#
-file(GLOB_RECURSE bb_sources
- RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS
- LIST_DIRECTORIES false
- src/*.cpp
- src/*.c
-)
-set(src_full ${bb_sources})
-
-# Headers
-file(GLOB_RECURSE bb_headers
- RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS
- LIST_DIRECTORIES false
- src/*.h
- src/*.hpp
- src/*.inl
-)
-
-# Ignore some sources
-list(FILTER bb_sources EXCLUDE REGEX "src/main\\.cpp")
-list(FILTER bb_sources EXCLUDE REGEX "src/tools/FSETableGenerator.cpp")
-list(FILTER bb_sources EXCLUDE REGEX "src/sandbox/.+")
-list(FILTER bb_sources EXCLUDE REGEX "src/platform/.+")
-list(FILTER bb_sources EXCLUDE REGEX "src/b3/blake3_(avx|sse).+")
-list(FILTER bb_sources EXCLUDE REGEX "src/uint128_t/.+")
-
+include(FetchContent)
-# Project-specific sources
-file(GLOB_RECURSE src_tests RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- tests/*.cpp
-)
+# Threads
+find_package(Threads REQUIRED)
-file(GLOB_RECURSE src_dev RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/sandbox/*.cpp
- src/sandbox/*.h
-)
+if(NOT ${BB_HARVESTER_ONLY})
+ # BLS
+ FetchContent_Declare(
+ bls
+ GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
+ GIT_TAG 2.0.2
+ EXCLUDE_FROM_ALL ${BB_IS_DEPENDENCY}
+ )
+ set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
+ set(BUILD_BLS_TESTS "0" CACHE STRING "")
+ set(BUILD_BLS_BENCHMARKS "0" CACHE STRING "")
+ FetchContent_MakeAvailable(bls)
-# Configure dependent on config/platform/architecture
-# Architecture
-if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "AMD64")
-
- if(NOT MSVC)
- list(APPEND bb_sources
- src/b3/blake3_avx2_x86-64_unix.S
- src/b3/blake3_avx512_x86-64_unix.S
- src/b3/blake3_sse41_x86-64_unix.S
- )
- else()
- list(APPEND bb_sources
- src/b3/blake3_avx2.c
- src/b3/blake3_avx512.c
- src/b3/blake3_sse41.c
- src/uint128_t/uint128_t.cpp
- )
- # Disable blake3 conversion loss of data warnings
- set_source_files_properties(
- src/b3/blake3_avx2.c
- src/b3/blake3_avx512.c
- src/b3/blake3_sse41.c
- PROPERTIES COMPILE_FLAGS
- /wd4244
- )
+ # NUMA
+ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+ find_package(NUMA REQUIRED)
+ set(platform_libs ${NUMA_LIBRARY})
endif()
+endif() # BB_HARVESTER_ONLY
-elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "arm64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
+#
+# Internal Config
+#
+set(is_release $<CONFIG:Release>)
+set(is_debug $<CONFIG:Debug>)
+set(is_c_cpp $<COMPILE_LANGUAGE:CXX,C>)
+set(is_cuda $<COMPILE_LANGUAGE:CUDA>)
+set(is_cuda_release $<AND:${is_cuda},${is_release}>)
+set(is_cuda_debug $<AND:${is_cuda},${is_debug}>)
+set(is_x86 $<OR:$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},x86_64>,$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},AMD64>>)
+set(is_arm $<OR:$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},arm64>,$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},aarch64>>)
+set(is_msvc_c_cpp $<AND:${is_c_cpp},$<CXX_COMPILER_ID:MSVC>>)
+
+if(CUDAToolkit_FOUND AND NOT ${NO_CUDA_HARVESTER})
+ set(have_cuda $<BOOL:TRUE>)
else()
- message( FATAL_ERROR "Unsupported architecture '${CMAKE_HOST_SYSTEM_PROCESSOR}'" )
+ set(have_cuda $<BOOL:FALSE>)
endif()
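+# These generator expressions are consumed by the included target scripts below:
+# e.g. Harvester.cmake wraps its CUDA sources and the BB_CUDA_ENABLED define in
+# $<${have_cuda}:...>, and Config.cmake selects the -gencode list under
+# $<${is_cuda_release}:...>.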
-# OS
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
- file(GLOB_RECURSE src_linux RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/platform/unix/*.cpp
- src/platform/linux/*.cpp
- )
- list(APPEND bb_sources ${src_linux})
-
-elseif(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-
- file(GLOB_RECURSE src_win RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/platform/win32/*.cpp
- )
- list(APPEND bb_sources ${src_win})
+#
+# Targets
+#
+include(Config.cmake)
-elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+if(NOT ${BB_HARVESTER_ONLY})
+ if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
+ include(cmake_modules/EmbedVersion.cmake)
+ endif()
- file(GLOB_RECURSE src_mac RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/platform/unix/*.cpp
- src/platform/macos/*.cpp
- )
- list(APPEND bb_sources ${src_mac})
+ include(Bladebit.cmake)
+ set_target_properties(bladebit_core bladebit PROPERTIES EXCLUDE_FROM_ALL $<BOOL:${BB_IS_DEPENDENCY}>)
-else()
- message( FATAL_ERROR "Unsupported operating system '${CMAKE_SYSTEM_NAME}'" )
+ if(CUDAToolkit_FOUND)
+ include(BladebitCUDA.cmake)
+ set_target_properties(bladebit_cuda PROPERTIES EXCLUDE_FROM_ALL $<BOOL:${BB_IS_DEPENDENCY}>)
+ endif()
endif()
+include(Harvester.cmake)
-#
-# Targets
-#
-set(bb_include_dirs
- ${INCLUDE_DIRECTORIES}
- ${CMAKE_CURRENT_SOURCE_DIR}/src
-)
-
-# macro(config_proj tgt)
-# message("Configuring target ${tgt}:${CMAKE_BUILD_TYPE}.")
-# target_compile_options(${tgt} PRIVATE $<$<CONFIG:Release>:${c_opts} ${release_c_opts}>)
-# target_compile_options(${tgt} PRIVATE $<$<CONFIG:Debug>:${c_opts} ${debug_c_opts}>)
-# target_link_options(${tgt} PRIVATE $<$<CONFIG:Release>:${link_opts} ${release_link_opts}>)
-# target_link_options(${tgt} PRIVATE $<$<CONFIG:Debug>:${link_opts} ${debug_link_opts}>)
-
-# target_include_directories(${tgt} PRIVATE ${bb_include_dirs})
-# endmacro()
-
-# BladeBit
-add_library(lib_bladebit ${bb_sources} ${bb_headers} src/plotdisk/k32/FpMatchBounded.inl src/plotdisk/k32/F1Bounded.inl)
-
-set_target_properties(lib_bladebit PROPERTIES
- OUTPUT_NAME bladebit
- MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
-)
-target_link_libraries(lib_bladebit PUBLIC Threads::Threads bls ${platform_libs})
-target_include_directories(lib_bladebit PUBLIC ${bb_include_dirs})
-
-target_compile_options(lib_bladebit PUBLIC $<$<CONFIG:Release>:${c_opts} ${release_c_opts}>)
-target_compile_options(lib_bladebit PUBLIC $<$<CONFIG:Debug>:${c_opts} ${debug_c_opts}>)
-target_link_options(lib_bladebit PUBLIC $<$<CONFIG:Release>:${link_opts} ${release_link_opts}>)
-target_link_options(lib_bladebit PUBLIC $<$<CONFIG:Debug>:${link_opts} ${debug_link_opts}>)
-
-add_executable(bladebit ${bb_headers} src/main.cpp src/plotdisk/k32/FxBounded.inl)
-target_link_libraries(bladebit PRIVATE lib_bladebit)
-
-add_executable(bladebit_dev EXCLUDE_FROM_ALL src/sandbox/sandbox_main.cpp ${src_dev} ${bb_headers})
-target_link_libraries(bladebit_dev PRIVATE lib_bladebit)
-
-# Tools
-add_executable(fsegen src/tools/FSETableGenerator.cpp ${bb_sources} ${bb_headers})
-target_link_libraries(fsegen PRIVATE lib_bladebit)
-
-# add_executable(plot_tool
-# src/tools/PlotTools_Main.cpp
-# src/tools/PlotReader.cpp
-# src/tools/PlotValidator.cpp
-# src/tools/PlotComparer.cpp
-# ${bb_headers}
-# )
-# target_link_libraries(plot_tool PRIVATE lib_bladebit)
-
-# Tests
-add_executable(tests ${src_tests} ${bb_headers})
-target_compile_options(tests PUBLIC $<$<CONFIG:Release>:${c_opts} ${release_c_opts} ${tests_c_opts}>)
-target_compile_options(tests PUBLIC $<$<CONFIG:Debug>:${c_opts} ${debug_c_opts} ${tests_c_opts}>)
-set_target_properties(tests PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-target_link_libraries(tests PRIVATE lib_bladebit Catch2::Catch2WithMain)
-
-# Pretty source view for IDE projects
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/src
- FILES ${src_full} ${bb_headers}
-)
+if(${BB_ENABLE_TESTS} AND NOT ${BB_HARVESTER_ONLY})
+ include(Tests.cmake)
+endif()
diff --git a/CMakeSettings.json b/CMakeSettings.json
index fb51b4bc..5ef52577 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -9,6 +9,17 @@
"installRoot": "${projectDir}\\out\\install\\${name}",
"ctestCommandArgs": ""
},
+
+ {
+ "name": "x64-Release",
+ "generator": "Ninja",
+ "configurationType": "Debug",
+ "inheritEnvironments": [ "msvc_x64_x64" ],
+ "buildRoot": "${projectDir}\\out\\build-release\\${name}",
+ "installRoot": "${projectDir}\\out\\cmake-install-release\\${name}",
+ "ctestCommandArgs": ""
+ },
+
{
"name": "MemTest",
"generator": "Ninja",
diff --git a/Config.cmake b/Config.cmake
new file mode 100644
index 00000000..4139b4a9
--- /dev/null
+++ b/Config.cmake
@@ -0,0 +1,175 @@
+# Base interface configuration project
+add_library(bladebit_config INTERFACE)
+
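+# Targets opt into this configuration by linking the interface library,
+# e.g. `target_link_libraries(bladebit_harvester PRIVATE bladebit_config)`
+# as done in Harvester.cmake.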
+target_compile_definitions(bladebit_config INTERFACE
+ $<${is_release}:
+ _NDEBUG=1
+ NDEBUG=1
+ >
+ $<${is_debug}:
+ _DEBUG=1
+ DEBUG=1
+ >
+
+ $<$<CXX_COMPILER_ID:MSVC>:
+ UNICODE=1
+ WIN32_LEAN_AND_MEAN=1
+ NOMINMAX=1
+ _CRT_SECURE_NO_WARNINGS=1
+ _HAS_EXCEPTIONS=0
+ >
+)
+
+target_compile_options(bladebit_config INTERFACE
+
+ # GCC or Clang
+ $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
+ -Wall
+ -Wno-comment
+ -Wno-unknown-pragmas
+ -g
+
+ $<${is_release}:
+ -O3
+ >
+
+ $<${is_debug}:
+ -O0
+ >
+ >
+
+ # GCC
+ $<$<CXX_COMPILER_ID:GNU>:
+ -fmax-errors=5
+ >
+
+ # Clang
+ $<$<CXX_COMPILER_ID:Clang,AppleClang>:
+ -ferror-limit=5
+ -fdeclspec
+ -Wno-empty-body
+ >
+
+ # MSVC
+ $<${is_msvc_c_cpp}:
+ /Zc:__cplusplus
+ /MP
+ /Zi
+ # /EHsc-
+ # /Wall
+ /W3
+ /WX
+ /wd4068
+ /wd4464
+ /wd4668
+ /wd4820
+ /wd4514
+ /wd4626
+ /wd5027
+
+ $<${is_release}:
+ /Oi /O2 /Gy /GL
+ >
+
+ $<${is_debug}:
+ /Od
+ >
+ >
+
+ $<${is_x86}:
+ >
+
+ $<${is_arm}:
+ >
+)
+
+target_link_options(bladebit_config INTERFACE
+
+ # GCC or Clang
+ $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
+ -g
+ -rdynamic
+ >
+
+ # MSVC
+ $<${is_msvc_c_cpp}:
+
+ /SUBSYSTEM:CONSOLE
+ /STACK:33554432,1048576
+
+ $<${is_release}:
+ /DEBUG:FULL
+ /LTCG
+ /OPT:REF,ICF,LBR
+ >
+
+ $<${is_debug}:
+ # /DEBUG:FASTLINK
+ # /OPT:NOREF,NOICF,NOLBR
+ # /INCREMENTAL
+ >
+ >
+)
+
+set_property(DIRECTORY . PROPERTY MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>)
+set_property(DIRECTORY . PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+set_property(DIRECTORY . PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+set(preinclude_pch
+ $<${is_cuda}:--pre-include pch.h>
+ $<${is_c_cpp}:
+ $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:--include=pch.h>
+ >
+ $<${is_msvc_c_cpp}:/FIpch.h>
+)
+
+# See: https://gitlab.kitware.com/cmake/cmake/-/issues/18265
+cmake_policy(SET CMP0105 NEW)
+
+set(cuda_archs
+
+ $<${is_cuda_release}:
+## Maxwell
+ ## Tesla/Quadro M series
+ -gencode=arch=compute_50,code=sm_50
+ ## Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
+ -gencode=arch=compute_52,code=sm_52
+ ## Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
+ -gencode=arch=compute_53,code=sm_53
+## Pascal
+ ## GeForce 1000 series
+ -gencode=arch=compute_60,code=sm_60
+ ## GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
+ -gencode=arch=compute_61,code=sm_61
+ ## Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
+ -gencode=arch=compute_62,code=sm_62
+## Volta
+ ## GV100, Tesla V100, Titan V
+ -gencode=arch=compute_70,code=sm_70
+ ## Tesla V100
+ -gencode=arch=compute_72,code=sm_72
+ ## Turing
+ -gencode=arch=compute_75,code=sm_75
+## Ampere
+ ## NVIDIA A100, DGX-A100
+ -gencode=arch=compute_80,code=sm_80
+ ## GeForce RTX 3000 series, NVIDIA A100
+ -gencode=arch=compute_86,code=sm_86
+ ## Jetson Orin
+ -gencode=arch=compute_87,code=sm_87
+## Lovelace
+ ## NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
+ -gencode=arch=compute_89,code=sm_89
+ ## Future proofing
+ -gencode=arch=compute_89,code=compute_89
+## Hopper
+ ## NVIDIA H100 (GH100)
+ # -gencode=arch=compute_90,code=sm_90
+ # -gencode=arch=compute_90a,code=sm_90a
+ >
+
+ $<${is_cuda_debug}:
+ -arch=native
+ # -gencode=arch=compute_52,code=sm_52 # Maxwell
+ >
+)
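+
+# NOTE: ${cuda_archs} is passed to the CUDA targets as compile options (see
+# target_compile_options in Harvester.cmake); debug builds simply compile for
+# the native architecture to keep build times down.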
diff --git a/Harvester.cmake b/Harvester.cmake
new file mode 100644
index 00000000..d853a2db
--- /dev/null
+++ b/Harvester.cmake
@@ -0,0 +1,176 @@
+if(NOT ${BB_HARVESTER_STATIC})
+ add_library(bladebit_harvester SHARED)
+else()
+ add_library(bladebit_harvester STATIC)
+endif()
+
+
+set_property(TARGET bladebit_harvester PROPERTY PUBLIC_HEADER
+ src/harvesting/GreenReaper.h
+ src/harvesting/GreenReaperPortable.h)
+
+install(TARGETS bladebit_harvester
+ LIBRARY DESTINATION green_reaper/lib
+ ARCHIVE DESTINATION green_reaper/lib
+ PUBLIC_HEADER DESTINATION green_reaper/include
+)
+
+target_sources(bladebit_harvester PRIVATE
+ src/pch.cpp
+
+ src/pos/chacha8.cpp
+ src/pos/chacha8.h
+
+ src/fse/bitstream.h
+ src/fse/compiler.h
+ src/fse/debug.c
+ src/fse/debug.h
+ src/fse/entropy_common.c
+ src/fse/error_private.h
+ src/fse/error_public.h
+ src/fse/fse_compress.c
+ src/fse/fse_decompress.c
+ src/fse/fse.h
+ src/fse/hist.c
+ src/fse/hist.h
+ src/fse/huf.h
+ src/fse/mem.h
+
+ src/b3/blake3.c
+ src/b3/blake3_dispatch.c
+ src/b3/blake3.h
+ src/b3/blake3_impl.h
+ src/b3/blake3_portable.c
+
+ $<${is_x86}:
+ $<$<CXX_COMPILER_ID:MSVC>:
+ src/b3/blake3_sse41.c
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ >
+ $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:
+ src/b3/blake3_avx2_x86-64_unix.S
+ src/b3/blake3_avx512_x86-64_unix.S
+ src/b3/blake3_sse41_x86-64_unix.S
+ >
+ >
+
+
+ src/util/Log.cpp
+ src/util/Util.cpp
+ src/PlotContext.cpp
+ src/io/HybridStream.cpp
+ src/threading/AutoResetSignal.cpp
+ src/threading/Fence.cpp
+ src/threading/Semaphore.cpp
+ src/threading/ThreadPool.cpp
+ src/plotting/FSETableGenerator.cpp
+ src/plotting/PlotWriter.cpp
+ src/plotting/Compression.cpp
+ src/plotting/matching/GroupScan.cpp
+ src/plotdisk/DiskBufferQueue.cpp
+ src/plotting/WorkHeap.cpp
+ src/plotdisk/jobs/IOJob.cpp
+ src/harvesting/GreenReaper.cpp
+
+ src/bech32/segwit_addr.c
+
+ $<${have_cuda}:
+ cuda/harvesting/CudaThresher.cu
+ cuda/harvesting/CudaThresherFactory.cu
+ cuda/FxCuda.cu
+ cuda/CudaF1.cu
+ cuda/CudaMatch.cu
+ cuda/CudaPlotUtil.cu
+
+ # TODO: Remove this, ought not be needed in harvester
+ cuda/GpuStreams.cu
+ >
+
+ $<$<NOT:${have_cuda}>:
+ cuda/harvesting/CudaThresherDummy.cpp
+ >
+
+ $<$<PLATFORM_ID:Windows>:
+ src/platform/win32/SysHost_Win32.cpp
+ src/platform/win32/FileStream_Win32.cpp
+ src/platform/win32/Thread_Win32.cpp
+ >
+
+ $<$<PLATFORM_ID:Linux>:
+ src/platform/linux/SysHost_Linux.cpp
+ >
+
+ $<$<PLATFORM_ID:Darwin>:
+ src/platform/macos/SysHost_Macos.cpp
+ >
+
+ $<$<PLATFORM_ID:Linux,Darwin>:
+ src/platform/unix/FileStream_Unix.cpp
+ src/platform/unix/Thread_Unix.cpp
+ >
+
+ $<$<CXX_COMPILER_ID:MSVC>:
+ src/uint128_t/uint128_t.cpp
+ >
+)
+
+target_include_directories(bladebit_harvester PRIVATE src SYSTEM cuda INTERFACE src/harvesting)
+
+target_compile_features(bladebit_harvester PUBLIC cxx_std_17)
+
+target_compile_definitions(bladebit_harvester
+ PRIVATE
+ THRUST_IGNORE_CUB_VERSION_CHECK=1
+ GR_EXPORT=1
+
+ $<${have_cuda}:
+ BB_CUDA_ENABLED=1
+ >
+
+ PUBLIC
+ BB_IS_HARVESTER=1
+ INTERFACE
+ $<$<BOOL:${BB_HARVESTER_STATIC}>:GR_NO_IMPORT=1>
+)
+
+
+target_compile_options(bladebit_harvester PRIVATE
+ ${preinclude_pch}
+ ${cuda_archs}
+)
+
+if(have_cuda)
+ target_link_options(bladebit_harvester PUBLIC $<DEVICE_LINK:${cuda_archs}>)
+endif()
+
+target_link_libraries(bladebit_harvester
+ PRIVATE
+ bladebit_config
+ PUBLIC
+ Threads::Threads
+ $<${have_cuda}:CUDA::cudart_static>
+)
+
+if(CUDAToolkit_FOUND)
+ set_target_properties(bladebit_harvester PROPERTIES
+ EXCLUDE_FROM_ALL ON
+ MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>
+ CUDA_RUNTIME_LIBRARY Static
+ CUDA_SEPARABLE_COMPILATION ON
+ CUDA_RESOLVE_DEVICE_SYMBOLS ON
+ # CUDA_ARCHITECTURES OFF
+ )
+endif()
+
+ # Disable blake3 conversion loss of data warnings
+ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
+ set_source_files_properties(
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ src/b3/blake3_sse41.c
+ PROPERTIES COMPILE_FLAGS
+ /wd4244
+ )
+ endif()
+
diff --git a/README.md b/README.md
index a913c55f..9197014e 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,22 @@ A high-performance **k32-only**, Chia (XCH) plotter supporting in-RAM and disk-b
## Requirements
+## **GPU (CUDA) Plotter Requirements**
+
+
+**Supported system configurations for alpha:**
+
+| Component  | Requirement
+|------------|-------------------------------------------------------------------------------
+| **OS**     | Windows and Linux
+| **Memory** | **256GB** of system DRAM
+| **GPUs**   | NVIDIA GPUs with CUDA capability **5.2** and up with at least **8GB** of vRAM
+
+> See https://developer.nvidia.com/cuda-gpus for compatible GPUs.
+
+
+
### In-RAM
**416 GiB of RAM are required** to run it, and a few more megabytes for stack space and small allocations.
diff --git a/Tests.cmake b/Tests.cmake
new file mode 100644
index 00000000..577e541c
--- /dev/null
+++ b/Tests.cmake
@@ -0,0 +1,11 @@
+include(cmake_modules/FindCatch2.cmake)
+
+add_executable(tests ${src_bladebit})
+target_compile_definitions(tests PRIVATE
+ BB_TEST_MODE=1
+)
+target_link_libraries(tests PRIVATE bladebit_config Catch2::Catch2WithMain)
+
+set_target_properties(tests PROPERTIES
+ EXCLUDE_FROM_ALL ON
+)
\ No newline at end of file
diff --git a/VERSION b/VERSION
index 38f77a65..4a36342f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.1
+3.0.0
diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
new file mode 100644
index 00000000..6ec042c0
--- /dev/null
+++ b/cmake_modules/EmbedVersion.cmake
@@ -0,0 +1,41 @@
+
+if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
+ message("Embedding local build version")
+
+ set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
+
+ set(cmd_ver bash)
+ if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+ set(cmd_ver bash.exe)
+ endif()
+
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+
+ # Remove trailing whitespace incurred in windows gitbash
+ string(STRIP "${bb_ver_maj}" bb_ver_maj)
+ string(STRIP "${bb_ver_min}" bb_ver_min)
+ string(STRIP "${bb_ver_rev}" bb_ver_rev)
+ string(STRIP "${bb_ver_suffix}" bb_ver_suffix)
+ string(STRIP "${bb_ver_commit}" bb_ver_commit)
+
+ set(bb_ver_suffix ${bb_ver_suffix}-dev)
+
+ # This is slow on windows, so let's cache them
+ set(bb_ver_maj ${bb_ver_maj} CACHE STRING "")
+ set(bb_ver_min ${bb_ver_min} CACHE STRING "")
+ set(bb_ver_rev ${bb_ver_rev} CACHE STRING "")
+ set(bb_ver_suffix ${bb_ver_suffix} CACHE STRING "")
+ set(bb_ver_commit ${bb_ver_commit} CACHE STRING "")
+endif()
+
+if(NOT DEFINED ENV{CI})
+ add_compile_definitions(BLADEBIT_VERSION_MAJ=${bb_ver_maj})
+ add_compile_definitions(BLADEBIT_VERSION_MIN=${bb_ver_min})
+ add_compile_definitions(BLADEBIT_VERSION_REV=${bb_ver_rev})
+ add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
+ add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
+endif()
diff --git a/cmake_modules/FindCatch2.cmake b/cmake_modules/FindCatch2.cmake
index ddced5a8..c3623a9f 100644
--- a/cmake_modules/FindCatch2.cmake
+++ b/cmake_modules/FindCatch2.cmake
@@ -3,7 +3,7 @@ Include(FetchContent)
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
- GIT_TAG v3.0.0-preview4
+ GIT_TAG v3.3.2
)
FetchContent_MakeAvailable(Catch2)
diff --git a/cuda/CudaF1.cu b/cuda/CudaF1.cu
new file mode 100644
index 00000000..5f51e8f5
--- /dev/null
+++ b/cuda/CudaF1.cu
@@ -0,0 +1,175 @@
+#include "CudaF1.h"
+#include "CudaUtil.h"
+#include "ChiaConsts.h"
+
+/// #NOTE: Code duplicated from chacha8.cu for now.
+/// #TODO: Refactor and consolidate
+
+
+#define U32TO32_LITTLE(v) CuBSwap32(v)
+#define U8TO32_LITTLE(p) (*(const uint32_t *)(p))
+#define U32TO8_LITTLE(p, v) (((uint32_t *)(p))[0] = U32TO32_LITTLE(v))
+#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+
+#define ROTATE(v, c) (ROTL32(v, c))
+#define XOR(v, w) ((v) ^ (w))
+#define PLUS(v, w) ((v) + (w))
+#define PLUSONE(v) (PLUS((v), 1))
+
+#define QUARTERROUND(a, b, c, d) \
+ a = PLUS(a, b); \
+ d = ROTATE(XOR(d, a), 16); \
+ c = PLUS(c, d); \
+ b = ROTATE(XOR(b, c), 12); \
+ a = PLUS(a, b); \
+ d = ROTATE(XOR(d, a), 8); \
+ c = PLUS(c, d); \
+ b = ROTATE(XOR(b, c), 7)
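+
+// NOTE: QUARTERROUND is the standard ChaCha quarter-round. Each iteration of the
+// main loop below applies 4 column and 4 diagonal quarter-rounds (one double round),
+// and the loop runs 4 times, i.e. the 8 rounds of ChaCha8.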
+
+
+// 128 threads per cuda block, each thread will do one chacha block
+#define CHACHA_BLOCKS_PER_CUDA_BLOCK 128ull
+
+//-----------------------------------------------------------
+__global__ void chacha8_get_keystream_cuda_k32(
+ const CudaPlotInfo info,
+ const uint32_t* input,
+ const uint64_t chachaBlockBase,
+ uint64* outY,
+ uint32* outX )
+{
+ extern __shared__ uint32 sharedBucketCounts[];
+
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ const uint64_t chachaBlock = chachaBlockBase + blockIdx.x * CHACHA_BLOCKS_PER_CUDA_BLOCK + id;
+
+
+ uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
+
+ j0 = input[0];
+ j1 = input[1];
+ j2 = input[2];
+ j3 = input[3];
+ j4 = input[4];
+ j5 = input[5];
+ j6 = input[6];
+ j7 = input[7];
+ j8 = input[8];
+ j9 = input[9];
+ j10 = input[10];
+ j11 = input[11];
+ j12 = (uint32_t)chachaBlock;
+ j13 = (uint32_t)(chachaBlock >> 32);
+ j14 = input[14];
+ j15 = input[15];
+
+ // #TODO: Dispatch a different kernel to set the x's
+ x0 = j0;
+ x1 = j1;
+ x2 = j2;
+ x3 = j3;
+ x4 = j4;
+ x5 = j5;
+ x6 = j6;
+ x7 = j7;
+ x8 = j8;
+ x9 = j9;
+ x10 = j10;
+ x11 = j11;
+ x12 = j12;
+ x13 = j13;
+ x14 = j14;
+ x15 = j15;
+
+ #pragma unroll
+ for( int i = 8; i > 0; i -= 2 )
+ {
+ QUARTERROUND( x0, x4, x8 , x12 );
+ QUARTERROUND( x1, x5, x9 , x13 );
+ QUARTERROUND( x2, x6, x10, x14 );
+ QUARTERROUND( x3, x7, x11, x15 );
+ QUARTERROUND( x0, x5, x10, x15 );
+ QUARTERROUND( x1, x6, x11, x12 );
+ QUARTERROUND( x2, x7, x8 , x13 );
+ QUARTERROUND( x3, x4, x9 , x14 );
+ }
+
+ const uint32 x = (uint32)(chachaBlock * 16); // X start offset
+ const uint32 out = gid * (kF1BlockSize / sizeof(uint32));
+
+ const uint32 xo0 = x + 0 ;
+ const uint32 xo1 = x + 1 ;
+ const uint32 xo2 = x + 2 ;
+ const uint32 xo3 = x + 3 ;
+ const uint32 xo4 = x + 4 ;
+ const uint32 xo5 = x + 5 ;
+ const uint32 xo6 = x + 6 ;
+ const uint32 xo7 = x + 7 ;
+ const uint32 xo8 = x + 8 ;
+ const uint32 xo9 = x + 9 ;
+ const uint32 xo10 = x + 10;
+ const uint32 xo11 = x + 11;
+ const uint32 xo12 = x + 12;
+ const uint32 xo13 = x + 13;
+ const uint32 xo14 = x + 14;
+ const uint32 xo15 = x + 15;
+
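+ // Each y below is the byte-swapped 32-bit keystream word, widened to 64 bits and
+ // shifted left by kExtraBits, with the top kExtraBits bits of its x appended,
+ // giving a (k + kExtraBits)-bit F1 output; the raw x values are stored separately.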
+ outY[out+0 ] = (((uint64)CuBSwap32( PLUS( x0 , j0 ) )) << kExtraBits) | (xo0 >> (info.k - kExtraBits));
+ outY[out+1 ] = (((uint64)CuBSwap32( PLUS( x1 , j1 ) )) << kExtraBits) | (xo1 >> (info.k - kExtraBits));
+ outY[out+2 ] = (((uint64)CuBSwap32( PLUS( x2 , j2 ) )) << kExtraBits) | (xo2 >> (info.k - kExtraBits));
+ outY[out+3 ] = (((uint64)CuBSwap32( PLUS( x3 , j3 ) )) << kExtraBits) | (xo3 >> (info.k - kExtraBits));
+ outY[out+4 ] = (((uint64)CuBSwap32( PLUS( x4 , j4 ) )) << kExtraBits) | (xo4 >> (info.k - kExtraBits));
+ outY[out+5 ] = (((uint64)CuBSwap32( PLUS( x5 , j5 ) )) << kExtraBits) | (xo5 >> (info.k - kExtraBits));
+ outY[out+6 ] = (((uint64)CuBSwap32( PLUS( x6 , j6 ) )) << kExtraBits) | (xo6 >> (info.k - kExtraBits));
+ outY[out+7 ] = (((uint64)CuBSwap32( PLUS( x7 , j7 ) )) << kExtraBits) | (xo7 >> (info.k - kExtraBits));
+ outY[out+8 ] = (((uint64)CuBSwap32( PLUS( x8 , j8 ) )) << kExtraBits) | (xo8 >> (info.k - kExtraBits));
+ outY[out+9 ] = (((uint64)CuBSwap32( PLUS( x9 , j9 ) )) << kExtraBits) | (xo9 >> (info.k - kExtraBits));
+ outY[out+10] = (((uint64)CuBSwap32( PLUS( x10, j10 ) )) << kExtraBits) | (xo10 >> (info.k - kExtraBits));
+ outY[out+11] = (((uint64)CuBSwap32( PLUS( x11, j11 ) )) << kExtraBits) | (xo11 >> (info.k - kExtraBits));
+ outY[out+12] = (((uint64)CuBSwap32( PLUS( x12, j12 ) )) << kExtraBits) | (xo12 >> (info.k - kExtraBits));
+ outY[out+13] = (((uint64)CuBSwap32( PLUS( x13, j13 ) )) << kExtraBits) | (xo13 >> (info.k - kExtraBits));
+ outY[out+14] = (((uint64)CuBSwap32( PLUS( x14, j14 ) )) << kExtraBits) | (xo14 >> (info.k - kExtraBits));
+ outY[out+15] = (((uint64)CuBSwap32( PLUS( x15, j15 ) )) << kExtraBits) | (xo15 >> (info.k - kExtraBits));
+
+ outX[out+0 ] = xo0 ;
+ outX[out+1 ] = xo1 ;
+ outX[out+2 ] = xo2 ;
+ outX[out+3 ] = xo3 ;
+ outX[out+4 ] = xo4 ;
+ outX[out+5 ] = xo5 ;
+ outX[out+6 ] = xo6 ;
+ outX[out+7 ] = xo7 ;
+ outX[out+8 ] = xo8 ;
+ outX[out+9 ] = xo9 ;
+ outX[out+10] = xo10;
+ outX[out+11] = xo11;
+ outX[out+12] = xo12;
+ outX[out+13] = xo13;
+ outX[out+14] = xo14;
+ outX[out+15] = xo15;
+}
+
+void CudaGenF1K32(
+ const CudaPlotInfo& info,
+ const uint32* devChaChhaInput,
+ const uint64 chachaBlockBase,
+ const uint32 chachaBlockCount,
+ uint64* devOutY,
+ uint32* devOutX,
+ cudaStream_t stream )
+{
+ const uint32 cuThreads = CHACHA_BLOCKS_PER_CUDA_BLOCK;
+ const uint32 cuBlocks = CDiv( chachaBlockCount, cuThreads );
+
+ chacha8_get_keystream_cuda_k32<<<cuBlocks, cuThreads, 0, stream>>>(
+ info,
+ devChaChhaInput,
+ chachaBlockBase,
+ devOutY,
+ devOutX
+ );
+}
+
diff --git a/cuda/CudaF1.h b/cuda/CudaF1.h
new file mode 100644
index 00000000..811f487a
--- /dev/null
+++ b/cuda/CudaF1.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+struct CudaPlotInfo;
+
+void CudaGenF1K32(
+ const CudaPlotInfo& info,
+ const uint32* devChaChhaInput,
+ const uint64 chachaBlockBase,
+ const uint32 chachaBlockCount,
+ uint64* devOutY,
+ uint32* devOutX,
+ cudaStream_t stream );
diff --git a/cuda/CudaFSE.cuh b/cuda/CudaFSE.cuh
new file mode 100644
index 00000000..3b263e98
--- /dev/null
+++ b/cuda/CudaFSE.cuh
@@ -0,0 +1,204 @@
+#pragma once
+
+#define FSE_STATIC_LINKING_ONLY 1
+#include "fse/fse.h"
+#include "fse/bitstream.h"
+#undef FSE_STATIC_LINKING_ONLY
+
+#include "CudaPlotContext.h"
+
+#ifdef _WIN32
+__pragma( pack( push, 1 ) )
+typedef struct { U16 v; } unalign16;
+typedef struct { U32 v; } unalign32;
+typedef struct { U64 v; } unalign64;
+typedef struct { size_t v; } unalignArch;
+__pragma( pack( pop ) )
+#endif
+
+__constant__ unsigned CUDA_FSE_BIT_mask[32];
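+// NOTE: Populated from the host by InitFSEBitMask() (see CudaParkSerializer.cu)
+// before any compression kernel runs; it mirrors FSE's BIT_mask table.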
+
+#define CU_FSE_PREFIX(name) FSE_error_##name
+#define CU_FSE_ERROR(name) ((size_t)-CU_FSE_PREFIX(name))
+
+__device__ __forceinline__ unsigned CUDA_ERR_isError(size_t code) { return (code > CU_FSE_ERROR(maxCode)); }
+__device__ __forceinline__ unsigned CUDA_FSE_isError(size_t code) { return CUDA_ERR_isError(code); }
+
+
+__device__ __forceinline__ U16 CUDA_MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
+
+__device__ __forceinline__ void CUDA_MEM_writeLEST(void* memPtr, size_t val) { ((unalign64*)memPtr)->v = (U64)val; }
+
+__device__ __forceinline__ void CUDA_BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+ CUDA_ASSERT(BIT_MASK_SIZE == 32);
+ CUDA_ASSERT(nbBits < BIT_MASK_SIZE);
+ CUDA_ASSERT(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+
+ bitC->bitContainer |= (value & CUDA_FSE_BIT_mask[nbBits]) << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+__device__ __forceinline__ void CUDA_BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+ CUDA_ASSERT((value>>nbBits) == 0);
+ CUDA_ASSERT(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ bitC->bitContainer |= value << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+__device__ __forceinline__ void CUDA_BIT_flushBits(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ CUDA_ASSERT(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ CUDA_MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+__device__ __forceinline__ void CUDA_BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ CUDA_ASSERT(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ CUDA_MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ CUDA_ASSERT(bitC->ptr <= bitC->endPtr);
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+__device__ __forceinline__ size_t CUDA_BIT_closeCStream(BIT_CStream_t* bitC)
+{
+ CUDA_BIT_addBitsFast(bitC, 1, 1); /* endMark */
+ CUDA_BIT_flushBits(bitC);
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+__device__ __forceinline__ size_t CUDA_BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t dstCapacity)
+{
+ bitC->bitContainer = 0;
+ bitC->bitPos = 0;
+ bitC->startPtr = (char*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+ if (dstCapacity <= sizeof(bitC->bitContainer)) return CU_FSE_ERROR(dstSize_tooSmall);
+ return 0;
+}
+
+__device__ __forceinline__ void CUDA_FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+ const void* ptr = ct;
+ const U16* u16ptr = (const U16*) ptr;
+ const U32 tableLog = CUDA_MEM_read16(ptr);
+ statePtr->value = (ptrdiff_t)1<<tableLog;
+ statePtr->stateTable = u16ptr+2;
+ statePtr->symbolTT = ((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
+ statePtr->stateLog = tableLog;
+}
+
+__device__ __forceinline__ void CUDA_FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+ CUDA_FSE_initCState(statePtr, ct);
+ { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* stateTable = (const U16*)(statePtr->stateTable);
+ U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+ statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+ statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+ }
+}
+
+__device__ __forceinline__ void CUDA_FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{
+ FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* const stateTable = (const U16*)(statePtr->stateTable);
+ U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+ CUDA_BIT_addBits(bitC, statePtr->value, nbBitsOut);
+ statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+__device__ __forceinline__ void CUDA_FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+ CUDA_BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+ CUDA_BIT_flushBits(bitC);
+}
+
+template<int EntryCount>
+__device__ size_t CUDA_FSE_compress_usingCTable(
+ void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const FSE_CTable* ct )
+{
+ const byte* const istart = (const byte*) src;
+ const byte* const iend = istart + srcSize;
+ const byte* ip=iend;
+
+ BIT_CStream_t bitC;
+ FSE_CState_t CState1, CState2;
+
+ /* init */
+ CUDA_ASSERT( srcSize > 2 );
+ CUDA_ASSERT( srcSize == (size_t)EntryCount );
+ CUDA_ASSERT( (uintptr_t)(ip - istart) == (uintptr_t)EntryCount );
+
+ // if (srcSize <= 2) return 0;
+ {
+ size_t const initError = CUDA_BIT_initCStream(&bitC, dst, dstSize);
+ CUDA_ASSERT( !CUDA_FSE_isError(initError) );
+
+ #if _DEBUG
+ // if (FSE_isError(initError))
+ // return 0; /* not enough space available to write a bitstream */
+ #endif
+ }
+
+ #define FSE_FLUSHBITS(s) CUDA_BIT_flushBitsFast(s)
+
+ // if (srcSize & 1)
+ {
+ CUDA_FSE_initCState2(&CState1, ct, *--ip);
+ CUDA_FSE_initCState2(&CState2, ct, *--ip);
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+ // else {
+ // CUDA_FSE_initCState2(&CState2, ct, *--ip);
+ // CUDA_FSE_initCState2(&CState1, ct, *--ip);
+ // }
+
+ /* join to mod 4 */
+ srcSize -= 2;
+ if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) { /* test bit 2 */
+ CUDA_FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ /* 2 or 4 encoding per loop */
+ // while ( ip>istart )
+ #pragma unroll
+ for( int32 i = 0; i < EntryCount / 4; i ++ )
+ {
+ CUDA_FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+ // if constexpr (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */
+ // FSE_FLUSHBITS(&bitC);
+
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+ // if constexpr (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) { /* this test must be static */
+ CUDA_FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ // }
+
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ CUDA_FSE_flushCState(&bitC, &CState2);
+ CUDA_FSE_flushCState(&bitC, &CState1);
+
+ #undef FSE_FLUSHBITS
+ return CUDA_BIT_closeCStream(&bitC);
+}
diff --git a/cuda/CudaFx.h b/cuda/CudaFx.h
new file mode 100644
index 00000000..66d282cb
--- /dev/null
+++ b/cuda/CudaFx.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <cuda_runtime.h>
+#include "plotting/Tables.h"
+
+struct Pair;
+void CudaFxHarvestK32(
+ TableId table,
+ uint64* devYOut,
+ void* devMetaOut,
+ uint32 matchCount,
+ const Pair* devPairsIn,
+ const uint64* devYIn,
+ const void* devMetaIn,
+ cudaStream_t stream );
\ No newline at end of file
diff --git a/cuda/CudaMatch.cu b/cuda/CudaMatch.cu
new file mode 100644
index 00000000..e827547f
--- /dev/null
+++ b/cuda/CudaMatch.cu
@@ -0,0 +1,669 @@
+#include "CudaPlotContext.h"
+#include "ChiaConsts.h"
+#include "CudaMatch.h"
+
+#define CU_MATCH_THREAD_COUNT (kExtraBitsPow)
+
+#define BBCU_SCAN_GROUP_THREADS 128
+#define BBCU_THREADS_PER_MATCH_GROUP 352
+static constexpr uint32 BBCU_MAX_ENTRIES_PER_GROUP = 238;
+static constexpr uint32 BBCU_MIN_ENTRIES_PER_GROUP = 230;
+static constexpr uint32 BBCU_MIN_GROUP_COUNT = ( CuCDiv( BBCU_BUCKET_ENTRY_COUNT, BBCU_MAX_ENTRIES_PER_GROUP ) );
+static constexpr uint32 BBCU_MAX_GROUP_COUNT = ( CuCDiv( BBCU_BUCKET_ENTRY_COUNT, BBCU_MIN_ENTRIES_PER_GROUP ) );
+
+static_assert( CU_MAX_BC_GROUP_BOUNDARIES >= BBCU_MAX_GROUP_COUNT );
+
+// #NOTE: The above have been tuned for 128 buckets, should check them for other bucket counts.
+//static_assert( BBCU_BUCKET_COUNT == 128, "Unexpected bucket count" );
+
+//-----------------------------------------------------------
+__forceinline__ __device__ uint16 GenLTarget( const uint16 parity, const uint16 rTargetIdx, const uint16 m )
+{
+ const uint16 indJ = rTargetIdx / kC;
+ return ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + rTargetIdx) % kC);
+}
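+
+// GenLTarget computes, for an L entry's y offset within its BC group and a miss
+// value m in [0, kExtraBitsPow), the R-group-local y that would satisfy the Chia
+// matching condition (kB = 119, kC = 127, kBC = kB * kC). The kernels below call it
+// once per m and either look the result up in an R-side map or compare it directly
+// against the local R ys.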
+
+//-----------------------------------------------------------
+__global__ void CudaInitGroupsBucket( uint32* entries )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ entries[gid] = 0xFFFFFFFF;
+}
+
+//-----------------------------------------------------------
+__global__ void CudaInitGroups( uint32* entries, const uint32 entryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= entryCount )
+ return;
+
+ entries[gid] = 0xFFFFFFFF;
+}
+
+//-----------------------------------------------------------
+__global__ void CudaSetFirstAndLastGroup( uint32* groups, const uint32 entryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( id == 0 )
+ groups[id] = 0;
+ else
+ groups[id] = entryCount;
+}
+
+//-----------------------------------------------------------
+__global__ void ScanGroupsCudaK32Bucket( const uint32* yEntries, uint32* groupBoundaries, uint32* gGroupCount, const uint32 entryCount, const uint64 bucketMask )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= entryCount-1 )
+ return;
+
+ __shared__ uint32 sharedGroupCount;
+ if( id == 0 )
+ sharedGroupCount = 0;
+
+ __syncthreads();
+
+ const uint64 currentGroup = ( bucketMask | yEntries[gid] ) / kBC;
+ const uint64 nextGroup = ( bucketMask | yEntries[gid+1] ) / kBC;
+
+ uint32 offset;
+ if( currentGroup != nextGroup )
+ {
+ // #TODO: Use cooperative groups here instead, so we can just sync these threads
+ offset = atomicAdd( &sharedGroupCount, 1 );
+ }
+
+ __syncthreads();
+
+ // Global sync
+ if( id == 0 )
+ sharedGroupCount = atomicAdd( gGroupCount, sharedGroupCount );
+
+ __syncthreads();
+
+ if( currentGroup != nextGroup )
+ {
+ CUDA_ASSERT( sharedGroupCount + offset < CU_MAX_BC_GROUP_BOUNDARIES );
+ groupBoundaries[sharedGroupCount + offset] = gid+1;
+ }
+}
+
+//-----------------------------------------------------------
+__global__ void MatchCudaK32Bucket( const uint64 bucketMask, const uint32 entryCount, const uint32* gGroupCounts, const uint32* yEntries, const uint32* groupBoundaries, uint32* gMatchCount, Pair* outPairs )
+{
+ // 1 thread per y
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( groupIdx >= *gGroupCounts )
+ return;
+
+ const uint32 groupLIdx = groupBoundaries[groupIdx];
+ const uint32 groupRIdx = groupBoundaries[groupIdx+1];
+ const uint32 groupREnd = groupBoundaries[groupIdx+2];
+ const uint64 groupL = ( bucketMask | yEntries[groupLIdx] ) / kBC;
+ const uint64 groupR = ( bucketMask | yEntries[groupRIdx] ) / kBC;
+ const uint32 groupRLength = groupREnd - groupRIdx;
+ const uint64 groupLYStart = groupL * kBC;
+ const uint64 groupRYStart = groupR * kBC;
+ const uint32 groupLLength = groupRIdx - groupLIdx;
+
+#if _DEBUG
+ if( groupLLength >= BBCU_THREADS_PER_MATCH_GROUP || groupRLength >= BBCU_THREADS_PER_MATCH_GROUP )
+ printf( "[%u] Group %u is too large: %u\n", gid, groupRIdx, ( groupRIdx - groupLIdx ) );
+#endif
+ CUDA_ASSERT( groupLLength <= BBCU_THREADS_PER_MATCH_GROUP );
+ CUDA_ASSERT( groupRLength <= BBCU_THREADS_PER_MATCH_GROUP );
+
+ // Generate R group map
+ __shared__ uint32 rMap[kBC/2+1];
+ __shared__ uint32 sharedMatchCount;
+
+ if( groupR - groupL != 1 )
+ return;
+
+ if( id == 0 )
+ sharedMatchCount = 0;
+
+ const uint16 localLY = (uint16)(( bucketMask | yEntries[groupLIdx + min(id, groupLLength-1)] ) - groupLYStart );
+ const uint16 localRY = (uint16)(( bucketMask | yEntries[groupRIdx + min(id, groupRLength-1)] ) - groupRYStart );
+
+ // #TODO: See about using coop_threads here
+ {
+ {
+ uint32 mapIdx = id;
+ while( mapIdx < kBC/2+1 )
+ {
+ // Each entry is:
+ // hi lo
+ // 7 9 7 9
+ //(count,offset|count,offset)
+ rMap[mapIdx] = 0x01FF01FF;
+ mapIdx += BBCU_THREADS_PER_MATCH_GROUP;
+ }
+ }
+
+ __syncthreads();
+
+ const uint16 shift = ( ( localRY & 1 ) << 4 ); // Shift left by 16 bits if odd
+ const uint32 idx = localRY >> 1; // Divide by 2
+
+ // First set the offsets for the even ones (lower bits)
+ if( id < groupRLength && ( localRY & 1 ) == 0 )
+ atomicMin( &rMap[idx], id | 0x01FF0000 );
+
+ __syncthreads();
+
+ // Then set offset for the odd ones
+ if( id < groupRLength && ( localRY & 1 ) )
+ atomicMin( &rMap[idx], (id << 16) | (rMap[idx] & 0x0000FFFF) );
+
+ __syncthreads();
+
+ // Finally, add the counts
+ if( id < groupRLength )
+ atomicAdd( &rMap[idx], 0x200ul << shift );
+ }
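+
+ // For illustration: after the three passes above, a slot holding one even entry at
+ // local offset 3 and two odd entries starting at offset 7 reads rMap[idx] = 0x04070203,
+ // i.e. (2 << 9) | 7 in the high 16 bits and (1 << 9) | 3 in the low 16 bits.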
+
+ if( id >= groupLLength )
+ return;
+
+ __syncthreads();
+
+
+ // Begin matching
+ constexpr uint32 MAX_MATCHES = 16;
+ Pair matches[MAX_MATCHES];
+ uint32 matchCount = 0;
+
+ #pragma unroll
+ for( uint32 i = 0; i < kExtraBitsPow; i++ )
+ {
+ const uint16 lTarget = GenLTarget( (byte)(groupL & 1), localLY, (uint16)i );
+ const uint16 shift = ( ( lTarget & 1 ) << 4 ); // Shift left by 16 bits if odd
+ const uint16 rValue = (uint16)(rMap[lTarget>>1] >> shift);
+ const int16 rCount = (int16)(rValue >> 9);
+
+ for( int32 j = 0; j < rCount; j++ )
+ {
+ CUDA_ASSERT( matchCount < MAX_MATCHES );
+ matches[matchCount++] = { groupLIdx + id, groupRIdx + (rValue & 0x1FF) + j };
+ }
+ }
+
+ // Store final values
+ const uint32 copyOffset = atomicAdd( &sharedMatchCount, matchCount );
+ __syncthreads();
+
+ // Store our shared match count and get our global offset
+ if( id == 0 )
+ sharedMatchCount = atomicAdd( gMatchCount, sharedMatchCount );
+ __syncthreads();
+
+ outPairs += copyOffset + sharedMatchCount;
+
+ for( uint32 i = 0; i < matchCount; i++ )
+ {
+ CUDA_ASSERT( matches[i].left < entryCount );
+ CUDA_ASSERT( matches[i].right < entryCount );
+
+ outPairs[i] = matches[i];
+ }
+}
+
+/// This kernel, meant for harvesting compressed k32 plots,
+/// matches adjacent BC groups with 64 threads per block.
+/// Each block represents 1 L entry, and each thread performs
+/// one of the 2^kExtraBits (64) match-target iterations required
+/// per L entry during normal matching.
+/// Since compressed groups are small, we expect this
+/// to be a reasonable way to implement matching,
+/// vs. the way it is implemented in plotting, where the group
+/// sizes are exploited.
+//-----------------------------------------------------------
+__global__ void HarvestMatchK32Kernel(
+ Pair* gOutMatches,
+ uint32* gOutMatchCount,
+ const uint64* yEntries,
+ const uint32 entryCount,
+ const uint32 matchOffset
+)
+{
+ const uint32 id = threadIdx.x;
+ const uint32 yIdx = blockIdx.x;
+ const uint32 gid = yIdx + id;
+
+ CUDA_ASSERT( id < 64 );
+
+ constexpr uint32 SHARED_R_BUF_SIZE = 64;
+ constexpr uint32 MAX_MATCHES = 16;
+
+ // Read rGroup entries into a shared buffer
+ __shared__ uint64 rBuf[SHARED_R_BUF_SIZE];
+ __shared__ uint32 sharedMatchCount;
+ __shared__ uint64 lYShared;
+
+ // Find group boundaries
+ __shared__ uint32 lGroupStart;
+ __shared__ uint32 rGroupStartShared;
+ __shared__ uint32 rGroupEnd;
+
+ uint64 myY = 0xFFFFFFFFFFFFFFFF;
+ if( gid < entryCount )
+ myY = yEntries[gid];
+
+
+ if( id == 0 )
+ {
+ lYShared = myY;
+ sharedMatchCount = 0;
+ rGroupStartShared = 0xFFFFFFFF;
+ rGroupEnd = 0xFFFFFFFF;
+ }
+ __syncthreads();
+
+ const uint32 groupL = (uint32)(lYShared / kBC);
+ const uint32 myGroup = (uint32)(myY / kBC);
+
+ if( myGroup - groupL == 1 )
+ atomicMin( &rGroupStartShared, id );
+
+ __syncthreads();
+
+ // Not an adjacent group, exit
+ if( rGroupStartShared == 0xFFFFFFFF )
+ return;
+
+ const uint32 rGroupStart = rGroupStartShared;
+
+ // Store our read Y into shared value buffer
+ rBuf[id] = myY;
+ __syncthreads();
+
+ // Now find the R group end
+ // Notice we store the global index here, not the block-local one,
+ // like we did for rGroupStart
+ const uint32 groupR = (uint32)( rBuf[rGroupStart] / kBC);
+ if( myGroup > groupR )
+ atomicMin( &rGroupEnd, gid );
+
+ __syncthreads();
+
+ // Is it the last R group?
+ if( id == 0 && rGroupEnd == 0xFFFFFFFF )
+ rGroupEnd = entryCount;
+
+ __syncthreads();
+ CUDA_ASSERT( rGroupEnd < 0xFFFFFFFF );
+
+ // We should have all the info we need to match this Y now
+ const uint32 rGroupLength = rGroupEnd - (yIdx + rGroupStart);
+
+ const uint64 lY = lYShared;
+ const uint64 groupLYStart = ((uint64)groupL) * kBC;
+ const uint64 groupRYStart = ((uint64)groupR) * kBC;
+ const uint16 localLY = (uint16)(lY - groupLYStart);
+
+ const uint16 lTarget = GenLTarget( (byte)(groupL & 1), localLY, (uint16)id );
+
+ Pair matches[MAX_MATCHES];
+ uint32 matchCount = 0;
+
+ #pragma unroll
+ for( uint32 i = rGroupStart; i < (rGroupStart+rGroupLength); i++ )
+ {
+ const uint64 rY = rBuf[i];
+ const uint16 localRY = (uint16)(rY - groupRYStart);
+
+ if( lTarget == localRY )
+ {
+ CUDA_ASSERT( matchCount <= MAX_MATCHES );
+ matches[matchCount++] = { matchOffset + yIdx, matchOffset + yIdx + i };
+ }
+ }
+
+ // Store matches into global memory
+ const uint32 offset = atomicAdd( &sharedMatchCount, matchCount );
+
+ __syncthreads();
+ if( sharedMatchCount == 0 )
+ return;
+
+ if( id == 0 )
+ sharedMatchCount = atomicAdd( gOutMatchCount, sharedMatchCount );
+
+ __syncthreads();
+
+ // Copy matches to global buffer
+ const uint32 out = sharedMatchCount + offset;
+
+ for( uint32 i = 0; i < matchCount; i++ )
+ gOutMatches[out+i] = matches[i];
+}
+
+//-----------------------------------------------------------
+__global__ void MatchCudaK32KernelInternal(
+ Pair* outPairs,
+ uint32* gMatchCount,
+ const uint32 entryCount,
+ const uint32* gGroupCounts,
+ const uint64* yEntries,
+ const uint32* groupBoundaries )
+{
+ // 1 thread per y
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( groupIdx >= *gGroupCounts )
+ return;
+
+ const uint32 groupLIdx = groupBoundaries[groupIdx];
+ const uint32 groupRIdx = groupBoundaries[groupIdx+1];
+ const uint32 groupREnd = groupBoundaries[groupIdx+2];
+ const uint64 groupL = yEntries[groupLIdx] / kBC;
+ const uint64 groupR = yEntries[groupRIdx] / kBC;
+ const uint32 groupRLength = groupREnd - groupRIdx;
+ const uint64 groupLYStart = groupL * kBC;
+ const uint64 groupRYStart = groupR * kBC;
+ const uint32 groupLLength = groupRIdx - groupLIdx;
+
+#if _DEBUG
+ if( groupLLength >= BBCU_THREADS_PER_MATCH_GROUP || groupRLength >= BBCU_THREADS_PER_MATCH_GROUP )
+ printf( "[%u] Group %u is too large: %u\n", gid, groupRIdx, ( groupRIdx - groupLIdx ) );
+#endif
+ CUDA_ASSERT( groupLLength <= BBCU_THREADS_PER_MATCH_GROUP );
+ CUDA_ASSERT( groupRLength <= BBCU_THREADS_PER_MATCH_GROUP );
+
+ // Each rMap entry is:
+ // hi lo
+ // 7 9 7 9
+ //(count,offset|count,offset)
+ uint32 rMap[kBC/2+1] = {};
+
+ __shared__ uint32 sharedMatchCount;
+ if( id == 0 )
+ sharedMatchCount = 0;
+ __syncthreads();
+
+ if( groupR - groupL != 1 )
+ return;
+
+ const uint16 localLY = (uint16)( yEntries[groupLIdx + min(id, groupLLength-1)] - groupLYStart );
+ const uint16 localRYBase = (uint16)( yEntries[groupRIdx + min(id, groupRLength-1)] - groupRYStart );
+
+ // Packed rMap. 2 entries (of count and offset) per DWORD
+ for( uint32 i = 0; i < groupRLength; i++ )
+ {
+ const uint16 localRY = localRYBase + (uint16)i;
+
+ const uint32 idx = localRY >> 1; // Index in the rMap (Divide by 2)
+ const uint32 value = rMap[idx];
+
+ // Increase the count and set the offset
+ if( (localRY & 1) == 0 )
+ {
+ // Even value, store in the LSbits
+ rMap[idx] = (value + 0x200) | i;
+ }
+ else
+ {
+ // Odd value, store in the MSbits
+ rMap[idx] = (value + 0x2000000) | (i << 16);
+ }
+ }
+ __syncthreads();
+
+
+ // Begin matching
+ constexpr uint32 MAX_MATCHES = 16;
+ Pair matches[MAX_MATCHES];
+ uint32 matchCount = 0;
+
+ #pragma unroll
+ for( uint32 i = 0; i < kExtraBitsPow; i++ )
+ {
+ const uint16 lTarget = GenLTarget( (byte)(groupL & 1), localLY, (uint16)i );
+ const uint16 shift = ( ( lTarget & 1 ) << 4 ); // Shift left by 16 bits if odd
+ const uint16 rValue = (uint16)(rMap[lTarget>>1] >> shift);
+ const int16 rCount = (int16)(rValue >> 9);
+
+ for( int32 j = 0; j < rCount; j++ )
+ {
+ if( matchCount >= MAX_MATCHES )
+ {
+ printf( "[%u] We got too many (i=%u) matches: %u\n", gid, i, matchCount );
+ }
+ CUDA_ASSERT( matchCount < MAX_MATCHES );
+ matches[matchCount++] = { groupLIdx + id, groupRIdx + (rValue & 0x1FF) + j };
+ }
+ }
+
+ // Store final values
+ const uint32 copyOffset = atomicAdd( &sharedMatchCount, matchCount );
+ __syncthreads();
+
+ // Store our shared match count and get our global offset
+ if( id == 0 )
+ sharedMatchCount = atomicAdd( gMatchCount, sharedMatchCount );
+ __syncthreads();
+
+ outPairs += copyOffset + sharedMatchCount;
+
+ for( uint32 i = 0; i < matchCount; i++ )
+ {
+ CUDA_ASSERT( matches[i].left < entryCount );
+ CUDA_ASSERT( matches[i].right < entryCount );
+
+ outPairs[i] = matches[i];
+ }
+}
+
+//-----------------------------------------------------------
+__global__ void MatchK32Kernel(
+ Pair* outPairs,
+ uint32* gMatchCount,
+ const uint32 entryCount,
+ const uint32* gGroupCounts,
+ const uint64* yEntries,
+ const uint32* groupBoundaries )
+{
+ // CUDA_ASSERT( blockDim.x == 1 );
+ // CUDA_ASSERT( blockIdx.x == 0 );
+ // CUDA_ASSERT( threadIdx.x == 0 );
+
+
+ // const uint32 groupCount = *gGroupCounts;
+ // const uint32 entriesPerGroup = (entryCount / groupCount) + 6;
+
+ // const uint32 blocks = groupCount;
+ // const uint32 threads = entriesPerGroup;
+
+// HarvestMatchK32Kernel<<<blocks, threads>>>(
+ // gMatchCount,
+ // const uint32 lGroupIdx,
+ // const uint32 lYIdx,
+ // const uint32 rGroupIdx,
+ // const uint32 rGroupLength,
+ // const uint64* yEntries
+
+// MatchCudaK32KernelInternal<<<blocks, threads>>>( outPairs, gMatchCount, entryCount, gGroupCounts, yEntries, groupBoundaries );
+
+ // const cudaError_t err = cudaGetLastError();
+ // assert( err == cudaSuccess );
+}
+
+//-----------------------------------------------------------
+__global__ void ScanGroupsK32Kernel(
+ uint32* groupIndices,
+ uint32* outGroupCount,
+ const uint64* yEntries,
+ const uint32 entryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedGroupCount;
+
+ if( id == 0 )
+ sharedGroupCount = 0;
+ __syncthreads();
+
+ if( gid >= entryCount-1 )
+ return;
+
+ const uint32 currentGroup = (uint32)(yEntries[gid] / kBC);
+ const uint32 nextGroup = (uint32)(yEntries[gid+1] / kBC);
+
+ uint32 offset;
+ if( currentGroup != nextGroup )
+ offset = atomicAdd( &sharedGroupCount, 1 );
+
+ __syncthreads();
+
+ // Global sync
+ if( id == 0 )
+ sharedGroupCount = atomicAdd( outGroupCount, sharedGroupCount );
+
+ __syncthreads();
+
+ if( currentGroup != nextGroup )
+ groupIndices[sharedGroupCount + offset] = gid+1;
+ // // CUDA_ASSERT( sharedGroupCount + offset < CU_MAX_BC_GROUP_BOUNDARIES );
+}
+
+//-----------------------------------------------------------
+cudaError CudaHarvestMatchK32(
+ Pair* devOutPairs,
+ uint32* devMatchCount,
+ const uint32 maxMatches,
+ const uint64* devYEntries,
+ const uint32 entryCount,
+ const uint32 matchOffset,
+ cudaStream_t stream )
+{
+ uint32 kthreads = 64;
+ uint32 kblocks = entryCount-1;
+
+ cudaError cErr = cudaMemsetAsync( devMatchCount, 0, sizeof( uint32 ), stream );
+ if( cErr != cudaSuccess )
+ return cErr;
+
+ HarvestMatchK32Kernel<<<kblocks, kthreads, 0, stream>>>(
+ devOutPairs, devMatchCount, devYEntries, entryCount, matchOffset );
+
+// #if _DEBUG
+// uint32 matchCount = 0;
+// CudaErrCheck( cudaMemcpyAsync( &matchCount, devMatchCount, sizeof( uint32 ) , cudaMemcpyDeviceToHost, stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+
+// Pair* matches = new Pair[matchCount];
+// CudaErrCheck( cudaMemcpyAsync( matches, devOutPairs, sizeof( Pair ) * matchCount , cudaMemcpyDeviceToHost, stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+// #endif
+
+ return cudaSuccess;
+}
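+
+// Usage sketch (hypothetical buffer names, not part of this change): the caller owns a
+// device Pair buffer with capacity >= maxMatches and a single device uint32 for the count:
+//
+//     CudaErrCheck( CudaHarvestMatchK32( devPairs, devMatchCount, maxMatches,
+//                                        devYEntries, entryCount, /*matchOffset*/ 0, stream ) );
+//     // Copy devMatchCount back to the host and sync the stream before consuming devPairs.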
+
+
+//-----------------------------------------------------------
+void CudaMatchBucketizedK32(
+ CudaK32PlotContext& cx,
+ const uint32* devY,
+ cudaStream_t stream,
+ cudaEvent_t event )
+{
+ const TableId inTable = cx.table - 1;
+ const uint32 entryCount = cx.bucketCounts[(int)inTable][cx.bucket];
+ const uint64 bucketMask = BBC_BUCKET_MASK( cx.bucket );
+
+ constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS );
+
+ uint32* tmpGroupCounts = (uint32*)cx.devMatches;
+
+ {
+ // Initialize the entries to the max value so that they are not included in the sort
+ CudaInitGroupsBucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( tmpGroupCounts );
+
+ // Add first group and last ghost group
+ CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
+ }
+
+ CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
+ CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
+ ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );
+
+ byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
+ size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 );
+
+#if _DEBUG
+ size_t sortSize = 0;
+ cub::DeviceRadixSort::SortKeys( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 );
+ ASSERT( sortSize <= sortTmpSize );
+#endif
+
+ cub::DeviceRadixSort::SortKeys( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream );
+
+ MatchCudaK32Bucket<<<BBCU_MAX_GROUP_COUNT, BBCU_THREADS_PER_MATCH_GROUP, 0, stream>>>( bucketMask, entryCount, cx.devGroupCount, devY, cx.devGroupBoundaries, cx.devMatchCount, cx.devMatches );
+}
+
+//-----------------------------------------------------------
+// cudaError CudaHarvestMatchK32WithGroupScan(
+// Pair* devOutPairs,
+// uint32* devMatchCount,
+// const uint32 maxMatches,
+// uint32* devGroupIndices,
+// uint32* devGroupIndicesTemp,
+// const uint32 maxGroups,
+// void* sortBuffer,
+// const size_t sortBufferSize,
+// const uint64* devYEntries,
+// const uint32 entryCount,
+// const uint32 matchOffset,
+// cudaStream_t stream )
+// {
+// // Scan for BC groups
+// {
+// const uint32 kblocks = 0;
+// const uint32 kthreads = 0;
+
+
+// // constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS );
+// // Initialize the entries to the max value so that they are not included in the sort
+// CudaInitGroups<<<kblocks, kthreads, 0, stream>>>( devGroupIndicesTemp, entryCount );
+// // CudaInitGroupsBucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( tmpGroupCounts );
+
+// // Add first group and last ghost group
+// CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
+// }
+
+// CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
+// CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
+// ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );
+
+// byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
+// size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 );
+
+// #if _DEBUG
+// size_t sortSize = 0;
+// cub::DeviceRadixSort::SortKeys( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 );
+// ASSERT( sortSize <= sortTmpSize );
+// #endif
+
+// cub::DeviceRadixSort::SortKeys( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream );
+
+// }
diff --git a/cuda/CudaMatch.h b/cuda/CudaMatch.h
new file mode 100644
index 00000000..f52e02fa
--- /dev/null
+++ b/cuda/CudaMatch.h
@@ -0,0 +1,30 @@
+#pragma once
+#include <cuda_runtime.h>
+
+/// Unbucketized CUDA-based matching function for k32 compressed plots.
+/// This method is meant to only be used with compressed plots.
+cudaError CudaHarvestMatchK32(
+ struct Pair* devOutPairs,
+ uint32* devMatchCount,
+ const uint32 maxMatches,
+ const uint64* devYEntries,
+ const uint32 entryCount,
+ const uint32 matchOffset,
+ cudaStream_t stream );
+
+/// Unbucketized CUDA-based matching function, specifically for k32.
+/// The matches are deterministic. That is, you will always get the
+/// same matches given the same input, though the order of the
+// /// stored matches is not deterministic.
+// cudaError CudaMatchK32(
+// struct Pair* devOutPairs,
+// uint32* devMatchCount,
+// uint32* devTempGroupIndices,
+// uint32* devGroupIndices,
+// uint32* devGroupCount,
+// uint32 maxGroups,
+// byte* devSortTempData,
+// const size_t sortTempDataSize,
+// const uint64* devYEntries,
+// const uint32 entryCount,
+// cudaStream_t stream );
diff --git a/cuda/CudaParkSerializer.cu b/cuda/CudaParkSerializer.cu
new file mode 100644
index 00000000..f3e8b8d4
--- /dev/null
+++ b/cuda/CudaParkSerializer.cu
@@ -0,0 +1,296 @@
+#include "CudaParkSerializer.h"
+#include "CudaFSE.cuh"
+
+
+//-----------------------------------------------------------
+void InitFSEBitMask( CudaK32PlotContext& cx )
+{
+ static bool _initialized = false;
+ if( _initialized )
+ return;
+
+ _initialized = true;
+
+ uint32 bitmask[] = {
+ 0, 1, 3, 7, 0xF, 0x1F,
+ 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+ 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
+ 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+ 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+ 0x3FFFFFFF, 0x7FFFFFFF
+ };
+
+ CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+}
+
+
+//-----------------------------------------------------------
+void CompressToParkInGPU( const uint32 parkCount, const size_t parkSize,
+ uint64* devLinePoints, byte* devParkBuffer, const size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* devCTable, uint32* devParkOverrunCount, cudaStream_t stream )
+{
+ const uint32 kThreadCount = 256;
+ const uint32 kBlocks = CDivT( parkCount, kThreadCount );
+ CudaCompressToPark<<<kBlocks, kThreadCount, 0, stream>>>( parkCount, parkSize, devLinePoints, devParkBuffer, parkBufferSize, stubBitSize, devCTable, devParkOverrunCount );
+}
+
+//-----------------------------------------------------------
+__global__ void CudaCompressToPark(
+ const uint32 parkCount, const size_t parkSize,
+ uint64* linePoints, byte* parkBuffer, const size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* cTable, uint32* gParkOverrunCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= parkCount )
+ return;
+
+ linePoints += kEntriesPerPark * (size_t)gid;
+ parkBuffer += parkBufferSize * (size_t)gid;
+
+ // __shared__ uint16 sharedCTable[34812/2];
+
+
+ CUDA_ASSERT( (uintptr_t)parkBuffer / sizeof( uint64 ) * sizeof( uint64 ) == (uintptr_t)parkBuffer ); // Must be 64-bit aligned
+ uint64* writer = (uint64*)parkBuffer;
+
+ // Write the first LinePoint as a full LinePoint
+ uint64 prevLinePoint = linePoints[0];
+
+ *writer++ = CuBSwap64( prevLinePoint );
+
+ // Grab the writing location after the stubs
+ const size_t stubSectionBytes = CuCDiv( (kEntriesPerPark - 1) * (size_t)stubBitSize, 8 );
+
+ byte* deltaBytesWriter = ((byte*)writer) + stubSectionBytes;
+
+ // Write stubs
+ {
+ const uint64 stubMask = ((1ULL << stubBitSize) - 1);
+
+ uint64 field = 0; // Current field to write
+ uint bits = 0; // Bits occupying the current field (always shifted to the leftmost bits)
+
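+        // Illustrative example, assuming a 29-bit stub size: if the current field already
+        // holds 40 bits, then freeBits = 24, so the stub's top 24 bits complete this field
+        // and its remaining 5 bits carry over as the leftmost bits of the next field.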
+ #pragma unroll
+ for( uint32 i = 1; i < kEntriesPerPark; i++ )
+ {
+ const uint64 lpDelta = linePoints[i];
+ const uint64 stub = lpDelta & stubMask;
+
+ // Serialize into bits, one uint64 field at a time
+ // Always store it all the way to the MSbits
+ const uint freeBits = 64 - bits;
+ if( freeBits <= stubBitSize )
+ {
+                // The stub bits that did not fit into the current field become the starting bits of the next field
+ bits = (uint32)stubBitSize - freeBits;
+
+ // Write what we can (which may be nothing) into the free bits of the current field
+ field |= stub >> bits;
+
+ // Store field
+ *writer++ = CuBSwap64( field );
+
+ const uint remainder = 64 - bits;
+ uint64 mask = ( ( 1ull << bits ) - 1 ) << (remainder & 63);
+ field = ( stub << remainder ) & mask;
+ }
+ else
+ {
+ // The stub completely fits into the current field with room to spare
+ field |= stub << (freeBits - stubBitSize);
+ bits += stubBitSize;
+ }
+ }
+
+ // Write any trailing fields
+ if( bits > 0 )
+ *writer++ = CuBSwap64( field );
+
+ // Zero-out any remaining unused bytes
+ // const size_t stubUsedBytes = CDiv( (kEntriesPerPark - 1) * (size_t)stubBitSize, 8 );
+ // const size_t remainderBytes = stubSectionBytes - stubUsedBytes;
+
+ // memset( deltaBytesWriter - remainderBytes, 0, remainderBytes );
+ }
+
+
+ // Convert to small deltas
+ byte* smallDeltas = (byte*)&linePoints[1];
+
+ #pragma unroll
+ for( uint32 i = 1; i < kEntriesPerPark; i++ )
+ {
+ const uint64 smallDelta = linePoints[i] >> stubBitSize;
+ CUDA_ASSERT( smallDelta < 256 );
+
+ smallDeltas[i-1] = (byte)smallDelta;
+ }
+
+ // Write small deltas
+ size_t parkSizeWritten = 0;
+ {
+ byte* deltaSizeWriter = (byte*)deltaBytesWriter;
+ deltaBytesWriter += 2;
+
+ // CUDA_ASSERT( smallDeltas[0] == 3 );
+ size_t deltasSize = CUDA_FSE_compress_usingCTable(
+ deltaBytesWriter, (kEntriesPerPark-1) * 8,
+ smallDeltas, kEntriesPerPark-1, cTable );
+
+ if( deltasSize == 0 )
+ {
+ // #TODO: Set error
+ CUDA_ASSERT( 0 );
+ }
+ else
+ {
+ // Deltas were compressed
+
+ //memcpy( deltaSizeWriter, &deltasSize, sizeof( uint16 ) );
+ // *deltaSizeWriter = (uint16)deltasSize;
+ deltaSizeWriter[0] = (byte)( deltasSize ); // Stored as LE
+ deltaSizeWriter[1] = (byte)( deltasSize >> 8 );
+ }
+
+ if( ((deltaBytesWriter + deltasSize) - parkBuffer) > parkSize )
+ {
+        atomicAdd( gParkOverrunCount, 1u );  // Record the overrun atomically; many parks are serialized concurrently
+ }
+// #if _DEBUG
+ // deltaBytesWriter += deltasSize;
+ // parkSizeWritten = deltaBytesWriter - parkBuffer;
+
+ // if( parkSizeWritten > parkSize )
+ // printf( "[CUDA KERN ERROR] Overran park buffer: %llu / %llu\n", parkSizeWritten, parkSize );
+ // CUDA_ASSERT( parkSizeWritten <= parkSize );
+// #endif
+
+ // Zero-out any remaining bytes in the deltas section
+ // const size_t parkSizeRemainder = parkSize - parkSizeWritten;
+
+ // memset( deltaBytesWriter, 0, parkSizeRemainder );
+ }
+
+ // return parkSizeWritten;
+}
+
+
+// #TODO: Check if deltafying in a different kernel would be good
+//-----------------------------------------------------------
+__global__ void CudaCompressC3Park( const uint32 parkCount, uint32* f7Entries, byte* parkBuffer, const size_t c3ParkSize, const FSE_CTable* cTable )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= parkCount )
+ return;
+
+ f7Entries += gid * kCheckpoint1Interval;
+ parkBuffer += gid * c3ParkSize;
+
+ byte* deltaWriter = (byte*)f7Entries;
+
+ // Convert to deltas
+
+ // f7Entries must always start at an interval of kCheckpoint1Interval
+ // Therefore its first entry is a C1 entry, and not written as a delta.
+ uint32 prevF7 = *f7Entries;
+
+ #pragma unroll
+ for( uint32 i = 1; i < kCheckpoint1Interval; i++ )
+ {
+ const uint32 f7 = f7Entries[i];
+ const uint32 delta = f7 - prevF7;
+ prevF7 = f7;
+
+ CUDA_ASSERT( delta < 255 );
+ *deltaWriter++ = (byte)delta;
+ }
+
+ CUDA_ASSERT( (uintptr_t)(deltaWriter - (byte*)f7Entries) == kCheckpoint1Interval-1 );
+
+ // Serialize them into the C3 park buffer
+ const size_t compressedSize = CUDA_FSE_compress_usingCTable(
+ parkBuffer+2, c3ParkSize, (byte*)f7Entries,
+ kCheckpoint1Interval-1, cTable );
+
+ CUDA_ASSERT( (compressedSize+2) < c3ParkSize );
+ CUDA_ASSERT( (compressedSize+2) < 3000 );
+
+ // Store size in the first 2 bytes
+ //memcpy( parkBuffer, &sizeu16, sizeof( uint16) );
+ parkBuffer[0] = (byte)( compressedSize >> 8 ); // Stored as BE
+ parkBuffer[1] = (byte)( compressedSize );
+}
+
+//-----------------------------------------------------------
+void CompressC3ParksInGPU( const uint32 parkCount, uint32* devF7, byte* devParkBuffer,
+ const size_t parkBufSize, const FSE_CTable* cTable, cudaStream_t stream )
+{
+ const uint32 kthreads = 128;
+ const uint32 kblocks = CDiv( parkCount, kthreads );
+
+    CudaCompressC3Park<<<kblocks, kthreads, 0, stream>>>( parkCount, devF7, devParkBuffer, parkBufSize, cTable );
+}
+
+
+//-----------------------------------------------------------
+__global__ void CudaWritePark7( const uint32 parkCount, const uint32* indices, uint64* fieldWriter, const size_t parkFieldCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= parkCount )
+ return;
+
+ indices += gid * kEntriesPerPark;
+ fieldWriter += gid * parkFieldCount;
+
+ const uint32 bitsPerEntry = BBCU_K + 1;
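+    // For k = 32 this is 33 bits per park 7 entry.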
+
+ uint64 field = 0;
+ uint32 bits = 0;
+
+ #pragma unroll
+ for( int32 i = 0; i < kEntriesPerPark; i++ )
+ {
+ const uint64 index = indices[i];
+ const uint32 freeBits = 64 - bits;
+
+ // Filled a field?
+ if( freeBits <= bitsPerEntry )
+ {
+ bits = bitsPerEntry - freeBits;
+ field |= index >> bits;
+
+ // Store field
+ *fieldWriter++ = CuBSwap64( field );
+
+ const uint remainder = 64 - bits;
+ uint64 mask = ( ( 1ull << bits ) - 1 ) << (remainder & 63);
+ field = ( index << remainder ) & mask;
+ }
+ else
+ {
+ // The entry completely fits into the current field with room to spare
+ field |= index << ( freeBits - bitsPerEntry );
+ bits += bitsPerEntry;
+ }
+ }
+
+ // Write any trailing fields
+ if( bits > 0 )
+ *fieldWriter = CuBSwap64( field );
+}
+
+//-----------------------------------------------------------
+void SerializePark7InGPU( const uint32 parkCount, const uint32* indices, uint64* fieldWriter, const size_t parkFieldCount, cudaStream_t stream )
+{
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDiv( parkCount, kthreads );
+
+    CudaWritePark7<<<kblocks, kthreads, 0, stream>>>( parkCount, indices, fieldWriter, parkFieldCount );
+}
diff --git a/cuda/CudaParkSerializer.h b/cuda/CudaParkSerializer.h
new file mode 100644
index 00000000..7b3171d2
--- /dev/null
+++ b/cuda/CudaParkSerializer.h
@@ -0,0 +1,20 @@
+#pragma once
+#include "CudaPlotContext.h"
+
+typedef unsigned FSE_CTable;
+
+void InitFSEBitMask( struct CudaK32PlotContext& cx );
+
+void CompressC3ParksInGPU( const uint32 parkCount, uint32* devF7, byte* devParkBuffer,
+ size_t parkBufSize, const FSE_CTable* cTable, cudaStream_t stream );
+
+void SerializePark7InGPU( const uint32 parkCount, const uint32* indices, uint64* fieldWriter,
+ const size_t parkFieldCount, cudaStream_t stream );
+
+void CompressToParkInGPU( const uint32 parkCount, const size_t parkSize,
+ uint64* devLinePoints, byte* devParkBuffer, size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* devCTable, uint32* devParkOverrunCount, cudaStream_t stream );
+
+__global__ void CudaCompressToPark( const uint32 parkCount, const size_t parkSize,
+ uint64* linePoints, byte* parkBuffer, size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* cTable, uint32* gParkOverrunCount );
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
new file mode 100644
index 00000000..80721e9f
--- /dev/null
+++ b/cuda/CudaPlotConfig.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#define BBCU_GPU_STREAM_COUNT 4
+#define BBCU_GPU_BUFFER_MAX_COUNT 4
+#define BBCU_DEFAULT_GPU_BUFFER_COUNT 2
+
+#define BBCU_K (32u)
+#define BBCU_BUCKET_COUNT (128u)
+#define BBC_Y_BITS (BBCU_K+kExtraBits)
+#define BBC_Y_BITS_T7 (BBCU_K)
+#define BBC_BUCKET_BITS (CuBBLog2( BBCU_BUCKET_COUNT ))
+#define BBC_BUCKET_SHIFT (BBC_Y_BITS-BBC_BUCKET_BITS)
+#define BBC_BUCKET_SHIFT_T7 (BBC_Y_BITS_T7-BBC_BUCKET_BITS)
+#define BBC_Y_MASK ((uint32)((1ull << BBC_Y_BITS) - 1))
+#define BBC_Y_MASK_T7 (0xFFFFFFFFu)
+#define BBC_BUCKET_MASK( bucket ) ( ((uint64)bucket) << BBC_BUCKET_SHIFT )
+
+
+#define BBCU_TABLE_ENTRY_COUNT (1ull<<32)
+#define BBCU_BUCKET_ENTRY_COUNT (BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT)
+//#define BBCU_XTRA_ENTRIES_PER_SLICE (1024u*64u)
+#define BBCU_XTRA_ENTRIES_PER_SLICE (4096u*1u)
+#define BBCU_MAX_SLICE_ENTRY_COUNT ((BBCU_BUCKET_ENTRY_COUNT/BBCU_BUCKET_COUNT)+BBCU_XTRA_ENTRIES_PER_SLICE)
+#define BBCU_BUCKET_ALLOC_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_BUCKET_COUNT)
+#define BBCU_TABLE_ALLOC_ENTRY_COUNT (((uint64)BBCU_BUCKET_ALLOC_ENTRY_COUNT)*BBCU_BUCKET_COUNT)
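+// With 128 buckets these work out to 2^32 / 128 = 33,554,432 entries per bucket,
+// 262,144 + 4,096 = 266,240 entries per slice, and 266,240 * 128 = 34,078,720 allocated
+// entries per bucket, slightly over-provisioned to absorb uneven slice distribution.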
+
+// The host always needs to start slices at the meta4 size, to avoid overwriting by subsequent tables
+#define BBCU_HOST_META_MULTIPLIER (4ull)
+#define BBCU_META_SLICE_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_HOST_META_MULTIPLIER)
+#define BBCU_META_BUCKET_ENTRY_COUNT (BBCU_BUCKET_ALLOC_ENTRY_COUNT*BBCU_HOST_META_MULTIPLIER)
+
+#define BBCU_SCAN_GROUP_THREAD_COUNT 128
+#define BBCU_SCAN_GROUP_ENTRIES_PER_THREAD 512
+
+static constexpr uint32 CU_MAX_BC_GROUP_BOUNDARIES = ( BBCU_BUCKET_ENTRY_COUNT / 210 ); // Should be enough for all threads
+
+
+static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLICE_ENTRY_COUNT );
+
+#if _DEBUG
+
+ #ifdef _WIN32
+ #define DBG_BBCU_DBG_DIR "D:/dbg/cuda/"
+ #else
+ // #define DBG_BBCU_DBG_DIR "/home/harold/plot/dbg/cuda/"
+ #define DBG_BBCU_DBG_DIR "/home/harito/plot/dbg/cuda/"
+ #endif
+ // #define DBG_BBCU_REF_DIR "/home/harold/plot/ref/"
+
+
+ // #define BBCU_DBG_SKIP_PHASE_1 1 // Skip phase 1 and load pairs from disk
+ // #define BBCU_DBG_SKIP_PHASE_2 1 // Skip phase 1 and 2 and load pairs and marks from disk
+
+ #if (defined( BBCU_DBG_SKIP_PHASE_2 ) && !defined( BBCU_DBG_SKIP_PHASE_1 ) )
+ #define BBCU_DBG_SKIP_PHASE_1 1
+ #endif
+
+ // #define DBG_BBCU_P1_WRITE_CONTEXT 1
+ // #define DBG_BBCU_P1_WRITE_PAIRS 1
+ // #define DBG_BBCU_P2_WRITE_MARKS 1
+
+ // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+
+
+ #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
+ #define ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) _ASSERT_DOES_NOT_OVERLAP( ((byte*)b0), ((byte*)b1), (size) )
+
+ #define _ASSERT_DOES_NOT_OVERLAP2( b0, b1, sz0, sz1 )ASSERT( (b1+sz1) <= b0 || b1 >= (b0+sz0) )
+ #define ASSERT_DOES_NOT_OVERLAP2( b0, b1, size0, size1 ) _ASSERT_DOES_NOT_OVERLAP2( ((byte*)b0), ((byte*)b1), (size0), (size1) )
+
+#else
+
+ #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size )
+ #define ASSERT_DOES_NOT_OVERLAP( b0, b1, size )
+ #define _ASSERT_DOES_NOT_OVERLAP2( b0, b1, sz0, sz1 )
+ #define ASSERT_DOES_NOT_OVERLAP2( b0, b1, size0, size1 )
+#endif
\ No newline at end of file
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
new file mode 100644
index 00000000..f4e8d909
--- /dev/null
+++ b/cuda/CudaPlotContext.h
@@ -0,0 +1,560 @@
+#pragma once
+
+#include "CudaPlotConfig.h"
+#include "CudaUtil.h"
+#include "ChiaConsts.h"
+#include "CudaPlotter.h"
+#include "plotting/PlotTypes.h"
+#include "plotting/PlotWriter.h"
+#include "GpuStreams.h"
+#include "util/StackAllocator.h"
+#include "fse/fse.h"
+#include "threading/Fence.h"
+#include "plotting/GlobalPlotConfig.h"
+#include "threading/ThreadPool.h"
+
+#include "cub/device/device_radix_sort.cuh"
+// #include
+
+// Fix for cooperative_groups.h on windows
+#ifdef __LITTLE_ENDIAN__
+ #undef __LITTLE_ENDIAN__
+ #define __LITTLE_ENDIAN__ 1
+#endif
+#include <cooperative_groups.h>
+using namespace cooperative_groups;
+
+#if _DEBUG
+ #include
+#endif
+
+
+
+
+struct CudaK32Phase2
+{
+ GpuUploadBuffer pairsLIn;
+ GpuUploadBuffer pairsRIn;
+ GpuDownloadBuffer outMarks;
+
+ uint64 pairsLoadOffset;
+ byte* devMarkingTable; // bytefield marking table
+ const uint64* devRMarks[6]; // Right table's marks as a bitfield
+ uint32* devPrunedCount;
+
+ StackAllocator* hostBitFieldAllocator; // Pinned bitfield buffers
+
+ TableId endTable;
+};
+
+struct CudaK32Phase3
+{
+ struct LMap
+ {
+ uint32 sourceIndex; // Initial unsorted (or y-sorted) index
+ uint32 sortedIndex; // Final LinePoint-sorted index
+ };
+ static_assert( sizeof( LMap ) == sizeof( uint64 ) );
+
+ struct RMap
+ {
+ uint32 src;
+ uint32 dstL;
+ uint32 dstR;
+ };
+
+ uint64 pairsLoadOffset;
+
+ uint32* devBucketCounts;
+ uint32* devPrunedEntryCount;
+
+
+ union {
+ RMap* hostRMap;
+ uint32* hostIndices;
+ };
+
+ union {
+ LMap* hostLMap;
+ uint64* hostLinePoints;
+ };
+
+ // #TODO: Remove this when we sort-out all of the buffer usage
+ // uint64* hostMarkingTables[6]; // Set by Phase 2
+
+
+ // uint32* hostBucketCounts;
+
+ uint32 prunedBucketCounts[7][BBCU_BUCKET_COUNT];
+ uint64 prunedTableEntryCounts[7];
+
+
+ // Inlined x table
+ struct {
+ const uint64* devRMarks; // R-Marking table
+ GpuUploadBuffer xIn; // 64-bit Pair
+ GpuDownloadBuffer lpOut; // Output line points (uint64)
+ GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
+
+ } xTable;
+
+ // Step 1
+ struct {
+ uint64* rTableMarks;
+ GpuUploadBuffer pairsLIn;
+ GpuUploadBuffer pairsRIn;
+ GpuDownloadBuffer rMapOut;
+
+ uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
+ } step1;
+
+ // Step 2
+ struct {
+ GpuUploadBuffer rMapIn; // RMap from step 1
+ GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or during L table 1, it is inlined x values
+ GpuDownloadBuffer lpOut; // Output line points (uint64)
+ GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
+ uint32* devLTable[2]; // Unpacked L table bucket
+
+ uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
+ } step2;
+
+ // Step 3
+ struct {
+ GpuUploadBuffer lpIn; // Line points from step 2
+ GpuUploadBuffer indexIn; // Indices from step 2
+ GpuDownloadBuffer mapOut; // lTable for next step 1
+ GpuDownloadBuffer parksOut; // Downloads park buffers to host
+
+ uint32* hostParkOverrunCount;
+
+ size_t sizeTmpSort;
+ byte* devSortTmpData;
+
+ uint64* devLinePoints;
+ uint64* devDeltaLinePoints;
+ uint32* devIndices;
+ FSE_CTable* devCTable;
+ uint32* devParkOverrunCount;
+
+ Fence* parkFence;
+        std::atomic<uint32> parkBucket;
+
+ uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
+
+ } step3;
+};
+
+struct CudaK32AllocContext
+{
+ size_t alignment;
+ bool dryRun;
+
+ IStackAllocator* pinnedAllocator;
+ IStackAllocator* devAllocator;
+ IStackAllocator* hostTableAllocator;
+ IStackAllocator* hostTempAllocator;
+};
+
+// struct CudaK32PlotRequest
+// {
+// const char* plotOutDir;
+// const char* plotFileName;
+
+// const byte* plotId;
+// const char* plotIdStr;
+
+// const byte* plotMemo;
+// uint16 plotMemoSize;
+
+// uint32 plotCount;
+// };
+
+struct CudaK32PlotContext
+{
+ CudaK32PlotConfig cfg = {};
+ const GlobalPlotConfig* gCfg = nullptr;
+
+ int32 cudaDevice = -1;
+ cudaDeviceProp* cudaDevProps = nullptr;
+ bool downloadDirect = false;
+ ThreadPool* threadPool = nullptr;
+
+ TableId table = TableId::Table1; // Current table being generated
+ uint32 bucket = 0; // Current bucket being processed
+
+ uint64 prevTablePairOffset = 0; // Offset at which to write the previous table's sorted pairs
+
+ uint32 bucketCounts[7][BBCU_BUCKET_COUNT] = {};
+ uint32 bucketSlices[2][BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT] = {};
+ uint64 tableEntryCounts[7] = {};
+
+ PlotRequest plotRequest;
+ PlotWriter* plotWriter = nullptr;
+ Fence* plotFence = nullptr;
+
+ // Root allocations
+ size_t allocAlignment = 0;
+ size_t pinnedAllocSize = 0;
+ size_t devAllocSize = 0;
+ size_t hostTableAllocSize = 0;
+ size_t hostTempAllocSize = 0;
+
+ void* pinnedBuffer = nullptr;
+ void* deviceBuffer = nullptr;
+ void* hostBufferTemp = nullptr;
+ void* hostBufferTables = nullptr;
+
+ // Device stuff
+ cudaStream_t computeStream = nullptr;
+ cudaStream_t computeStreamB = nullptr;
+ cudaStream_t computeStreamC = nullptr;
+ cudaStream_t computeStreamD = nullptr;
+ cudaEvent_t computeEventA = nullptr;
+ cudaEvent_t computeEventB = nullptr;
+ cudaEvent_t computeEventC = nullptr;
+ GpuQueue* gpuDownloadStream[BBCU_GPU_STREAM_COUNT] = {};
+ GpuQueue* gpuUploadStream [BBCU_GPU_STREAM_COUNT] = {};
+
+ GpuDownloadBuffer yOut;
+ GpuDownloadBuffer metaOut;
+ GpuUploadBuffer yIn;
+ GpuUploadBuffer metaIn;
+
+
+ GpuDownloadBuffer xPairsOut; // This shares the same backing buffer with pairsLOut & pairsROut
+ GpuDownloadBuffer pairsLOut;
+ GpuDownloadBuffer pairsROut;
+ GpuUploadBuffer xPairsIn; // This shares the same backing buffer with pairsLIn & pairsRIn
+ GpuUploadBuffer pairsLIn;
+ GpuUploadBuffer pairsRIn;
+ GpuDownloadBuffer sortedXPairsOut; // This shares the same backing buffer with sortedPairsLOut & sortedPairsROut
+ GpuDownloadBuffer sortedPairsLOut;
+ GpuDownloadBuffer sortedPairsROut;
+
+
+ size_t devSortTmpAllocSize = 0;
+ void* devSortTmp = nullptr;
+ uint32* devYWork = nullptr;
+ uint32* devMetaWork = nullptr;
+ uint32* devXInlineInput = nullptr;
+ Pair* devMatches = nullptr;
+ union {
+ Pair* devInlinedXs = nullptr;
+ uint32* devCompressedXs;
+ };
+ uint32* devBucketCounts = nullptr;
+ uint32* devSliceCounts = nullptr;
+ uint32* devSortKey = nullptr;
+ uint32* devChaChaInput = nullptr;
+
+ uint32* devGroupBoundaries = nullptr;
+
+ uint32* devMatchCount = nullptr;
+ uint32* devGroupCount = nullptr;
+
+
+ /// Host stuff
+
+ // Host "Temp 2"
+ uint32* hostY = nullptr;
+ uint32* hostMeta = nullptr;
+ uint32* hostBucketCounts = nullptr;
+ uint32* hostBucketSlices = nullptr;
+ uint32* hostTableL = nullptr;
+ uint16* hostTableR = nullptr;
+ uint32* hostTableSortedL = nullptr;
+ uint16* hostTableSortedR = nullptr;
+
+ union {
+ uint32* hostMatchCount = nullptr;
+ uint32* hostGroupCount;
+ };
+
+ // Host "Temp 1"
+ Pairs hostBackPointers [7] = {};
+ uint64* hostMarkingTables[6] = {};
+
+
+ CudaK32Phase2* phase2 = nullptr;
+ CudaK32Phase3* phase3 = nullptr;
+
+ struct
+ {
+ Duration uploadTime = Duration::zero(); // Host-to-device wait time
+ Duration downloadTime = Duration::zero(); // Device-to-host wait time
+ Duration matchTime = Duration::zero();
+ Duration sortTime = Duration::zero();
+ Duration fxTime = Duration::zero();
+
+ } timings;
+};
+
+#if _DEBUG
+ extern ThreadPool* _dbgThreadPool;
+
+ void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyToPinnedBuffer = false );
+ void DbgWritePairs( CudaK32PlotContext& cx, TableId table );
+ void DbgWriteContext( CudaK32PlotContext& cx );
+ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables = false );
+ void DbgLoadMarks( CudaK32PlotContext& cx );
+ ThreadPool& DbgGetThreadPool( CudaK32PlotContext& cx );
+#endif
+
+void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx );
+//void CudaK32PlotUploadBucket( CudaK32PlotContext& cx );
+
+
+void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_t stream = nullptr, bool synchronize = false );
+
+template<typename T>
+void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const T* devInput, T* devOutput, cudaStream_t stream = nullptr, bool synchronize = false );
+
+void CudaK32InlineXsIntoPairs(
+ const uint32 entryCount,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ const uint32* devXs,
+ cudaStream_t stream );
+
+void CudaK32ApplyPairOffset(
+ const uint32 entryCount,
+ const uint32 offset,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ cudaStream_t stream );
+
+///
+/// Phase 2
+///
+void CudaK32PlotPhase2( CudaK32PlotContext& cx );
+void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+///
+/// Phase 3
+///
+void CudaK32PlotPhase3( CudaK32PlotContext& cx );
+void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+///
+/// Debug
+///
+uint64 CudaPlotK32DbgXtoF1( CudaK32PlotContext& cx, const uint32 x );
+
+
+
+///
+/// Internal
+///
+//-----------------------------------------------------------
+inline uint32 CudaK32PlotGetInputIndex( CudaK32PlotContext& cx )
+{
+ return ((uint32)cx.table-1) & 1;
+}
+
+//-----------------------------------------------------------
+inline uint32 CudaK32PlotGetOutputIndex( CudaK32PlotContext& cx )
+{
+ return (uint32)cx.table & 1;
+}
+
+//-----------------------------------------------------------
+inline bool CudaK32PlotIsOutputInterleaved( CudaK32PlotContext& cx )
+{
+ return CudaK32PlotGetOutputIndex( cx ) == 0;
+}
+
+//-----------------------------------------------------------
+inline size_t GetMarkingTableBitFieldSize()
+{
+ return ((1ull << BBCU_K) / 64) * sizeof(uint64);
+}
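+// For k = 32 this is (2^32 / 64) * 8 bytes = 512 MiB per marking table.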
+
+#define CuCDiv( a, b ) (( (a) + (b) - 1 ) / (b))
+
+//-----------------------------------------------------------
+template<typename T>
+__host__ __device__ __forceinline__ constexpr T CuBBLog2( T x )
+{
+ T r = 0;
+ while( x >>= 1 )
+ r++;
+ return r;
+}
+
+
+
+// Calculates x * (x-1) / 2. Division is done before multiplication.
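+// e.g. x = 5: a = 5 is odd, so b = 4 is halved to 2 and a * b = 10 == 5 * 4 / 2.
+// Halving the even factor before multiplying keeps the intermediate from overflowing 64 bits.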
+//-----------------------------------------------------------
+__device__ __forceinline__ uint64 CudaGetXEnc64( uint64 x )
+{
+ uint64 a = x, b = x - 1;
+
+ if( (a & 1) == 0 )
+ a >>= 1;
+ else
+ b >>= 1;
+
+ return a * b;
+}
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint64 CudaSquareToLinePoint64( uint64 x, uint64 y )
+{
+ return CudaGetXEnc64( max( x, y ) ) + min( x, y );
+}
+
+//-----------------------------------------------------------
+template<typename T>
+__device__ inline void CuGetThreadOffsets( const uint32 id, const uint32 threadCount, const T totalCount, T& count, T& offset, T& end )
+{
+ const T countPerThread = totalCount / (T)threadCount;
+ const T remainder = totalCount - countPerThread * (T)threadCount;
+
+ count = countPerThread;
+ offset = (T)id * countPerThread;
+
+ if( id == threadCount - 1 )
+ count += remainder;
+
+ end = offset + count;
+}
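+// e.g. totalCount = 10, threadCount = 4: threads 0..2 get count = 2 at offsets 0, 2 and 4,
+// while the last thread absorbs the remainder and gets count = 4 (entries 6..9).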
+
+//-----------------------------------------------------------
+__host__ __device__ __forceinline__ bool CuBitFieldGet( const uint64* bitfield, uint64 index )
+{
+ const uint64 fieldIdx = index >> 6; // Divide by 64. Safe to do with power of 2. (shift right == log2(64))
+ const uint64 field = bitfield[fieldIdx];
+
+ const uint32 rShift = (uint32)(index - (fieldIdx << 6)); // Multiply by fieldIdx (shift left == log2(64))
+ return (bool)((field >> rShift) & 1u);
+}
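+// e.g. CuBitFieldGet( bitfield, 130 ) reads bit 2 of bitfield[2] (130 = 2 * 64 + 2).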
+
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint32 atomicAggrInc( uint32* dst )
+{
+ // Increment from coallesced group first
+ coalesced_group g = coalesced_threads();
+
+ uint32 prev;
+ if( g.thread_rank() == 0 )
+ prev = atomicAdd( dst, g.size() );
+
+ prev = g.thread_rank() + g.shfl( prev, 0 );
+ return prev;
+}
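+// Only the first active lane issues the global atomicAdd (of the whole group's size);
+// the base offset is then broadcast so every lane derives a unique, consecutive index.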
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint32 atomicGlobalOffset( uint32* globalCount )
+{
+ __shared__ uint32 sharedCount;
+
+ thread_block block = this_thread_block();
+
+ if( block.thread_rank() == 0 )
+ sharedCount = 0;
+
+ // Store block-wide offset
+ block.sync();
+ const uint32 blockOffset = atomicAggrInc( &sharedCount );
+ block.sync();
+
+ // Store global offset
+ if( block.thread_rank() == 0 )
+ sharedCount = atomicAdd( globalCount, sharedCount );
+
+ block.sync();
+
+ // Broadcast the shared count to each thread
+ const uint32 gOffset = sharedCount + blockOffset;
+ return gOffset;
+}
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint32 atomicAddShared( uint32* globalCount, const uint32 count )
+{
+ __shared__ uint32 sharedCount;
+
+ thread_block block = this_thread_block();
+
+ if( block.thread_rank() == 0 )
+ sharedCount = 0;
+
+ // Store shared offset
+ block.sync();
+ const uint32 offset = atomicAdd( &sharedCount, count );
+ block.sync();
+
+ // Store global offset
+ if( block.thread_rank() == 0 )
+ sharedCount = atomicAdd( globalCount, sharedCount );
+
+ block.sync();
+
+ return sharedCount + offset;
+}
+
+
+#if _DEBUG
+
+
+#include "b3/blake3.h"
+
+//-----------------------------------------------------------
+inline void DbgPrintHash( const char* msg, const void* ptr, const size_t size )
+{
+ byte hash[32];
+
+ blake3_hasher hasher;
+ blake3_hasher_init( &hasher );
+ blake3_hasher_update( &hasher, ptr, size );
+ blake3_hasher_finalize( &hasher, hash, sizeof( hash ) );
+
+ char hashstr[sizeof(hash)*2+1] = {};
+ size_t _;
+ BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ );
+
+ Log::Line( "%s 0x%s", msg, hashstr );
+}
+
+//-----------------------------------------------------------
+inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t size, cudaStream_t stream )
+{
+ byte hash[32];
+
+ void* hostBuffer = bbvirtallocboundednuma( size );
+ CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
+ CudaErrCheck( cudaStreamSynchronize( stream ) );
+
+ blake3_hasher hasher;
+ blake3_hasher_init( &hasher );
+ blake3_hasher_update( &hasher, hostBuffer, size );
+ blake3_hasher_finalize( &hasher, hash, sizeof( hash ) );
+
+ bbvirtfreebounded( hostBuffer );
+
+ char hashstr[sizeof( hash ) * 2 + 1] = {};
+ size_t _;
+ BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ );
+
+ Log::Line( "%s 0x%s", msg, hashstr );
+}
+
+//-----------------------------------------------------------
+template<typename T>
+inline void DbgPrintDeviceHashT( const char* msg, const T* ptr, const size_t count, cudaStream_t stream )
+{
+ return DbgPrintDeviceHash( msg, ptr, count * sizeof( T ), stream );
+}
+
+//-----------------------------------------------------------
+inline ThreadPool& DbgGetThreadPool( CudaK32PlotContext& cx )
+{
+ if( _dbgThreadPool == nullptr )
+ _dbgThreadPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
+
+ return *_dbgThreadPool;
+}
+
+#endif
\ No newline at end of file
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
new file mode 100644
index 00000000..93099d86
--- /dev/null
+++ b/cuda/CudaPlotPhase2.cu
@@ -0,0 +1,654 @@
+#include "CudaPlotContext.h"
+#include "util/StackAllocator.h"
+
+#if _DEBUG
+ #include "util/BitField.h"
+ #include "threading/MTJob.h"
+ #include "plotdisk/jobs/IOJob.h"
+
+ byte* _dbgRMarks = nullptr;
+
+ static void DbgValidateTable( CudaK32PlotContext& cx, const TableId table );
+ static void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table );
+ static void DebugPruneInCPU( CudaK32PlotContext& cx );
+
+ #ifndef DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+ #endif
+#endif
+
+static void CudaK32PlotAllocateBuffersTest( CudaK32PlotContext& cx );
+
+#define MARK_TABLE_BLOCK_THREADS 128
+#define P2_BUCKET_COUNT BBCU_BUCKET_COUNT
+#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull << BBCU_K) / P2_BUCKET_COUNT)
+
+template<bool useRMarks>
+__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, byte* marks, const uint64* rTableMarks, const uint32 rOffset )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Each thread handles 1 entry
+ if( gid >= entryCount )
+ return;
+
+ if constexpr ( useRMarks )
+ {
+ if( !CuBitFieldGet( rTableMarks, rOffset + gid ) )
+ return;
+ }
+
+ const uint32 l = lPairs[gid];
+ const uint32 r = l + rPairs[gid];
+
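+    // No atomics are needed here: concurrent threads only ever store the value 1 into the
+    // bytefield, so racing writes are benign. The bytefield is collapsed into a bitfield later.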
+ marks[l] = 1;
+ marks[r] = 1;
+}
+
+
+__global__ void CudaBytefieldToBitfield( const byte* bytefield, uint64* bitfield
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ , uint32* gPrunedCount
+#endif
+ )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ CUDA_ASSERT( gid < 67108864 );
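+    // 67,108,864 == (1ull << BBCU_K) / 64: one thread per 64-bit output field covers the full table.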
+
+ // if( gid >= fieldCount )
+ // return;
+
+ // Each thread reads a full 64-bit field, so 64 bytes
+ bytefield += gid * 64ull;
+
+ // Convert 64 bytes to a 64-bit field
+ uint64 bits = (uint64)bytefield[0];
+
+ #pragma unroll
+ for( int32 i = 1; i < 64; i++ )
+ bits |= (((uint64)bytefield[i]) << i);
+
+ CUDA_ASSERT( (uintptr_t)bitfield / 8 * 8 == (uintptr_t)bitfield );
+ bitfield[gid] = bits;
+
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+
+ uint32 markCount = 0;
+
+ #pragma unroll
+ for( uint32 i = 0; i < 64; i++ )
+ {
+ // if( (bits & (1ull << i)) != 0 )
+ // markCount++;
+ if( bytefield[i] == 1 )
+ markCount++;
+ }
+
+ __shared__ uint32 sharedMarkCount;
+ thread_block block = this_thread_block();
+
+ // #TODO: Use warp-aware reduction via CUB
+ block.sync();
+ if( block.thread_rank() == 0 )
+ sharedMarkCount = 0;
+ block.sync();
+
+ atomicAdd( &sharedMarkCount, markCount );
+ block.sync();
+
+ if( block.thread_rank() == 0 )
+ atomicAdd( gPrunedCount, sharedMarkCount );
+#endif
+}
+
+static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield, uint64* bitfield, cudaStream_t stream )
+{
+ const uint64 tableEntryCount = 1ull << BBCU_K;
+ const uint32 fieldCount = (uint32)( tableEntryCount / 64 );
+
+ const uint32 blockThreadCount = 256;
+ const uint32 blockCount = CDivT( fieldCount, blockThreadCount );
+
+ ASSERT( (uint64)blockCount * blockThreadCount * 64 == tableEntryCount );
+
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ #define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
+ CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
+#else
+ #define G_PRUNED_COUNTS
+#endif
+
+ ASSERT_DOES_NOT_OVERLAP2( bitfield, bytefield, GetMarkingTableBitFieldSize(), GetMarkingTableByteSize() );
+
+    CudaBytefieldToBitfield<<<blockCount, blockThreadCount, 0, stream>>>( bytefield, bitfield G_PRUNED_COUNTS );
+}
+
+void LoadPairs( CudaK32PlotContext& cx, CudaK32Phase2& p2, const TableId rTable, const uint32 bucket )
+{
+ const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;//(uint32)std::min( (uint64)BBCU_BUCKET_ENTRY_COUNT, tableEntryCount - p2.pairsLoadOffset );// cx.bucketCounts[(int)rTable][bucket];
+
+ // uint32* hostPairsL = cx.hostTableSortedL + p2.pairsLoadOffset;
+ // uint16* hostPairsR = cx.hostTableSortedR + p2.pairsLoadOffset;
+ uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p2.pairsLoadOffset;
+ uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p2.pairsLoadOffset;
+ // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable-1].left + p2.pairsLoadOffset;
+ // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable-1].right + p2.pairsLoadOffset;
+
+ // if( rTable > p2.endTable )
+ {
+ // Copy the next table to our pinned host pairs
+ // p2.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount );
+ // p2.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount );
+ }
+ // else
+ // {
+ p2.pairsLIn.UploadT( hostPairsL, entryCount );
+ p2.pairsRIn.UploadT( hostPairsR, entryCount );
+ // }
+
+ p2.pairsLoadOffset += entryCount;
+}
+
+void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
+{
+ const TableId lTable = cx.table;
+ const TableId rTable = lTable + 1;
+
+ byte* devLMarks = p2.devMarkingTable;
+
+ // Zero-out marks
+ CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
+
+ // Load first bucket's worth of pairs
+ LoadPairs( cx, p2, rTable, 0 );
+
+ uint32 rOffset = 0;
+ for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ {
+ const bool isLastBucket = bucket + 1 == P2_BUCKET_COUNT;
+
+ // Load next set of pairs in the background
+ if( !isLastBucket )
+ LoadPairs( cx, p2, rTable, bucket + 1 );
+
+ const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint32 entryCount = isLastBucket ? tableEntryCount - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)): BBCU_BUCKET_ENTRY_COUNT;
+ // const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+
+ // Wait for pairs to be ready
+ const uint32* devLPairs = p2.pairsLIn.GetUploadedDeviceBufferT( cx.computeStream );
+ const uint16* devRPairs = p2.pairsRIn.GetUploadedDeviceBufferT( cx.computeStream );
+
+
+ // Mark
+ const uint32 blockCount = (uint32)CDiv( entryCount, MARK_TABLE_BLOCK_THREADS );
+
+ if( rTable == TableId::Table7 )
+            CudaMarkTables<false><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, nullptr, 0 );
+        else
+            CudaMarkTables<true><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rOffset );
+
+ p2.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
+ p2.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ rOffset += entryCount;
+ }
+
+ // Convert the bytefield marking table to a bitfield
+ uint64* bitfield = (uint64*)p2.outMarks.LockDeviceBuffer( cx.computeStream );
+
+ BytefieldToBitfield( cx, devLMarks, bitfield, cx.computeStream );
+
+ // Download bitfield marks
+ // uint64* hostBitField = p2.hostBitFieldAllocator->AllocT( GetMarkingTableBitFieldSize() );
+ uint64* hostBitField = cx.hostMarkingTables[(int)lTable];
+
+ // #TODO: Do download and copy again, for now just store all of them in this pinned buffer
+ // cx.phase3->hostMarkingTables[(int)lTable] = hostBitField;
+ p2.outMarks.Download( hostBitField, GetMarkingTableBitFieldSize(), cx.computeStream );
+
+ // p2.outMarks.DownloadAndCopy( hostBitField, cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize(), cx.computeStream );
+ // p2.outMarks.Download( cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize() );
+
+
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ {
+ uint32 prunedEntryCount = 0;
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
+ Log::Line( "Table %u now has %u / %llu ( %.2lf%% ) entries.", (uint)lTable+1,
+ prunedEntryCount, lEntryCount, ((double)prunedEntryCount / lEntryCount ) * 100.0 );
+ }
+
+ // Check on CPU
+ if( 0 )
+ {
+ #if _DEBUG
+ p2.outMarks.WaitForCompletion();
+
+ // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) );
+ // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[1]->GetStream() ) );
+ // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[2]->GetStream() ) );
+
+ // byte* hByteField = bbcvirtalloc( GetMarkingTableByteSize() );
+ // uint64* hBitField = bbcvirtalloc( GetMarkingTableBitFieldSize() );
+ // uint64* rBitField = bbcvirtalloc( GetMarkingTableBitFieldSize() );
+ // CudaErrCheck( cudaMemcpyAsync( hByteField, devLMarks, GetMarkingTableByteSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
+ // CudaErrCheck( cudaMemcpyAsync( hBitField, bitfield, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
+
+ // if( rTable < TableId::Table7 )
+ // CudaErrCheck( cudaMemcpyAsync( rBitField, p2.devRMarks, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
+
+ // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ // // (void)p2.outMarks.GetDeviceBuffer();
+ uint64* hBitField = cx.hostMarkingTables[(int)lTable];
+
+        std::atomic<uint64> bitfieldPrunedEntryCount = 0;
+ // std::atomic totalPrunedEntryCount = 0;
+ // std::atomic rTablePrunedEntryCount = 0;
+
+ AnonMTJob::Run( DbgGetThreadPool( cx ), [&]( AnonMTJob* self ){
+
+ const TableId rt = lTable + 1;
+ const uint64 rEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
+
+ uint64 localPrunedEntryCount = 0;
+ uint64 rPrunedEntryCount = 0;
+
+ // BitField rMarks( rBitField, rEntryCount );
+ // const byte* bytefield = hByteField;
+
+ uint64 count, offset, end;
+
+ // // Count r entries again to make sure it's still valid
+ // if( rt < TableId::Table7 )
+ // {
+ // GetThreadOffsets( self, rEntryCount, count, offset, end );
+ // for( uint64 i = offset; i < end; i++ )
+ // {
+ // if( rMarks.Get( i ) )
+ // rPrunedEntryCount ++;
+ // }
+
+ // rTablePrunedEntryCount += rPrunedEntryCount;
+ // }
+
+ GetThreadOffsets( self, lEntryCount, count, offset, end );
+ // for( uint64 i = offset; i < end; i++ )
+ // {
+ // if( bytefield[i] == 1 )
+ // localPrunedEntryCount++;
+ // }
+ // totalPrunedEntryCount += localPrunedEntryCount;
+
+ BitField bits( hBitField, lEntryCount );
+ localPrunedEntryCount = 0;
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( bits.Get( i ) )
+ localPrunedEntryCount++;
+ }
+ bitfieldPrunedEntryCount += localPrunedEntryCount;
+ });
+
+ uint64 prunedEntryCount;
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
+ // prunedEntryCount = totalPrunedEntryCount.load();
+ // Log::Line( "*** BYTEfield pruned entry count: %llu / %llu ( %.2lf %% )",
+ // prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+
+ prunedEntryCount = bitfieldPrunedEntryCount.load();
+ Log::Line( "*** Bitfield pruned entry count: %llu / %llu ( %.2lf %% )",
+ prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+
+ // if( rTable < TableId::Table7 )
+ // {
+ // prunedEntryCount = rTablePrunedEntryCount.load();
+ // const uint64 rEntryCount = cx.tableEntryCounts[(int)rTable];
+ // Log::Line( "*** R pruned entry count: %llu / %llu ( %.2lf %% )",
+ // prunedEntryCount, rEntryCount, prunedEntryCount / (double)rEntryCount * 100.0 );
+
+ // }
+
+ // // Full CPU method
+
+ // bbvirtfree( hByteField );
+ // bbvirtfree( hBitField );
+ // bbvirtfree( rBitField );
+ #endif
+ }
+#endif
+
+ // Set right table marks for the next step
+ p2.devRMarks[(int)lTable] = bitfield;
+}
+
+void CudaK32PlotPhase2( CudaK32PlotContext& cx )
+{
+ CudaK32Phase2& p2 = *cx.phase2;
+ // p2.hostBitFieldAllocator->PopToMarker( 0 );
+
+ const uint32 compressionLevel = cx.gCfg->compressionLevel;
+
+ const TableId startRTable = TableId::Table7;
+ const TableId endRTable = TableId::Table3 + (TableId)cx.gCfg->numDroppedTables;
+
+ p2.endTable = endRTable;
+
+// #if _DEBUG
+// DebugPruneInCPU( cx );
+// #endif
+
+#if BBCU_DBG_SKIP_PHASE_1
+ DbgLoadTablePairs( cx, TableId::Table7, true );
+#endif
+ // CudaK32PlotAllocateBuffersTest( cx );
+
+ for( TableId rTable = startRTable; rTable >= endRTable; rTable-- )
+ {
+ #if BBCU_DBG_SKIP_PHASE_1
+ DbgLoadTablePairs( cx, rTable-1, false );
+ // DbgValidateTable( cx, rTable );
+ #endif
+ const auto timer = TimerBegin();
+
+ cx.table = rTable-1;
+ p2.pairsLoadOffset = 0;
+
+ MarkTable( cx, p2 );
+ p2.outMarks.WaitForCompletion();
+ p2.outMarks.Reset();
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Marked Table %u in %.2lf seconds.", rTable, elapsed );
+
+ #if _DEBUG && DBG_BBCU_P2_WRITE_MARKS
+ p2.outMarks.WaitForCompletion();
+ DbgWriteMarks( cx, rTable-1 );
+ #endif
+ }
+
+ // Wait for everything to complete
+
+ // p2.outMarks.WaitForCopyCompletion(); // #TODO: Re-activate this when re-enabling copy
+ p2.outMarks.WaitForCompletion();
+ p2.outMarks.Reset();
+}
+
+
+///
+/// Allocation
+///
+void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ const size_t alignment = cx.allocAlignment;
+
+ IAllocator& devAllocator = *acx.devAllocator;
+ IAllocator& pinnedAllocator = *acx.pinnedAllocator;
+
+ CudaK32Phase2& p2 = *cx.phase2;
+
+ const size_t markingTableByteSize = GetMarkingTableByteSize();
+ const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
+
+    p2.devPrunedCount  = devAllocator.CAlloc<uint32>( 1, alignment );
+    p2.devMarkingTable = devAllocator.AllocT<byte>( markingTableByteSize, alignment );
+
+ p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint32 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+
+ p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint16 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+
+ p2.outMarks = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ markingTableBitFieldSize, devAllocator, alignment, acx.dryRun );
+
+ // These buffers are safe to use at this point
+ // p2.hostBitFieldAllocator = new StackAllocator( cx.hostTableR, sizeof( uint32 ) * BBCU_TABLE_ALLOC_ENTRY_COUNT );
+}
+
+
+#if _DEBUG
+
+void DebugPruneInCPU( CudaK32PlotContext& cx )
+{
+ ThreadPool& pool = DbgGetThreadPool( cx );
+ byte* bytefields[2] = {
+        bbvirtalloc<byte>( GetMarkingTableByteSize() ),
+        bbvirtalloc<byte>( GetMarkingTableByteSize() )
+ };
+
+
+ // uint64* bitfield = bbvirtalloc( GetMarkingTableBitFieldSize() );
+ // BitField marks( bitfield, 1ull << BBCU_K );
+ // memset( bitfield, 0, GetMarkingTableBitFieldSize() );
+
+ // uint64 prunedEntryCount = 0;
+ // const uint64 entryCount = cx.tableEntryCounts[6];
+
+
+ // for( uint64 i = 0; i < entryCount; i++ )
+ // {
+ // const uint32 l = rTable.left[i];
+ // const uint32 r = l + rTable.right[i];
+
+ // marks.Set( l );
+ // marks.Set( r );
+ // }
+
+ // for( uint64 i = 0; i < 1ull << BBCU_K; i++ )
+ // {
+ // if( marks.Get( i ) )
+ // prunedEntryCount++;
+ // }
+ // const TableId rTableId = TableId::Table7;
+
+ for( TableId rTableId = TableId::Table7; rTableId >= cx.phase2->endTable; rTableId-- )
+ {
+ const TableId lTableId = rTableId - 1;
+
+ const byte* rTableByteField = bytefields[(int)lTableId % 2];
+ byte* bytefield = bytefields[(int)rTableId % 2];
+
+ memset( bytefield, 0, GetMarkingTableByteSize() );
+
+ // DbgLoadTablePairs( cx, rTableId );
+ // Pairs rTable = cx.hostBackPointers[(int)rTableId];
+
+        std::atomic<uint64> totalPrunedEntryCount = 0;
+
+ AnonMTJob::Run( pool, [&]( AnonMTJob* self ) {
+
+ const uint64 rEntryCount = cx.tableEntryCounts[(int)rTableId];
+ {
+ uint64 count, offset, end;
+ GetThreadOffsets( self, rEntryCount, count, offset, end );
+
+ const TableId rId = rTableId;
+ Pairs rTable = cx.hostBackPointers[(int)rTableId];
+
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( rId < TableId::Table7 && rTableByteField[i] == 0 )
+ continue;
+
+ const uint32 l = rTable.left[i];
+ const uint32 r = l + rTable.right[i];
+
+ bytefield[l] = 1;
+ bytefield[r] = 1;
+ }
+
+ self->SyncThreads();
+
+ uint64 localPrunedEntryCount = 0;
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTableId];
+ GetThreadOffsets( self, lEntryCount, count, offset, end );
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( bytefield[i] == 1 )
+ localPrunedEntryCount++;
+ }
+
+ totalPrunedEntryCount += localPrunedEntryCount;
+ }
+ });
+
+ const uint64 prunedEntryCount = totalPrunedEntryCount.load();
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTableId];
+ Log::Line( "Table %u Pruned entry count: %llu / %llu ( %.2lf %% )", (uint)rTableId,
+ prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+ }
+}
+
+void DbgValidateTable( CudaK32PlotContext& cx )
+{
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+    byte* bytefieldL = bbvirtalloc<byte>( GetMarkingTableByteSize() );
+    byte* bytefieldR = bbvirtalloc<byte>( GetMarkingTableByteSize() );
+ memset( bytefieldL, 0, GetMarkingTableByteSize() );
+ memset( bytefieldR, 0, GetMarkingTableByteSize() );
+
+ // uint64* bitfield = bbvirtalloc( GetMarkingTableBitFieldSize() );
+ // BitField marks( bitfield, 1ull << BBCU_K );
+ // memset( bitfield, 0, GetMarkingTableBitFieldSize() );
+
+ // uint64 prunedEntryCount = 0;
+ // const uint64 entryCount = cx.tableEntryCounts[6];
+ // Pairs rTable = cx.hostBackPointers[6];
+
+ // for( uint64 i = 0; i < entryCount; i++ )
+ // {
+ // const uint32 l = rTable.left[i];
+ // const uint32 r = l + rTable.right[i];
+
+ // marks.Set( l );
+ // marks.Set( r );
+ // }
+
+ // for( uint64 i = 0; i < 1ull << BBCU_K; i++ )
+ // {
+ // if( marks.Get( i ) )
+ // prunedEntryCount++;
+ // }
+ Log::Line( "[DEBUG] Validating table" );
+
+ // for( TableId rt = TableId::Table7; rt >= TableId::Table3; rt-- )
+ TableId rt = TableId::Table7;
+ {
+ {
+ uint64 totalCount = 0;
+ for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ totalCount += cx.bucketCounts[(int)rt][bucket];
+
+ ASSERT( totalCount == cx.tableEntryCounts[(int)rt] );
+ }
+
+        std::atomic<uint64> totalPrunedEntryCount = 0;
+
+ memset( bytefieldL, 0, GetMarkingTableByteSize() );
+
+ Pairs hostRTablePairs = cx.hostBackPointers[(int)rt];
+
+ for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ {
+ const uint32 rTableBucketEntryCount = cx.bucketCounts[(int)rt][bucket];
+
+ // Mark
+ AnonMTJob::Run( pool, [&]( AnonMTJob* self ){
+
+ // Pairs rTable = cx.hostBackPointers[(int)rt];
+ // const uint64 rEntryCount = cx.tableEntryCounts[(int)rt];
+ const uint64 rBucketEntryCount = rTableBucketEntryCount;
+
+ {
+ uint64 count, offset, end;
+ GetThreadOffsets( self, rBucketEntryCount, count, offset, end );
+
+ Pairs rTable = hostRTablePairs;
+
+ if( offset == 0 )
+ Log::Line( "[%-3u] %u, %u", bucket, rTable.left[offset], (uint32)rTable.right[offset] );
+
+ const bool readR = rt < TableId::Table7;
+
+ const byte* rBytes = bytefieldR;
+ byte* lBytes = bytefieldL;
+
+ for( uint64 i = offset; i < end; i++ )
+ {
+ // if( readR && rBytes[i] == 0 )
+ // continue;
+
+ const uint32 l = rTable.left[i];
+ const uint32 r = l + rTable.right[i];
+
+ lBytes[l] = 1;
+ lBytes[r] = 1;
+ }
+ }
+ });
+
+ hostRTablePairs.left += rTableBucketEntryCount;
+ hostRTablePairs.right += rTableBucketEntryCount;
+ }
+
+ // Count
+ AnonMTJob::Run( pool, [&]( AnonMTJob* self ){
+
+ uint64 localPrunedEntryCount = 0;
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)rt-1];
+ const byte * lBytes = bytefieldL;
+
+ uint64 count, offset, end;
+ GetThreadOffsets( self, lEntryCount, count, offset, end );
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( lBytes[i] == 1 )
+ localPrunedEntryCount++;
+ }
+
+ totalPrunedEntryCount += localPrunedEntryCount;
+ });
+
+ // if( _dbgRMarks == nullptr )
+ // _dbgRMarks = bb
+ std::swap( bytefieldL, bytefieldR );
+
+ const uint64 prunedEntryCount = totalPrunedEntryCount.load();
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)rt-1];
+ Log::Line( "Table %u pruned entry count: %llu / %llu ( %.2lf %% )", (uint)rt,
+ prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+ }
+}
+
+void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table )
+{
+ char path[512];
+
+ Log::Line( "[DEBUG] Writing marking table %u to disk...", table+1 );
+ {
+ sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
+
+ const uint64* marks = cx.hostMarkingTables[(int)table];
+
+ int err;
+ FatalIf( !IOJob::WriteToFile( path, marks, GetMarkingTableBitFieldSize(), err ),
+ "Failed to write marking table with error: %d", err );
+ }
+}
+
+#endif
+
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
new file mode 100644
index 00000000..b19d42c3
--- /dev/null
+++ b/cuda/CudaPlotPhase3.cu
@@ -0,0 +1,959 @@
+#include "CudaPlotPhase3Internal.h"
+#include "CudaParkSerializer.h"
+
+
+static void CompressInlinedTable( CudaK32PlotContext& cx );
+static void Step1( CudaK32PlotContext& cx );
+
+void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx );
+void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx );
+void WritePark7( CudaK32PlotContext& cx );
+
+
+static void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+
+
+#if _DEBUG
+ static void DbgValidateRMap( CudaK32PlotContext& cx );
+ static void DbgValidateIndices( CudaK32PlotContext& cx );
+ void DbgLoadLMap( CudaK32PlotContext& cx );
+ void DbgDumpSortedLinePoints( CudaK32PlotContext& cx );
+#endif
+
+
+//-----------------------------------------------------------
+__global__ void CudaConvertInlinedXsToLinePoints(
+ const uint64 entryCount, const uint32 rOffset, const uint32 bucketShift,
+ const Pair* inXs, const uint64* rMarks,
+ uint64* outLPs, uint32* outIndices, uint32* gBucketCounts )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+ const uint32 rIndex = rOffset + gid;
+
+ __shared__ uint32 sharedBuckets[BBCU_BUCKET_COUNT];
+
+ CUDA_ASSERT( gridDim.x >= BBCU_BUCKET_COUNT );
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = 0;
+
+ __syncthreads();
+
+ uint32 bucket;
+ uint32 offset;
+ uint64 lp;
+ uint32 count = 0;
+
+ const bool isPruned = gid >= entryCount || !CuBitFieldGet( rMarks, rIndex );
+ if( !isPruned )
+ {
+ const Pair p = inXs[gid];
+ CUDA_ASSERT( p.left || p.right );
+
+ lp = CudaSquareToLinePoint64( p.left, p.right );
+ bucket = (uint32)(lp >> bucketShift);
+ offset = atomicAdd( &sharedBuckets[bucket], 1 );
+
+ count = 1;
+ }
+ __syncthreads();
+
+ // Global offset
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = atomicAdd( &gBucketCounts[id], sharedBuckets[id] );
+ __syncthreads();
+
+ if( isPruned )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBuckets[bucket] + offset;
+
+ CUDA_ASSERT( lp );
+ // CUDA_ASSERT( outLPs[dst] == 0 );
+
+ outLPs [dst] = lp;
+ outIndices[dst] = rIndex;
+}
+
+
+//-----------------------------------------------------------
+__global__ void CudaTestPrune(
+ const uint64 entryCount, const uint32 rOffset, const uint64* rTableMarks, uint32* gPrunedEntryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ const uint32 count = ( gid >= entryCount || !CuBitFieldGet( rTableMarks, rOffset + gid ) ) ? 0 : 1;
+
+ atomicAddShared( gPrunedEntryCount, count );
+}
+
+//-----------------------------------------------------------
+__global__ void CudaConvertToLinePoints(
+ const uint64 entryCount, const uint32 rOffset, const uint32 lpBitSize,
+ const uint32* lTable, const uint32* lPairs, const uint16* rPairs,
+ const byte* marks, uint64* outLPs, uint32* gPrunedCount )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if( gid == 0 )
+        *gPrunedCount = 0;  // Reset the global pruned counter value (not the pointer)
+
+ // Filter-out entries that are not marked
+ // if( !CuBitFieldGet( rMarks, rIndex ) )
+ // {
+
+ // }
+
+ // Grab L table values
+ const uint32 l = lPairs[gid];
+ const uint32 r = l + rPairs[gid];
+
+ const uint32 x = lTable[l];
+ const uint32 y = lTable[r];
+
+ // Convert to line point
+ const uint64 lp = CudaSquareToLinePoint64( x, y );
+
+ const uint32 dst = atomicGlobalOffset( gPrunedCount );
+
+ outLPs[dst] = lp;
+}
+
+
+//-----------------------------------------------------------
+template<bool prune>
+__global__ void PruneAndWriteRMap(
+ const uint32 entryCount, const uint64 rOffset,
+ uint32* gBucketCounts, uint32* gPrunedEntryCount, RMap* gRMap,
+ const uint32* lPairs, const uint16* rPairs, const uint64* rMarks )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedBuckets[BBCU_BUCKET_COUNT];
+
+ CUDA_ASSERT( gridDim.x >= BBCU_BUCKET_COUNT );
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = 0;
+
+ __syncthreads();
+
+ // if( gid >= entryCount )
+ // return;
+
+ const uint64 rIndex = rOffset + gid;
+
+ bool isPruned = gid >= entryCount;
+
+ if constexpr ( prune )
+ isPruned = isPruned || !CuBitFieldGet( rMarks, rIndex );
+
+ RMap entry;
+ uint32 bucket, offset;
+
+ if( !isPruned )
+ {
+ entry.dstL = lPairs[gid];
+ entry.dstR = entry.dstL + rPairs[gid];
+        entry.src  = (uint32)rIndex; // Its original index
+
+ bucket = (uint32)(entry.dstL >> (BBCU_K - BBC_BUCKET_BITS));
+
+ // Block-level offset
+ offset = atomicAdd( &sharedBuckets[bucket], 1 );
+ }
+
+ // Global offset
+ __syncthreads();
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = atomicAdd( &gBucketCounts[id], sharedBuckets[id] );
+ __syncthreads();
+
+ if( isPruned )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBuckets[bucket] + offset;
+ gRMap[dst] = entry;
+}
+
+
+/**
+ * #TODO: Optimize Steps 1 & 2 w/ packing.
+ * Phase 3 works in 3 steps per table pair, plus a final park 7 serialization pass:
+ * Step 1:
+ * - Prune table R and for each pair write a mapping
+ * at the back pointer locations, which points to the index of the pair.
+ *
+ * Step 2:
+ * - Load the RMap
+ * - Load the LTable
+ * - Create line points given RMap with LTable values
+ *    - Write line points to their buckets along with the indices from the RMap
+ *
+ * Step 3:
+ * - Load line points and index
+ * - Sort line points w/ index
+ * - Compress line points to park
+ * - Write parks
+ *    - Write the index as a map; this will be the next iteration's L table
+*/
+//-----------------------------------------------------------
+void CudaK32PlotPhase3( CudaK32PlotContext& cx )
+{
+ // Set-up our context
+ memset( cx.phase3->prunedBucketCounts , 0, sizeof( cx.phase3->prunedBucketCounts ) );
+ memset( cx.phase3->prunedTableEntryCounts, 0, sizeof( cx.phase3->prunedTableEntryCounts ) );
+
+ InitFSEBitMask( cx );
+
+#if _DEBUG
+ //#define SKIP_TO_TABLE TableId::Table3
+#endif
+
+#if BBCU_DBG_SKIP_PHASE_2 && !defined( SKIP_TO_TABLE )
+ DbgLoadMarks( cx );
+
+ // if( cx.gCfg->compressionLevel > 0 )
+ {
+ DbgLoadTablePairs( cx, TableId::Table1 + (TableId)cx.gCfg->numDroppedTables + 2, false );
+ }
+#endif
+
+ // Ensure the host buffers are not being used by the plot writer anymore
+ #if !BBCU_DBG_SKIP_PHASE_1
+ {
+ Duration waitTime = Duration::zero();
+ cx.plotFence->Wait( waitTime );
+ cx.plotFence->Reset();
+
+ if( TicksToSeconds( waitTime ) > 0.001 )
+ Log::Line( "Waited %.2lf seconds for C tables to finish writing.", TicksToSeconds( waitTime ) );
+ }
+ #endif
+
+ const uint32 compressionLevel = cx.gCfg->compressionLevel;
+
+ // Special case with the starting table, since it has the values inlined already
+ cx.table = TableId::Table2 + cx.gCfg->numDroppedTables;
+
+ // if( compressionLevel == 0 )
+ {
+ Log::Line( "Compressing Table %u and %u...", cx.table, cx.table+1 );
+
+ auto tableTimer = TimerBegin();
+
+ auto timer = tableTimer;
+ CompressInlinedTable( cx );
+ auto elapsed = TimerEnd( timer );
+        Log::Line( "  Step 1 completed in %.2lf seconds.", elapsed );
+
+ timer = TimerBegin();
+ CudaK32PlotPhase3Step3( cx );
+
+ auto tableElapsed = TimerEnd( tableTimer );
+ elapsed = TimerEnd( timer );
+        Log::Line( "  Step 2 completed in %.2lf seconds.", elapsed );
+
+
+ const uint64 baseEntryCount = cx.tableEntryCounts[(int)cx.table];
+ const uint64 prunedEntryCount = cx.phase3->prunedTableEntryCounts[(int)cx.table];
+ Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).",
+ cx.table, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 );
+ }
+ // else if( compressionLevel > 0 )
+ // {
+ // const TableId startLTable = TableId::Table1 + (TableId)cx.gCfg->numDroppedTables;
+ // cx.phase3->prunedTableEntryCounts[(int)startLTable] = cx.tableEntryCounts[(int)startLTable];
+ // if( cx.gCfg->numDroppedTables > 1 )
+ // cx.table = TableId::Table3;
+ // }
+
+#ifdef SKIP_TO_TABLE
+ cx.table = SKIP_TO_TABLE;
+ DbgLoadLMap( cx );
+#endif
+
+ auto& p3 = *cx.phase3;
+ const TableId startRTable = cx.table + 1;
+
+ for( TableId rTable = startRTable; rTable <= TableId::Table7; rTable++ )
+ {
+ Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 );
+
+ cx.table = rTable;
+
+ #if BBCU_DBG_SKIP_PHASE_2
+ if( rTable < TableId::Table7 )
+ DbgLoadTablePairs( cx, rTable+1, false );
+ #endif
+
+ auto tableTimer = TimerBegin();
+
+ // Step 1
+ auto timer = tableTimer;
+ Step1( cx );
+ double elapsed = TimerEnd( timer );
+        Log::Line( "  Step 1 completed in %.2lf seconds.", elapsed );
+
+ // Step 2
+ timer = TimerBegin();
+ CudaK32PlotPhase3Step2( cx );
+ elapsed = TimerEnd( timer );
+        Log::Line( "  Step 2 completed in %.2lf seconds.", elapsed );
+
+ // Step 3
+ timer = TimerBegin();
+ CudaK32PlotPhase3Step3( cx );
+ elapsed = TimerEnd( timer );
+        Log::Line( "  Step 3 completed in %.2lf seconds.", elapsed );
+
+ auto tableElapsed = TimerEnd( tableTimer );
+
+ const uint64 baseEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable];
+ Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).",
+ rTable, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 );
+ }
+
+ // Park 7
+ {
+ Log::Line( "Serializing P7 entries" );
+
+ const auto timer = TimerBegin();
+ WritePark7( cx );
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Completed serializing P7 entries in %.2lf seconds.", elapsed );
+ }
+}
+
+//-----------------------------------------------------------
+void Step1( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void
+ {
+ const TableId rTable = cx.table;
+ auto& p3 = *cx.phase3;
+ auto& s1 = p3.step1;
+
+ const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;
+
+ // uint32* hostPairsL = cx.hostTableSortedL + p3.pairsLoadOffset;
+ // uint16* hostPairsR = cx.hostTableSortedR + p3.pairsLoadOffset;
+ uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p3.pairsLoadOffset;
+ uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p3.pairsLoadOffset;
+
+ // if( rTable < TableId::Table7 )
+ // {
+ // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable + 1].left + p3.pairsLoadOffset;
+ // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable + 1].right + p3.pairsLoadOffset;
+
+ // s1.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount );
+ // s1.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount );
+ // }
+ // else
+ {
+ s1.pairsLIn.UploadT( hostPairsL, entryCount );
+ s1.pairsRIn.UploadT( hostPairsR, entryCount );
+ }
+
+ p3.pairsLoadOffset += entryCount;
+ };
+
+ auto& p2 = *cx.phase2;
+ auto& p3 = *cx.phase3;
+ auto& s1 = p3.step1;
+
+ const TableId rTable = cx.table;
+
+ // Clear pruned table count
+ CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
+
+ // Load marking table (must be loaded before first bucket, on the same stream)
+ if( cx.table < TableId::Table7 )
+ {
+ CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
+ GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) );
+ }
+
+ // Load initial bucket
+ p3.pairsLoadOffset = 0;
+ LoadBucket( cx, 0 );
+
+
+ ///
+ /// Process buckets
+ ///
+ const uint32 threadPerBlock = 256;
+ const uint32 blocksPerGrid = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)threadPerBlock );
+
+ uint64 rTableOffset = 0;
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket + 1 );
+
+ // Wait for R table pairs to be ready
+ const uint32* devLPairs = (uint32*)s1.pairsLIn.GetUploadedDeviceBuffer( cx.computeStream );
+ const uint16* devRPairs = (uint16*)s1.pairsRIn.GetUploadedDeviceBuffer( cx.computeStream );
+
+ const uint32 entryCount = bucket == BBCU_BUCKET_COUNT-1 ?
+ ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket
+ BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth.
+
+ auto* devRMap = (RMap*)s1.rMapOut.LockDeviceBuffer( cx.computeStream );
+
+ uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;
+
+ // Generate map
+ #define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks
+
+ CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ if( cx.table < TableId::Table7 )
+ PruneAndWriteRMap<true><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
+ else
+ PruneAndWriteRMap<false><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
+
+ #undef KERN_RMAP_ARGS
+ s1.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
+ s1.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );
+ rTableOffset += entryCount;
+
+ // Download data (Vertical download (write 1 column))
+ s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
+ P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream );
+ }
+
+ // Download slice counts
+ cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();
+
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ // Wait for completion
+ s1.rMapOut.WaitForCompletion();
+ s1.rMapOut.Reset();
+
+ s1.pairsLIn.Reset();
+ s1.pairsRIn.Reset();
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+
+ // Add-up pruned bucket counts and tables counts
+ memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+ {
+ uint32* hostSliceCounts = cx.hostBucketSlices;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ p3.prunedBucketCounts[(int)rTable][bucket] += s1.prunedBucketSlices[slice][bucket];
+
+ // hostSliceCounts += BBCU_BUCKET_COUNT;
+ }
+
+ p3.prunedTableEntryCounts[(int)rTable] = 0;
+
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
+ }
+}
+
+//-----------------------------------------------------------
+// Table 2 (or 3, 4, etc., depending on compression level) already has
+// the x values inlined into the pairs, so we can skip step 1 and go
+// directly to converting to line points, then sorting them into their target buckets.
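+// (The inlined pairs are loaded as Pair entries holding the two x values, so only the
+// R marking table is needed to prune them before converting to line points.)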
+//-----------------------------------------------------------
+void CompressInlinedTable( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& tx = p3.xTable;
+
+ if( bucket == 0 )
+ p3.pairsLoadOffset = 0;
+
+ // Load inlined x's
+ const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+
+ const Pair* inlinedXs = ((Pair*)cx.hostBackPointers[(int)rTable].left) + p3.pairsLoadOffset;
+
+ tx.xIn.UploadT( inlinedXs, entryCount, cx.computeStream );
+
+ p3.pairsLoadOffset += entryCount;
+ };
+
+ const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
+ auto& p3 = *cx.phase3;
+ auto& tx = p3.xTable;
+ auto& s2 = p3.step2;
+
+ #if BBCU_DBG_SKIP_PHASE_2
+ DbgLoadTablePairs( cx, rTable );
+ #endif
+
+ // Load R Marking table (must be loaded before first bucket, on the same stream)
+ CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
+ GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) );
+
+ // Load initial bucket
+ LoadBucket( cx, 0 );
+
+ const bool isCompressed = cx.gCfg->compressionLevel > 0;
+ const uint32 compressedLPBits = isCompressed ? GetCompressedLPBitCount( cx.gCfg->compressionLevel ) : 0;
+
+ const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1;
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+
+ uint64 tablePrunedEntryCount = 0;
+ uint32 rTableOffset = 0;
+
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket + 1 );
+
+ // Wait for pairs to be ready
+ const Pair* devXs = (Pair*)tx.xIn.GetUploadedDeviceBuffer( cx.computeStream );
+
+ uint64* outLps = (uint64*)tx.lpOut .LockDeviceBuffer( cx.computeStream );
+ uint32* outIndices = (uint32*)tx.indexOut.LockDeviceBuffer( cx.computeStream );
+
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+
+ const uint32 threadPerBlock = 256;
+ const uint32 blocksPerGrid = CDiv( entryCount, (int)threadPerBlock );
+
+ uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;
+
+ #if _DEBUG
+ CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
+ #endif
+
+ CudaConvertInlinedXsToLinePoints<<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>(
+ entryCount, rTableOffset, lpBucketShift,
+ devXs, tx.devRMarks, outLps, outIndices, devSliceCounts );
+
+ tx.xIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ // Download output
+ // Horizontal download (write 1 row)
+ tx.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
+ tx.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX * 3, P3_PRUNED_SLICE_MAX, cx.computeStream );
+
+ rTableOffset += entryCount;
+ }
+
+ cudaStream_t downloadStream = tx.lpOut.GetQueue()->GetStream();
+
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ tx.lpOut .WaitForCompletion();
+ tx.indexOut.WaitForCompletion();
+ tx.lpOut .Reset();
+ tx.indexOut.Reset();
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+
+ #if _DEBUG
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ {
+ ASSERT( p3.prunedBucketCounts[(int)rTable][i] <= P3_PRUNED_BUCKET_MAX );
+ }
+ #endif
+
+ // Add-up pruned bucket counts and tables counts
+ {
+ bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ p3.prunedBucketCounts[(int)rTable][bucket] += s2.prunedBucketSlices[slice][bucket];
+ }
+
+ p3.prunedTableEntryCounts[(int)rTable] = 0;
+
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
+ }
+
+#if _DEBUG
+ // DbgValidateIndices( cx );
+ // DbgValidateStep2Output( cx );
+ // DbgDumpSortedLinePoints( cx );
+#endif
+}
+
+
+///
+/// Allocation
+///
+//-----------------------------------------------------------
+void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& p3 = *cx.phase3;
+
+ // Shared allocations
+ p3.devBucketCounts = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_COUNT, acx.alignment );
+ p3.devPrunedEntryCount = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
+
+ // Host allocations
+ p3.hostRMap = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index
+ p3.hostLinePoints = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs
+
+ if( !acx.dryRun )
+ {
+ ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
+ ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL );
+ }
+ // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
+
+ if( acx.dryRun )
+ {
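+ // Dry-run: measure each step's peak device/pinned usage with dummy allocators,
+ // then reserve the maximum once. The steps run one after another, so they can
+ // all share the same region (see the marker pops in the non-dry-run branch).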
+ CudaK32AllocContext dacx = acx;
+
+ DummyAllocator devAlloc = {};
+ DummyAllocator pinnedAlloc = {};
+
+ dacx.devAllocator = &devAlloc;
+ dacx.pinnedAllocator = &pinnedAlloc;
+
+ AllocXTableStep( cx, dacx );
+
+ size_t sharedDevSize = devAlloc.Size();
+ size_t sharedPinnedSize = pinnedAlloc.Size();
+
+ devAlloc = {};
+ pinnedAlloc = {};
+ CudaK32PlotAllocateBuffersStep1( cx, dacx );
+
+ sharedDevSize = std::max( sharedDevSize , devAlloc.Size() );
+ sharedPinnedSize = std::max( sharedPinnedSize, pinnedAlloc.Size() );
+ devAlloc = {};
+ pinnedAlloc = {};
+ CudaK32PlotAllocateBuffersStep2( cx, dacx );
+
+ sharedDevSize = std::max( sharedDevSize , devAlloc.Size() );
+ sharedPinnedSize = std::max( sharedPinnedSize, pinnedAlloc.Size() );
+ devAlloc = {};
+ pinnedAlloc = {};
+ CudaK32PlotAllocateBuffersStep3( cx, dacx );
+
+ sharedDevSize = std::max( sharedDevSize , devAlloc.Size() );
+ sharedPinnedSize = std::max( sharedPinnedSize, pinnedAlloc.Size() );
+
+ acx.devAllocator ->Alloc( sharedDevSize , acx.alignment );
+ acx.pinnedAllocator->Alloc( sharedPinnedSize, acx.alignment );
+ }
+ else
+ {
+ StackAllocator* devAllocator = (StackAllocator*)acx.devAllocator;
+ StackAllocator* pinnedAllocator = (StackAllocator*)acx.pinnedAllocator;
+
+ const size_t devMarker = devAllocator ->Size();
+ const size_t pinMarker = pinnedAllocator->Size();
+
+ AllocXTableStep( cx, acx );
+ devAllocator ->PopToMarker( devMarker );
+ pinnedAllocator->PopToMarker( pinMarker );
+
+ CudaK32PlotAllocateBuffersStep1( cx, acx );
+ devAllocator ->PopToMarker( devMarker );
+ pinnedAllocator->PopToMarker( pinMarker );
+
+ CudaK32PlotAllocateBuffersStep2( cx, acx );
+ devAllocator ->PopToMarker( devMarker );
+ pinnedAllocator->PopToMarker( pinMarker );
+
+ CudaK32PlotAllocateBuffersStep3( cx, acx );
+ }
+}
+
+//-----------------------------------------------------------
+void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& tx = cx.phase3->xTable;
+
+ tx.devRMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
+ tx.xIn = cx.gpuUploadStream[0]->CreateUploadBuffer(sizeof(Pair) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, acx.alignment, acx.dryRun);
+ tx.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
+ tx.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& s1 = cx.phase3->step1;
+ const size_t alignment = acx.alignment;
+
+ s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s1.rMapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s1.rTableMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& s2 = cx.phase3->step2;
+ const size_t alignment = acx.alignment;
+
+ s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s2.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s2.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+ s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& s3 = cx.phase3->step3;
+ const size_t alignment = acx.alignment;
+
+ s3.hostParkOverrunCount = acx.pinnedAllocator->CAlloc<uint32>( 1 );
+
+ const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET;
+
+ s3.lpIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s3.mapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBuffer(devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun);
+
+ if( acx.dryRun )
+ {
+ s3.sizeTmpSort = 0;
+ cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, s3.sizeTmpSort, nullptr, nullptr, nullptr, nullptr, BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ }
+
+ s3.devSortTmpData = acx.devAllocator->AllocT<byte>( s3.sizeTmpSort, alignment );
+
+
+ // Allocate 1 more park's worth of line points so we can have space to retain the line points
+ // that did not make it into a park for the next bucket.
+ const size_t linePointAllocCount = P3_PRUNED_MAX_PARKS_PER_BUCKET * (size_t)kEntriesPerPark;
+ static_assert( linePointAllocCount > BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+
+ s3.devLinePoints = acx.devAllocator->CAlloc<uint64>( linePointAllocCount, alignment );
+ s3.devDeltaLinePoints = acx.devAllocator->CAlloc<uint64>( linePointAllocCount, alignment );
+ s3.devIndices = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+
+ // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment );
+ // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment );
+
+ s3.devCTable = acx.devAllocator->AllocT<FSE_CTable>( P3_MAX_CTABLE_SIZE, alignment );
+ s3.devParkOverrunCount = acx.devAllocator->CAlloc<uint32>( 1 );
+}
+
+
+
+#if _DEBUG
+
+//-----------------------------------------------------------
+__global__ static void DbgCudaValidateRMap( const uint64 entryCount, const uint32 lTableOffset, const RMap* rmap )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= entryCount )
+ return;
+
+
+ const RMap map = rmap[gid];
+
+ const uint32 left = map.dstL - lTableOffset;
+ const uint32 right = map.dstR - lTableOffset;
+
+ // if( left >= BBCU_BUCKET_ALLOC_ENTRY_COUNT )
+ if( left >= right || left >= BBCU_BUCKET_ALLOC_ENTRY_COUNT || right >= BBCU_BUCKET_ALLOC_ENTRY_COUNT )
+ {
+ printf( "gid: %u | left: %u | right: %u | loffset: %u\n"
+ " dstL: %u | dstR: %u | src: %u\n",
+ gid, left, right, lTableOffset, map.dstL, map.dstR, map.src );
+ CUDA_ASSERT( false );
+ }
+
+ CUDA_ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( left < right );
+}
+
+//-----------------------------------------------------------
+void DbgValidateRMap( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Validating RMap..." );
+
+ auto& p3 = *cx.phase3;
+ auto& s1 = p3.step1;
+
+ {
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ RMap* rMap = bbcvirtallocbounded<RMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const RMap* reader = p3.hostRMap + bucket * P3_PRUNED_BUCKET_MAX;
+ RMap* writer = rMap;
+
+ uint32 entryCount = 0;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s1.prunedBucketSlices[slice][bucket];
+ bbmemcpy_t( writer, reader, copyCount );
+
+ writer += copyCount;
+ entryCount += copyCount;
+
+ reader += P3_PRUNED_SLICE_MAX;
+ }
+
+ // Validate bucket
+ const uint32 bucketOffset = bucket * BBCU_BUCKET_ENTRY_COUNT;
+ for( uint32 i = 0; i < entryCount; i++ )
+ {
+ const RMap map = rMap[i];
+ ASSERT( map.dstL || map.dstR );
+ ASSERT( map.dstR - map.dstL < 0x10000u );
+ ASSERT( map.dstL >> ( 32 - BBC_BUCKET_BITS ) == bucket );
+
+ const uint32 left = map.dstL - bucketOffset;
+ const uint32 right = map.dstR - bucketOffset;
+ ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( left < right );
+
+ }
+ }
+
+ bbvirtfreebounded( rMap );
+ Log::Line( "[DEBUG] CPU OK" );
+ }
+
+ // Validate in CUDA
+ {
+ uint64 pairsLoadOffset = 0;
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ uint64 entryCount = 0;
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s1.prunedBucketSlices[slice][bucket];
+ entryCount += copyCount;
+ }
+
+ const RMap* rmap = p3.hostRMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+ const uint32* rSliceCounts = &p3.step1.prunedBucketSlices[0][bucket];
+
+ p3.step2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
+
+ const uint32 rEntryCount = p3.prunedBucketCounts[(int)cx.table][bucket];
+ RMap* devRMap = p3.step2.rMapIn.GetUploadedDeviceBufferT<RMap>( cx.computeStream );
+
+ ASSERT( entryCount == rEntryCount );
+
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( rEntryCount, threads );
+
+ const uint32 lTableOffset = bucket * BBCU_BUCKET_ENTRY_COUNT;
+
+ DbgCudaValidateRMap<<<blocks, threads, 0, cx.computeStream>>>( rEntryCount, lTableOffset, devRMap );
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+
+ p3.step2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
+ }
+ Log::Line( "[DEBUG] CUDA OK" );
+
+ p3.step2.lMapIn.Reset();
+ }
+}
+
+//-----------------------------------------------------------
+void DbgValidateIndices( CudaK32PlotContext& cx )
+{
+ // Ensure all origin output indices are not repeated and well distributed
+ Log::Line( "[DEBUG] Validating indices..." );
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ uint32* indices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* idxTmp = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* idxWriter = indices;
+
+ const uint32* reader = p3.hostIndices;
+ const size_t readerStride = P3_PRUNED_SLICE_MAX * 3;
+
+ uint64 entryCount = 0;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s2.prunedBucketSlices[bucket][slice];
+
+ bbmemcpy_t( idxWriter, reader, copyCount );
+
+ idxWriter += copyCount;
+ entryCount += copyCount;
+ reader += readerStride;
+ }
+ }
+
+ ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
+
+ RadixSort256::Sort( pool, indices, idxTmp, entryCount );
+
+ // Indices must not repeat:
+ for( uint64 i = 1; i < entryCount; i++ )
+ {
+ ASSERT( indices[i] > indices[i-1] );
+ }
+
+ bbvirtfreebounded( indices );
+ bbvirtfreebounded( idxTmp );
+
+ Log::Line( "[DEBUG] OK" );
+}
+
+#endif
+
diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h
new file mode 100644
index 00000000..1a4bd7a8
--- /dev/null
+++ b/cuda/CudaPlotPhase3Internal.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "CudaPlotContext.h"
+#include "plotting/CTables.h"
+#include "ChiaConsts.h"
+
+#if _DEBUG
+ #include "util/BitField.h"
+ #include "plotmem/LPGen.h"
+ #include "plotdisk/jobs/IOJob.h"
+ #include "algorithm/RadixSort.h"
+ #include "plotmem/ParkWriter.h"
+
+ void DbgValidateStep2Output( CudaK32PlotContext& cx );
+#endif
+
+using LMap = CudaK32Phase3::LMap;
+using RMap = CudaK32Phase3::RMap;
+
+static_assert( alignof( LMap ) == sizeof( uint32 ) );
+
+// #TODO: Remove this. It is unneeded.
+#define P3_PRUNED_BUCKET_MULTIPLIER 0.98 // Enough to hold the largest pruned bucket size
+
+#define P3_PRUNED_SLICE_MAX BBCU_MAX_SLICE_ENTRY_COUNT //(CuCDiv( (size_t)((BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT/BBCU_BUCKET_COUNT)*P3_PRUNED_BUCKET_MULTIPLIER), 4096 ) * 4096 + 4096)
+#define P3_PRUNED_BUCKET_MAX BBCU_BUCKET_ALLOC_ENTRY_COUNT //(P3_PRUNED_SLICE_MAX*BBCU_BUCKET_COUNT)
+#define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT)
+#define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2)
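+// Note: the pruned slice/bucket/table sizes above currently alias the full (unpruned)
+// allocation sizes; the 0.98 multiplier is kept only for reference (see the #TODO above).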
+
+static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
+
+//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1;
+//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT;
+//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS;
+
+// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT ));
+//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
+ //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
+// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT;
+
+//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT;
+
+// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
+// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
+// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT;
+
+static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 );
+static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs
+
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
new file mode 100644
index 00000000..ac13e915
--- /dev/null
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -0,0 +1,693 @@
+#include "CudaPlotPhase3Internal.h"
+#include "CudaParkSerializer.h"
+#include "plotting/TableWriter.h"
+#include "algorithm/RadixSort.h"
+#include "plotdisk/jobs/IOJob.h"
+
+#define P3_CalculateMaxLPValue( x ) ((((uint64)(x))/2)*((uint64)(x))+x)
+#define P3_CalculateTableDivisor( p ) (P3_CalculateMaxLPValue( (uint64)(BBCU_TABLE_ENTRY_COUNT*(p)) ) / BBCU_BUCKET_COUNT)
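+// P3_CalculateMaxLPValue( x ) is roughly x*x/2 + x: an upper bound on the line point
+// produced from two entry indices below x. Dividing it by BBCU_BUCKET_COUNT yields a
+// divisor that spreads line points evenly across buckets (see BucketDivisor below).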
+
+__constant__ uint64 BucketDivisor;
+
+static void CudaK32PlotPhase3Step2Compressed( CudaK32PlotContext& cx );
+
+//-----------------------------------------------------------
+__global__ static void CudaUnpackLMap( const uint32 entryCount, const LMap* devLMap, uint32* devLTable
+#if _DEBUG
+ , const uint32 bucket
+#endif
+)
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ const uint32 bucketMask = (1u << (BBCU_K - BBC_BUCKET_BITS)) - 1;
+ const LMap map = devLMap[gid];
+
+ const uint32 dst = map.sourceIndex & bucketMask;
+
+ CUDA_ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
+
+ devLTable[dst] = map.sortedIndex;
+}
+
+//-----------------------------------------------------------
+static void UnpackLMap( CudaK32PlotContext& cx, const uint32 entryCount, const LMap* devLMap, uint32* devLTable,
+ const uint32 bucket, cudaStream_t stream )
+{
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( entryCount, threads );
+
+ CudaUnpackLMap<<<blocks, threads, 0, stream>>>( entryCount, devLMap, devLTable
+#if _DEBUG
+ , bucket
+#endif
+ );
+}
+
+
+//-----------------------------------------------------------
+template<bool isCompressed>
+__global__ static void CudaConvertRMapToLinePoints(
+ const uint64 entryCount, const uint32 rOffset, const uint32 lTableOffset,
+ const uint32* lTable, const RMap* rmap, uint64* outLPs, uint32* outIndices, uint32* gBucketCounts, const uint32 lpShift = 0 )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedBuckets[BBCU_BUCKET_COUNT];
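+ // Each block first tallies its entries per destination bucket in shared memory, then
+ // one atomicAdd per bucket reserves a contiguous range in the global slice counts;
+ // a thread's final slot is its block's base offset plus its local offset.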
+
+ CUDA_ASSERT( gridDim.x >= BBCU_BUCKET_COUNT );
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = 0;
+
+ __syncthreads();
+
+ uint32 bucket;
+ uint32 offset;
+ uint32 rIndex;
+ uint64 lp;
+
+ if( gid < entryCount )
+ {
+ const RMap map = rmap[gid];
+
+ const uint32 left = map.dstL - lTableOffset;
+ const uint32 right = map.dstR - lTableOffset;
+
+ CUDA_ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( left < right );
+
+ rIndex = map.src;
+
+ const uint32 x = lTable[left ];
+ const uint32 y = lTable[right];
+
+ lp = CudaSquareToLinePoint64( x, y );
+
+ if constexpr( !isCompressed )
+ {
+ CUDA_ASSERT( x || y );
+ CUDA_ASSERT( lp );
+ bucket = (uint32)( lp / BucketDivisor );
+ }
+ else
+ bucket = (uint32)( lp >> lpShift );
+
+ CUDA_ASSERT( bucket < BBCU_BUCKET_COUNT );
+
+ offset = atomicAdd( &sharedBuckets[bucket], 1 );
+ }
+ __syncthreads();
+
+ // Global offset
+ if( id < BBCU_BUCKET_COUNT )
+ {
+ sharedBuckets[id] = atomicAdd( &gBucketCounts[id], sharedBuckets[id] );
+ CUDA_ASSERT( sharedBuckets[id] <= P3_PRUNED_SLICE_MAX );
+ }
+ __syncthreads();
+
+ if( gid >= entryCount )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBuckets[bucket] + offset;
+ CUDA_ASSERT( dst < P3_PRUNED_BUCKET_MAX );
+
+ outLPs [dst] = lp;
+ outIndices[dst] = rIndex;
+}
+
+//-----------------------------------------------------------
+static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryCount, const uint32 rOffset,
+ const uint32* lTable, const RMap* rMap, uint64* outLPs, uint32* outIndices, cudaStream_t stream )
+{
+ const TableId rTable = cx.table;
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( entryCount, threads );
+
+ const uint32 lTableOffset = cx.bucket * BBCU_BUCKET_ENTRY_COUNT;
+
+ uint32* devSliceCounts = cx.devSliceCounts + cx.bucket * BBCU_BUCKET_COUNT;
+ #define Rmap2LPParams entryCount, rOffset, lTableOffset, lTable, rMap, outLPs, outIndices, devSliceCounts
+
+ const bool isCompressed = rTable - 1 <= (TableId)cx.gCfg->numDroppedTables;
+
+ if( !isCompressed )
+ {
+ if( cx.bucket == 0 )
+ {
+ // Calculate the divisor needed to generate a uniform distribution across buckets
+ // and set it as a constant for our kernel.
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable - 1];
+ const uint64 divisor = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT;
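+ // Any line point from this table then maps to bucket lp / BucketDivisor,
+ // which the kernel asserts is < BBCU_BUCKET_COUNT.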
+
+ // #TODO: Use upload stream?
+ CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
+ }
+
+ CudaConvertRMapToLinePoints<false><<<blocks, threads, 0, stream>>>( Rmap2LPParams, 0 );
+ }
+ else
+ {
+ const uint32 xBits = cx.gCfg->compressedEntryBits;
+ const uint32 lpBits = (xBits * 2 - 1) * 2 - 1;
+ const uint32 lpBitShift = lpBits - BBC_BUCKET_BITS;
+
+ CudaConvertRMapToLinePoints<true><<<blocks, threads, 0, stream>>>( Rmap2LPParams, lpBitShift );
+ }
+
+ #undef Rmap2LPParams
+}
+
+/**
+ * Load RMap and L table and generate line points from RMap and L table.
+ * Write line points to their buckets, along with their origin index.
+*/
+//-----------------------------------------------------------
+void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
+{
+ auto LoadLBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ const bool isCompressed = (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
+
+ if( !isCompressed )
+ {
+ ASSERT( p3.prunedBucketCounts[(int)cx.table-1][cx.bucket] > 0 );
+
+ // Load lMap
+ // Horizontal load
+ const LMap* lmap = p3.hostLMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+
+ const uint32* lSliceCounts = &p3.step3.prunedBucketSlices[0][bucket];
+
+ s2.lMapIn.UploadArrayT( lmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, lSliceCounts );
+ }
+ else
+ {
+ ASSERT( cx.gCfg->compressionLevel > 0 );
+
+ if( bucket == 0 )
+ p3.pairsLoadOffset = 0;
+
+ // Load the compressed entries from the table pairs
+ const uint32* lEntries = (cx.hostBackPointers[(int)cx.table-1].left) + p3.pairsLoadOffset;
+ // const uint32* lEntries = cx.hostTableL + p3.pairsLoadOffset; // Our compressed x's are copied to the LMap buffer before we get to this point
+
+ // #TODO: Do a preload here instead and have each bucket start at the max bucket offset
+ // const uint32 bucketEntryCount = cx.bucketCounts[(int)cx.table-1][bucket];
+
+ s2.lMapIn.UploadT( lEntries, BBCU_BUCKET_ENTRY_COUNT );
+ p3.pairsLoadOffset += BBCU_BUCKET_ENTRY_COUNT;
+ }
+ };
+
+ auto UnpackLBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ const bool isCompressed = (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
+
+ const auto* lMap = (LMap*)s2.lMapIn.GetUploadedDeviceBuffer( cx.computeStream );
+ uint32* lTable = s2.devLTable[bucket & 1];
+
+ if( isCompressed )
+ {
+ // Copy from upload buffer to working buffer
+ CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
+ }
+ else
+ {
+ // Unpack next LMap and copy to the end of the current map
+ const uint32 lEntryCount = p3.prunedBucketCounts[(int)cx.table-1][bucket];
+ ASSERT( lEntryCount > 0 );
+
+ UnpackLMap( cx, lEntryCount, lMap, lTable, bucket, cx.computeStream );
+ }
+ };
+
+ auto LoadRBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ // Load rMap
+ // Horizontal load
+ const RMap* rmap = p3.hostRMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+
+ const uint32* rSliceCounts = &p3.step1.prunedBucketSlices[0][bucket];
+
+ s2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
+ };
+
+
+ const TableId rTable = cx.table;
+ const TableId lTable = rTable-1;
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+
+ // We always keep one L bucket loaded ahead: R entries near the end of the current
+ // bucket can reference L entries at the start of the next bucket, so the next
+ // bucket's initial entries are appended to the current bucket's L table.
+ LoadLBucket( cx, 0 );
+ LoadLBucket( cx, 1 );
+ LoadRBucket( cx, 0 );
+
+ // Clear pruned entry count
+ CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
+
+ // Unpack the first map beforehand
+ UnpackLBucket( cx, 0 );
+
+
+ ///
+ /// Process buckets
+ ///
+ uint32 rTableOffset = 0; // Track the global origin index of R entry/line point
+
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+ const uint32 nextBucket = bucket + 1;
+ const uint32 nextBucketL = bucket + 2;
+
+ const uint32* devLTable = s2.devLTable[bucket & 1];
+
+ // Preload next buckets
+ if( nextBucket < BBCU_BUCKET_COUNT )
+ {
+ LoadRBucket( cx, nextBucket );
+
+ UnpackLBucket( cx, nextBucket );
+ s2.lMapIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ // Copy start of next bucket to the end of the current one
+ const uint32 copyCount = BBCU_BUCKET_COUNT * BBCU_XTRA_ENTRIES_PER_SLICE;
+ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount );
+
+ uint32* nextLTable = s2.devLTable[nextBucket & 1];
+ CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
+ }
+
+ if( nextBucketL < BBCU_BUCKET_COUNT )
+ LoadLBucket( cx, nextBucketL );
+
+
+ // Generate line points given the unpacked LMap as input and the RMap
+ const auto* rMap = (RMap*)s2.rMapIn.GetUploadedDeviceBuffer( cx.computeStream );
+ const uint32 rEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];
+
+
+ uint64* devOutLPs = (uint64*)s2.lpOut .LockDeviceBuffer( cx.computeStream );
+ uint32* devOutIndices = (uint32*)s2.indexOut.LockDeviceBuffer( cx.computeStream );
+
+ ConvertRMapToLinePoints( cx, rEntryCount, rTableOffset, devLTable, rMap, devOutLPs, devOutIndices, cx.computeStream );
+ s2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
+ rTableOffset += rEntryCount;
+
+
+ // Horizontal download (write 1 row)
+ s2.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
+ s2.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX*3, P3_PRUNED_SLICE_MAX, cx.computeStream );
+ }
+
+ #if _DEBUG
+ {
+ size_t tableLength = 0;
+ uint32 activeBucketCount = 0;
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ {
+ ASSERT( p3.prunedBucketCounts[(int)rTable][i] <= P3_PRUNED_BUCKET_MAX );
+ tableLength += p3.prunedBucketCounts[(int)rTable][i];
+
+ if( p3.prunedBucketCounts[(int)rTable][i] ) activeBucketCount++;
+ }
+
+ ASSERT( tableLength <= BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ ASSERT( tableLength == p3.prunedTableEntryCounts[(int)rTable] );
+ }
+ #endif
+
+ s2.lpOut.WaitForCompletion();
+ s2.lpOut.Reset();
+
+ s2.indexOut.WaitForCompletion();
+ s2.indexOut.Reset();
+
+ s2.lMapIn.Reset();
+ s2.rMapIn.Reset();
+
+ // Copy slice counts & bucket count
+ cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream();
+
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+ bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ ASSERT( s2.prunedBucketSlices[slice][bucket] <= P3_PRUNED_SLICE_MAX );
+ p3.prunedBucketCounts[(int)rTable][bucket] += s2.prunedBucketSlices[slice][bucket];
+ }
+ // //ASSERT( p3.hostBucketCounts[i] );
+ ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
+ }
+
+ // #if _DEBUG
+ // if( cx.table > TableId::Table3 )
+ // {
+ // DbgValidateStep2Output( cx );
+ // }
+ // #endif
+}
+
+//-----------------------------------------------------------
+void WritePark7( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ ASSERT( p3.prunedBucketCounts[(int)TableId::Table7][cx.bucket] > 0 );
+
+ // Load lMap
+ // Horizontal load
+ const LMap* lmap = p3.hostLMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+
+ const uint32* lSliceCounts = &p3.step3.prunedBucketSlices[0][bucket];
+
+ s2.lMapIn.UploadArrayT( lmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, lSliceCounts );
+ };
+
+ ASSERT( cx.table == TableId::Table7 );
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+
+ // Load initial bucket
+ LoadBucket( cx, 0 );
+
+ // Begin park 7 table in plot
+ cx.plotWriter->BeginTable( PlotTable::Table7 );
+
+ constexpr size_t parkSize = CalculatePark7Size( BBCU_K );
+ constexpr size_t parkFieldCount = parkSize / sizeof( uint64 );
+ static_assert( parkFieldCount * sizeof( uint64 ) == parkSize );
+
+
+ GpuDownloadBuffer& parkDownloader = s2.lpOut;
+
+ constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+ static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );
+
+
+ // Host stuff
+ constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
+ StackAllocator hostAllocator( p3.hostRMap, hostMetaTableSize );
+
+ const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table];
+ const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark );
+
+ byte* hostParks = hostAllocator.AllocT<byte>( totalParkCount * parkSize );
+ byte* hostParkWriter = hostParks;
+ uint32* hostLastParkEntries = hostAllocator.CAlloc<uint32>( kEntriesPerPark );
+
+ static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 );
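+ // The index buffer starts one park's worth into the L table buffer so that entries
+ // retained from the previous bucket can be prepended in front of it (see below).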
+ uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark;
+ uint32 retainedEntryCount = 0;
+
+ // Begin serialization
+ cudaStream_t downloadStream = parkDownloader.GetQueue()->GetStream();
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket+1 );
+
+ const uint32 bucketEntryCount = p3.prunedBucketCounts[(int)TableId::Table7][bucket];
+
+ // Unmap bucket
+ auto* lMap = (LMap*)s2.lMapIn.GetUploadedDeviceBuffer( cx.computeStream );
+ UnpackLMap( cx, bucketEntryCount, lMap, devIndexBuffer, bucket, cx.computeStream );
+ s2.lMapIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ // Serialize indices into a park
+ uint32* indices = devIndexBuffer - retainedEntryCount;
+ uint32 indexCount = bucketEntryCount + retainedEntryCount;
+
+ const uint32 parkCount = indexCount / kEntriesPerPark;
+
+ uint64* devParkFields = (uint64*)parkDownloader.LockDeviceBuffer( cx.computeStream );
+ SerializePark7InGPU( parkCount, indices, devParkFields, parkFieldCount, cx.computeStream );
+
+
+ // Retain any entries that did not fit into a park
+ retainedEntryCount = indexCount - (parkCount * kEntriesPerPark);
+ if( retainedEntryCount > 0 )
+ {
+ const bool isLastBucket = bucket + 1 == BBCU_BUCKET_COUNT;
+
+ const uint32 serializedEntryCount = parkCount * kEntriesPerPark;
+ const uint32* copySource = indices + serializedEntryCount;
+ const size_t copySize = sizeof( uint32 ) * retainedEntryCount;
+
+ if( !isLastBucket )
+ CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
+ else
+ CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
+ }
+
+ // Download parks & write to plot
+ const size_t downloadSize = parkCount * parkSize;
+
+ parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize,
+ []( void* parksBuffer, size_t size, void* userData ) {
+
+ auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+ cx.plotWriter->WriteTableData( parksBuffer, size );
+ }, &cx, cx.computeStream );
+
+ hostParkWriter += downloadSize;
+ }
+
+ // Wait for parks to complete downloading
+ parkDownloader.WaitForCompletion();
+ parkDownloader.Reset();
+
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+
+ // Was there a left-over park?
+ if( retainedEntryCount > 0 )
+ {
+ // Submit last park to plot
+ TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
+ cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+ }
+ cx.plotWriter->EndTable();
+
+ // Cleanup
+ s2.lMapIn.Reset();
+}
+
+
+#if _DEBUG
+
+//-----------------------------------------------------------
+static void _DbgValidateOutput( CudaK32PlotContext& cx );
+void DbgValidateStep2Output( CudaK32PlotContext& cx )
+{
+ // New stack (prevent overflow)
+ auto* thread = new Thread();
+ thread->Run( []( void* p ) {
+ _DbgValidateOutput( *(CudaK32PlotContext*)p );
+ }, &cx );
+
+ thread->WaitForExit();
+ delete thread;
+}
+
+//-----------------------------------------------------------
+void _DbgValidateOutput( CudaK32PlotContext& cx )
+{
+ const TableId rTable = cx.table;
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ // Validate line points...
+ uint64* refLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ uint32* indices = bbcvirtallocboundednuma<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+
+ uint64* writer = refLinePoints;
+ uint32* idxWriter = indices;
+
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable];
+
+ const uint32 lpBits = 63; // #TODO: Change when compressing here
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ uint64* reader = p3.hostLinePoints + bucket * P3_PRUNED_SLICE_MAX;
+ uint32* idxReader = p3.hostIndices + bucket * P3_PRUNED_SLICE_MAX*3;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice ++ )
+ {
+ const size_t count = s2.prunedBucketSlices[slice][bucket];
+ bbmemcpy_t( writer , reader , count );
+ bbmemcpy_t( idxWriter, idxReader, count );
+
+ // The line points must be in their given buckets if inlined x's
+ if( cx.table-1 == TableId::Table1 )
+ {
+ for( size_t i = 0; i < count; i++ )
+ {
+ const uint64 lp = writer[i];
+ const uint32 b = lp >> lpBucketShift;
+ ASSERT( b == bucket );
+ }
+ }
+
+ writer += count;
+ idxWriter += count;
+ reader += P3_PRUNED_BUCKET_MAX;
+ idxReader += P3_PRUNED_BUCKET_MAX*3;
+ }
+ }
+
+ const uint64 readEntries = (uint64)( (uintptr_t)writer - (uintptr_t)refLinePoints ) / sizeof( uint64 );
+ ASSERT( readEntries == prunedEntryCount );
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+ RadixSort256::Sort( pool, refLinePoints, tmpLinePoints, prunedEntryCount );
+ RadixSort256::Sort( pool, indices, (uint32*)tmpLinePoints, prunedEntryCount );
+
+ for( uint32 i = 1; i < (uint32)prunedEntryCount; i++ )
+ {
+ ASSERT( indices[i] >= indices[i-1] );
+ }
+
+ for( uint64 i = 1; i < prunedEntryCount; i++ )
+ {
+ ASSERT( refLinePoints[i] >= refLinePoints[i-1] );
+ }
+
+ // Delta test
+ // #TODO: Get correct stub bit size depending on compression
+ const uint32 stubBitSize = (BBCU_K - kStubMinusBits);
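+ // After stripping the stub bits, the remaining high-order delta should be small;
+ // the assert below checks it stays under 256, as the park delta encoding expects.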
+ for( uint32 i = 0; i < (uint32)prunedEntryCount; i+=kEntriesPerPark )
+ {
+ const uint32 parkCount = std::min( prunedEntryCount - i, (uint64)kEntriesPerPark );
+
+ const uint64* park = refLinePoints + i;
+
+ uint64 prevLp = park[0];
+
+ for( uint32 j = 1; j < parkCount; j++ )
+ {
+ uint64 lp = park[j];
+ uint64 delta = lp - prevLp;
+ uint64 smallDelta = delta >> stubBitSize;
+ ASSERT( smallDelta < 256 );
+
+ prevLp = lp;
+ }
+ }
+
+ bbvirtfreebounded( refLinePoints );
+ bbvirtfreebounded( tmpLinePoints );
+ bbvirtfreebounded( indices );
+}
+
+#endif
+
+//-----------------------------------------------------------
+void DbgDumpSortedLinePoints( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Prpaparing line ponts for writing to file." );
+ const TableId rTable = cx.table;
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+
+ uint64* sortedLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+
+ uint64* writer = sortedLinePoints;
+
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable];
+
+ const uint32 lpBits = 63; // #TODO: Change when compressing here
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ uint64* reader = p3.hostLinePoints + bucket * P3_PRUNED_SLICE_MAX;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice ++ )
+ {
+ const size_t count = s2.prunedBucketSlices[slice][bucket];
+ bbmemcpy_t( writer, reader, count );
+
+ writer += count;
+ reader += P3_PRUNED_BUCKET_MAX;
+ }
+ }
+
+ // Sort
+ ThreadPool& pool = *cx.threadPool; //DbgGetThreadPool( cx );
+ RadixSort256::Sort( pool, sortedLinePoints, tmpLinePoints, prunedEntryCount );
+
+ // Write to disk
+ {
+ char filePath[1024] = {};
+ sprintf( filePath, "%s/lp.c%u.ref", "/home/harold/plot/ref/compressed-lps", (uint32)cx.gCfg->compressionLevel );
+
+ FileStream file;
+ if( file.Open( filePath, FileMode::Open, FileAccess::Read ) )
+ {
+ Log::Line( "[DEBUG]File %s already exists. Cannot overwrite.", filePath );
+ }
+ else
+ {
+ Log::Line( "[DEBUG] Writing line points to %s", filePath );
+ file.Close();
+ file.Open( filePath, FileMode::Create, FileAccess::Write );
+
+ void* block = bbvirtalloc( file.BlockSize() );
+ int err;
+ if( !IOJob::WriteToFile( file, sortedLinePoints, prunedEntryCount * sizeof( uint64 ), block, file.BlockSize(), err ) )
+ Log::Line( "Failed to to file %s with error %d.", filePath, err );
+
+ bbvirtfree( block );
+
+ Log::Line( "[DEBUG] Wrote %llu line points", prunedEntryCount );
+ }
+
+ file.Close();
+ }
+
+ bbvirtfreebounded( sortedLinePoints );
+ bbvirtfreebounded( tmpLinePoints );
+}
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
new file mode 100644
index 00000000..3949bd8c
--- /dev/null
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -0,0 +1,573 @@
+#include "CudaPlotPhase3Internal.h"
+#include "CudaParkSerializer.h"
+#include "plotmem/ParkWriter.h"
+
+static void GenerateLMap( CudaK32PlotContext& cx, const uint32 entryCount, const uint32 finalOffset, const uint32* indices, cudaStream_t stream );
+static void DeltafyLinePoints( CudaK32PlotContext& cx, const uint32 entryCount, const uint64* linePoints, uint64* deltaLinePoints, cudaStream_t stream );
+
+#if _DEBUG
+ #include "plotdisk/jobs/IOJob.h"
+ static void DbgSaveLMap( CudaK32PlotContext& cx );
+ static void DbgValidateLMapData( CudaK32PlotContext& cx );
+ static void DbgValidateLMap( CudaK32PlotContext& cx );
+#endif
+
+//-----------------------------------------------------------
+void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+ auto& s3 = p3.step3;
+
+ // if( bucket == 0 )
+ // p3.pairsLoadOffset = 0;
+
+ // Load line points and their source indices
+ const TableId rTable = cx.table;
+ const uint32 entryCount = p3.prunedBucketCounts[(int)rTable][bucket];
+ ASSERT( entryCount <= P3_PRUNED_BUCKET_MAX );
+
+ if( entryCount < 1 )
+ return;
+
+ // Vertical input layout of data: Start at row 0, column according to the current bucket
+ const uint64* linePoints = p3.hostLinePoints + (size_t)bucket * P3_PRUNED_SLICE_MAX;
+ const uint32* indices = p3.hostIndices + (size_t)bucket * P3_PRUNED_SLICE_MAX * 3; // This buffer is shared with RMap ((uint32)*3) (which we're about to write to),
+ // which is why we multiply by 3
+
+ const uint32* counts = &s2.prunedBucketSlices[0][bucket];
+
+ // Load 1 column
+ s3.lpIn .UploadArrayT( linePoints, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX , BBCU_BUCKET_COUNT, counts );
+ s3.indexIn.UploadArrayT( indices , BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX*3, BBCU_BUCKET_COUNT, counts );
+ };
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ const TableId rTable = cx.table;
+ const TableId lTable = cx.table-1;
+
+ // Load CTable
+ const bool isCompressed = cx.gCfg->compressionLevel > 0 && lTable <= (TableId)cx.gCfg->numDroppedTables;
+ const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.subtSizeBits;
+ const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
+
+ const size_t cTableSize = !isCompressed ? sizeof( CTable_0 ) : cx.gCfg->cTableSize; ASSERT( cTableSize <= P3_MAX_CTABLE_SIZE );
+ const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;
+
+ // (upload must be loaded before first bucket, on the same stream)
+ CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
+ s3.lpIn.GetQueue()->GetStream() ) );
+
+ // Load initial bucket
+ LoadBucket( cx, 0 );
+
+ // Begin plot table
+ cx.plotWriter->BeginTable( (PlotTable)lTable );
+
+
+ uint32 mapOffset = 0;
+ uint32 retainedLPCount = 0; // Line points retained for the next bucket to write to park
+
+ const size_t hostParkSize = isCompressed ? cx.gCfg->compressionInfo.tableParkSize : CalculateParkSize( lTable );
+ ASSERT( DEV_MAX_PARK_SIZE >= hostParkSize );
+
+ // #TODO: Move this allocation to the beginning
+ if( s3.parkFence == nullptr )
+ s3.parkFence = new Fence();
+
+ byte* hostParksWriter = (byte*)cx.hostBackPointers[(int)rTable].left; //(byte*)cx.hostTableL;
+ uint64* hostRetainedEntries = nullptr;
+
+ // if( !isCompressed && lTable == TableId::Table1 )
+ // hostParksWriter = (byte*)cx.hostBackPointers[(int)TableId::Table2].left;
+
+ ///
+ /// Process buckets
+ ///
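+ // Sorted output starts one park's worth into devLinePoints so that line points
+ // retained from the previous bucket can be prepended in front of it.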
+ uint64* sortedLinePoints = s3.devLinePoints + kEntriesPerPark;
+ uint32* sortedIndices = s3.devIndices;
+
+ cudaStream_t sortAndMapStream = cx.computeStream;
+ cudaStream_t lpStream = cx.computeStream;//B;
+ cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream();
+
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) );
+ CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) );
+
+ // Set initial event LP stream event as set.
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
+
+ s3.parkFence->Reset( 0 );
+ s3.parkBucket = 0;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ const uint32 bucketEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];
+
+ if( bucketEntryCount == 0 )
+ break;
+
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket + 1 );
+
+ // Wait for upload to finish
+ uint64* unsortedLinePoints = (uint64*)s3.lpIn .GetUploadedDeviceBuffer( sortAndMapStream );
+ uint32* unsortedIndices = (uint32*)s3.indexIn.GetUploadedDeviceBuffer( sortAndMapStream );
+
+ // Sort line points
+ #if _DEBUG
+ {
+ size_t sortRequiredSize = 0;
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) );
+ ASSERT( s3.sizeTmpSort >= sortRequiredSize );
+ }
+ #endif
+
+ // Wait for the previous bucket's LP work to finish, so we can re-use the device buffer
+ CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) );
+
+ // #TODO: We can use 63-7 (log2(128 buckets)), which might be faster
+ // #NOTE: I did change it and the sort failed. Investigate.
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs(
+ s3.devSortTmpData, s3.sizeTmpSort,
+ unsortedLinePoints, sortedLinePoints,
+ unsortedIndices, sortedIndices,
+ bucketEntryCount, 0, 64, sortAndMapStream ) );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) );
+
+ s3.lpIn .ReleaseDeviceBuffer( sortAndMapStream ); unsortedLinePoints = nullptr;
+ s3.indexIn.ReleaseDeviceBuffer( sortAndMapStream ); unsortedIndices = nullptr;
+
+ ///
+ /// Map
+ ///
+ // Generate map and download to it to host
+ GenerateLMap( cx, bucketEntryCount, mapOffset, sortedIndices, sortAndMapStream );
+ mapOffset += bucketEntryCount;
+
+ // Vertical download map (write 1 column)
+ s3.mapOut.Download2DT( p3.hostLMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
+ P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, sortAndMapStream );
+
+
+ ///
+ /// Line points
+ ///
+ // If we have retained entries, let's account for them in this bucket
+ uint64* parkLinePoints = sortedLinePoints - retainedLPCount;
+
+ const uint32 totalEntryCount = bucketEntryCount + retainedLPCount;
+ const uint32 parkCount = totalEntryCount / kEntriesPerPark;
+ const uint32 entryCount = parkCount * kEntriesPerPark;
+ ASSERT( parkCount <= P3_PRUNED_MAX_PARKS_PER_BUCKET );
+
+ // Wait for sort to finish
+ CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) );
+
+ // Deltafy line points
+ DeltafyLinePoints( cx, entryCount, parkLinePoints, s3.devDeltaLinePoints, lpStream );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park
+
+ // Compress line point parks
+ byte* devParks = (byte*)s3.parksOut.LockDeviceBuffer( lpStream );
+ CompressToParkInGPU( parkCount, hostParkSize, s3.devDeltaLinePoints, devParks, DEV_MAX_PARK_SIZE, stubBitSize, s3.devCTable, s3.devParkOverrunCount, lpStream );
+
+ // Retain any entries that did not make it into parks for the next bucket to process
+ retainedLPCount = totalEntryCount - (parkCount * kEntriesPerPark);
+ if( retainedLPCount > 0 )
+ {
+ // Last bucket?
+ const bool isLastBucket = bucket + 1 == BBCU_BUCKET_COUNT;
+
+ const uint64* copySource = parkLinePoints + entryCount;
+ const size_t copySize = sizeof( uint64 ) * retainedLPCount;
+
+ if( !isLastBucket )
+ {
+ // Not the last bucket, so retain entries for the next GPU compression bucket
+ CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) );
+ }
+ else
+ {
+ // No more buckets so we have to compress this last park on the CPU
+ CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
+
+ hostRetainedEntries = (uint64*)( hostParksWriter + hostParkSize * parkCount );
+ CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
+ }
+ }
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again
+
+
+ // Download parks
+ s3.parksOut.Download2DWithCallback( hostParksWriter, hostParkSize, parkCount, hostParkSize, DEV_MAX_PARK_SIZE,
+ []( void* parksBuffer, size_t size, void* userData ) {
+
+ auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+ auto& s3 = cx.phase3->step3;
+
+ cx.plotWriter->WriteTableData( parksBuffer, size );
+ cx.plotWriter->SignalFence( *s3.parkFence, ++s3.parkBucket );
+
+ }, &cx, lpStream, cx.downloadDirect );
+
+ hostParksWriter += hostParkSize * parkCount;
+ }
+
+ // Copy park overrun count
+ CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) );
+
+ // Wait for parks to complete downloading
+ s3.parksOut.WaitForCompletion();
+ s3.parksOut.Reset();
+
+ // Copy map slice counts (for the next step 2)
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+ memcpy( &s3.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+
+ FatalIf( *s3.hostParkOverrunCount > 0, "Park buffer overrun." );
+
+ // Was there a left-over park?
+ if( retainedLPCount > 0 )
+ {
+ ASSERT( hostRetainedEntries );
+
+ uint64 lastParkEntries[kEntriesPerPark];
+ bbmemcpy_t( lastParkEntries, hostRetainedEntries, retainedLPCount );
+
+ WritePark( hostParkSize, retainedLPCount, lastParkEntries, hostParksWriter, stubBitSize, hostCTable );
+ cx.plotWriter->WriteTableData( hostParksWriter, hostParkSize );
+ }
+ cx.plotWriter->EndTable();
+
+ // Update buckets counts for L table
+ // #TODO: These should match Step 1 pruned entry count I believe, so just copy?
+
+ memset( p3.prunedBucketCounts[(int)rTable], 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT );
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ for( uint32 j = 0; j < BBCU_BUCKET_COUNT; j++ )
+ p3.prunedBucketCounts[(int)rTable][i] += s3.prunedBucketSlices[j][i];
+
+ s3.mapOut.WaitForCompletion();
+ s3.mapOut.Reset();
+
+ s3.lpIn .Reset();
+ s3.indexIn.Reset();
+
+
+ // #if _DEBUG
+ // //if( cx.table >= TableId::Table6 )
+ // //{
+ // DbgValidateLMap( cx );
+ // DbgValidateLMapData( cx );
+ // // DbgSaveLMap( cx );
+ // //}
+ // #endif
+}
+
+
+//-----------------------------------------------------------
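+// Scatter sorted source indices into LMap entries, bucketed by the destination bucket encoded
+// in the top BBC_BUCKET_BITS bits of each index. Each block first builds a shared-memory
+// histogram, then reserves space in the global per-bucket counters with one atomicAdd per
+// bucket before writing its entries out.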
+__global__ void CudaGenerateLMap( const uint32 entryCount, const uint32 finalOffset, const uint32* indices, LMap* gMap, uint32* gBucketCounts )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedBucketCounts[BBCU_BUCKET_COUNT];
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBucketCounts[id] = 0;
+
+ __syncthreads();
+
+ uint32 index;
+ uint32 bucket;
+ uint32 offset;
+
+ if( gid < entryCount )
+ {
+ index = indices[gid];
+
+ bucket = ( index >> (32 - BBC_BUCKET_BITS) );
+ offset = atomicAdd( &sharedBucketCounts[bucket], 1 );
+ }
+
+ __syncthreads();
+
+ // Global offset
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBucketCounts[id] = atomicAdd( &gBucketCounts[id], sharedBucketCounts[id] );
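+ // atomicAdd returns the bucket's previous global count, which becomes this block's base offset within that bucket's slice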
+
+ __syncthreads();
+
+ if( gid >= entryCount )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBucketCounts[bucket] + offset;
+
+ //CUDA_ASSERT( index != 0 );
+
+ LMap map;
+ map.sortedIndex = finalOffset + gid;
+ map.sourceIndex = index;
+#if _DEBUG
+ CUDA_ASSERT( map.sortedIndex != 0 || map.sourceIndex != 0 );
+#endif
+ gMap[dst] = map;
+}
+
+//-----------------------------------------------------------
+void GenerateLMap( CudaK32PlotContext& cx, const uint32 entryCount, const uint32 finalOffset, const uint32* indices, cudaStream_t stream )
+{
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( entryCount, threads );
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ auto* devMap = (LMap*)s3.mapOut.LockDeviceBuffer( stream );
+ uint32* devSliceCounts = cx.devSliceCounts + cx.bucket * BBCU_BUCKET_COUNT;
+
+ CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, stream ) );
+
+ CudaGenerateLMap<<<blocks, threads, 0, stream>>>( entryCount, finalOffset, indices, devMap, devSliceCounts );
+}
+
+//-----------------------------------------------------------
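+// Convert line points into park deltas: the first entry of every park keeps its absolute
+// value, every other entry stores the delta to its predecessor.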
+__global__ void CudaDeltafyLinePoints( const uint32 entryCount, const uint64* linePoints, uint64* deltaLinePoints )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ const bool isFirstParkEntry = ( gid & ( kEntriesPerPark - 1 ) ) == 0;
+
+ if( isFirstParkEntry )
+ {
+ deltaLinePoints[gid] = linePoints[gid];
+ }
+ else
+ {
+ //CUDA_ASSERT( linePoints[gid] && linePoints[gid - 1] );
+ CUDA_ASSERT( linePoints[gid] >= linePoints[gid - 1] );
+ deltaLinePoints[gid] = linePoints[gid] - linePoints[gid - 1];
+ }
+}
+
+//-----------------------------------------------------------
+void DeltafyLinePoints( CudaK32PlotContext& cx, const uint32 entryCount, const uint64* linePoints, uint64* deltaLinePoints, cudaStream_t stream )
+{
+ ASSERT( entryCount / kEntriesPerPark * kEntriesPerPark == entryCount );
+
+ const uint32 threadsPerBlock = 256;
+ const uint32 blockCount = CDivT( entryCount, threadsPerBlock );
+ CudaDeltafyLinePoints<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, linePoints, deltaLinePoints );
+}
+
+
+
+#if _DEBUG
+
+//-----------------------------------------------------------
+void DbgSaveLMap( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Saving table %u LMap", (uint)cx.table+1 );
+ auto& p3 = *cx.phase3;
+
+ char path[512];
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
+
+ const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
+ int err;
+ FatalIf( !IOJob::WriteToFile( path, p3.hostLMap, writeSize, err ),
+ "[DEBUG] Failed to write LMap with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.slices.tmp", (uint)cx.table+1 );
+ FatalIf( !IOJob::WriteToFileUnaligned( path, p3.step3.prunedBucketSlices, sizeof( p3.step3.prunedBucketSlices ), err ),
+ "[DEBUG] Failed to write LMap slices with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
+ FatalIf( !IOJob::WriteToFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
+ "[DEBUG] Failed to write LMap buckets with error: %d", err );
+
+ Log::Line( " [DEBUG] OK" );
+}
+
+//-----------------------------------------------------------
+void DbgLoadLMap( CudaK32PlotContext& cx )
+{
+ auto& p3 = *cx.phase3;
+
+ char path[512];
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
+
+ const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
+ int err;
+ FatalIf( !IOJob::ReadFromFile( path, p3.hostLMap, writeSize, err ),
+ "[DEBUG] Failed to read LMap with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.slices.tmp", (uint)cx.table+1 );
+ FatalIf( !IOJob::ReadFromFileUnaligned( path, p3.step3.prunedBucketSlices, sizeof( p3.step3.prunedBucketSlices ), err ),
+ "[DEBUG] Failed to read LMap slices with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
+
+ FatalIf( !IOJob::ReadFromFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
+ "[DEBUG] Failed to read LMap buckets with error: %d", err );
+
+ //DbgValidateLMapData( cx );
+}
+
+//-----------------------------------------------------------
+void DbgValidateLMap( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Validating LMap..." );
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_TABLE_ENTRY_COUNT );
+
+
+ {
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
+
+ uint64 entryCount = 0;
+ LMap* writer = lMap;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ // Read counts vertically, but read data horizontally
+ const uint32 copyCount = s3.prunedBucketSlices[slice][bucket];
+
+ bbmemcpy_t( writer, reader, copyCount );
+
+ writer += copyCount;
+ entryCount += copyCount;
+ reader += P3_PRUNED_SLICE_MAX;
+ }
+
+ // All source entries should belong to the same bucket
+ ASSERT( entryCount == p3.prunedBucketCounts[(int)cx.table][bucket] );
+
+ for( uint64 i = 0; i < entryCount; i++ )
+ {
+ const LMap map = lMap[i];
+
+ ASSERT( map.sourceIndex || map.sortedIndex );
+ ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
+ }
+ }
+
+
+ }
+
+ bbvirtfreebounded( lMap );
+
+ Log::Line( "[DEBUG] OK" );
+}
+
+//-----------------------------------------------------------
+static void _DbgValidateLMapData( CudaK32PlotContext& cx );
+void DbgValidateLMapData( CudaK32PlotContext& cx )
+{
+ // New stack (prevent overflow)
+ auto* thread = new Thread();
+ thread->Run( []( void* p ) {
+ _DbgValidateLMapData( *(CudaK32PlotContext*)p );
+ }, &cx );
+
+ thread->WaitForExit();
+ delete thread;
+}
+
+void _DbgValidateLMapData( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Validating LMap uniqueness..." );
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ uint32* srcIndices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* dstIndices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* tmpIndices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+
+ uint64 entryCount = 0;
+ uint32 twoCount = 0;
+ {
+ uint32* srcWriter = srcIndices;
+ uint32* dstWriter = dstIndices;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ // Read counts vertically, but read data horizontally
+ const uint32 copyCount = s3.prunedBucketSlices[slice][bucket];
+
+ for( uint32 i = 0; i < copyCount; i++ )
+ {
+ if( reader[i].sourceIndex == 2 )
+ twoCount++;
+ if( reader[i].sourceIndex == 0 && reader[i].sortedIndex == 0 )
+ {
+ ASSERT( 0 );
+ }
+
+ srcWriter[i] = reader[i].sourceIndex;
+ dstWriter[i] = reader[i].sortedIndex;
+ }
+
+ srcWriter += copyCount;
+ dstWriter += copyCount;
+ entryCount += copyCount;
+ reader += P3_PRUNED_SLICE_MAX;
+ }
+ }
+
+ ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
+ }
+
+ RadixSort256::Sort( pool, srcIndices, tmpIndices, entryCount );
+ RadixSort256::Sort( pool, dstIndices, tmpIndices, entryCount );
+
+ // Indices must not repeat:
+ for( uint64 i = 1; i < entryCount; i++ )
+ {
+ ASSERT( srcIndices[i] > srcIndices[i-1] );
+ }
+
+ Log::Line( "Maximum source index: %u", srcIndices[entryCount-1] );
+
+ for( uint64 i = 0; i < entryCount; i++ )
+ {
+ ASSERT( dstIndices[i] == i );
+ }
+
+ bbvirtfreebounded( srcIndices );
+ bbvirtfreebounded( dstIndices );
+ bbvirtfreebounded( tmpIndices );
+
+ Log::Line( "[DEBUG] OK" );
+}
+
+#endif
+
diff --git a/cuda/CudaPlotUtil.cu b/cuda/CudaPlotUtil.cu
new file mode 100644
index 00000000..4f7f18b3
--- /dev/null
+++ b/cuda/CudaPlotUtil.cu
@@ -0,0 +1,124 @@
+#include "CudaPlotContext.h"
+
+//-----------------------------------------------------------
+__global__ void GenSortKey( const uint32 entryCount, uint32* key )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ key[gid] = gid;
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_t stream, bool synchronize )
+{
+ const uint32 threadsPerBlock = 128;
+ const uint32 blockCount = CDiv( entryCount, threadsPerBlock );
+
+ if( stream == nullptr )
+ stream = CU_STREAM_LEGACY;
+
+ GenSortKey<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, devKey );
+ if( synchronize )
+ CudaErrCheck( cudaStreamSynchronize( stream ) );
+
+}
+
+//-----------------------------------------------------------
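+// Gather-style sort: output[i] = input[key[i]], where key is the permutation produced by the
+// radix sort of (y, key) pairs.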
+template<typename T>
+__global__ void SortByKey( const uint32 entryCount, const uint32* key, const T* input, T* output )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ output[gid] = input[key[gid]];
+}
+
+//-----------------------------------------------------------
+template<typename T>
+void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const T* devInput, T* devOutput, cudaStream_t stream, bool synchronize )
+{
+ const uint32 threadsPerBlock = 128;
+ const uint32 blockCount = CDiv( entryCount, threadsPerBlock );
+
+ if( stream == nullptr )
+ stream = CU_STREAM_LEGACY;
+
+ SortByKey<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, devKey, devInput, devOutput );
+ if( synchronize )
+ CudaErrCheck( cudaStreamSynchronize( stream ) );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotSortMeta( const uint32 entryCount, const uint32* devKey, const uint32* devMetaIn, uint32* devMetaOutput, cudaStream_t stream )
+{
+
+}
+
+
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const uint16* devInput, uint16* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const uint32* devInput, uint32* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const uint64* devInput, uint64* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const K32Meta3* devInput, K32Meta3* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const K32Meta4* devInput, K32Meta4* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const Pair* devInput, Pair* devOutput, cudaStream_t stream, bool synchronize );
+
+
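+// Substitute the x values referenced by each pair's left/right indices directly into the pair,
+// so the first table's x buffer never has to be stored separately.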
+__global__ void K32InlineXsIntoPairsKernel( const uint32 entryCount, Pair* outPairs, const Pair* inPairs, const uint32* xs )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ const Pair pair = inPairs[gid];
+
+ Pair inlined;
+ inlined.left = xs[pair.left ];
+ inlined.right = xs[pair.right];
+ CUDA_ASSERT( inlined.left || inlined.right );
+
+ outPairs[gid] = inlined;
+}
+
+void CudaK32InlineXsIntoPairs(
+ const uint32 entryCount,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ const uint32* devXs,
+ cudaStream_t stream )
+{
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDivT( entryCount, kthreads );
+
+ K32InlineXsIntoPairsKernel<<<kblocks, kthreads, 0, stream>>>(
+ entryCount, devOutPairs, devInPairs, devXs );
+}
+
+
+__global__ void K3ApplyPairOffsetKernel( const uint32 entryCount, const uint32 offset, Pair* outPairs, const Pair* inPairs )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ Pair pair = inPairs[gid];
+ pair.left += offset;
+ pair.right += offset;
+
+ outPairs[gid] = pair;
+}
+void CudaK32ApplyPairOffset(
+ const uint32 entryCount,
+ const uint32 offset,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ cudaStream_t stream )
+{
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDivT( entryCount, kthreads );
+
+ K3ApplyPairOffsetKernel<<<kblocks, kthreads, 0, stream>>>(
+ entryCount, offset, devOutPairs, devInPairs );
+}
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
new file mode 100644
index 00000000..8e0458dd
--- /dev/null
+++ b/cuda/CudaPlotter.cu
@@ -0,0 +1,1570 @@
+#include "CudaPlotter.h"
+#include "CudaPlotContext.h"
+#include "pos/chacha8.h"
+#include "b3/blake3.h"
+#include "threading/MTJob.h"
+#include "util/jobs/MemJobs.h"
+#include "util/StackAllocator.h"
+#include "CudaParkSerializer.h"
+#include "plotting/CTables.h"
+#include "plotting/TableWriter.h"
+#include "plotting/PlotTools.h"
+
+// TEST/DEBUG
+#if _DEBUG
+ #include "algorithm/RadixSort.h"
+ #include "plotdisk/jobs/IOJob.h"
+ #include "io/FileStream.h"
+
+ ThreadPool* _dbgThreadPool = nullptr;
+
+ static void DbgPruneTableBuckets( CudaK32PlotContext& cx, const TableId rTable );
+ static void DbgPruneTable( CudaK32PlotContext& cx, const TableId rTable );
+#endif
+
+static void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext );
+static void CudaInit( CudaK32PlotContext& cx );
+
+void GenF1Cuda( CudaK32PlotContext& cx );
+
+static void MakePlot( CudaK32PlotContext& cx );
+static void FpTable( CudaK32PlotContext& cx );
+static void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket );
+static void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket );
+static void FinalizeTable7( CudaK32PlotContext& cx );
+static void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t stream );
+
+static void AllocBuffers( CudaK32PlotContext& cx );
+static void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+template<typename T>
+static void UploadBucketToGpu( CudaK32PlotContext& context, TableId table, const uint32* hostPtr, T* devPtr, uint64 bucket, uint64 stride );
+static void LoadAndSortBucket( CudaK32PlotContext& cx, const uint32 bucket );
+
+void CudaMatchBucketizedK32( CudaK32PlotContext& cx, const uint32* devY, cudaStream_t stream, cudaEvent_t event );
+
+// Defined in FxCuda.cu
+void GenFx( CudaK32PlotContext& cx, const uint32* devYIn, const uint32* devMetaIn, cudaStream_t stream );
+
+static const char* USAGE = "bladebit_cuda ... cudaplot <out_dir>\n"
+R"(
+GPU-based (CUDA) plotter
+
+[OPTIONS]:
+ -h, --help : Shows this help message and exits.
+ -d, --device : Select the CUDA device index. (default=0)
+)";
+
+///
+/// CLI
+///
+//-----------------------------------------------------------
+void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
+{
+ CudaK32PlotConfig& cfg = _cfg;
+ cfg.gCfg = &gCfg;
+
+ while( cli.HasArgs() )
+ {
+ if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
+ continue;
+ if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
+ continue;
+ if( cli.ArgMatch( "--help", "-h" ) )
+ {
+ Log::Line( USAGE );
+ exit( 0 );
+ }
+ else
+ break; // Let the caller handle it
+ }
+
+ // The rest should be output directories, parsed by the global config parser.
+}
+
+//-----------------------------------------------------------
+void CudaK32Plotter::Init()
+{
+ if( _cx )
+ return;
+
+ InitContext( _cfg, _cx );
+}
+
+//-----------------------------------------------------------
+void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
+{
+ auto& cx = *new CudaK32PlotContext{};
+ outContext = &cx;
+
+ cx.cfg = cfg;
+ cx.gCfg = cfg.gCfg;
+
+ Log::Line( "[Bladebit CUDA Plotter]" );
+ CudaInit( cx );
+
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) );
+
+ cudaEventCreateWithFlags( &cx.computeEventA, cudaEventDisableTiming );
+ cudaEventCreateWithFlags( &cx.computeEventB, cudaEventDisableTiming );
+ cudaEventCreateWithFlags( &cx.computeEventC, cudaEventDisableTiming );
+
+ for( int32 i = 0; i < BBCU_GPU_STREAM_COUNT; i++ )
+ {
+ cx.gpuDownloadStream[i] = new GpuQueue( GpuQueue::Downloader );
+ cx.gpuUploadStream [i] = new GpuQueue( GpuQueue::Uploader );
+ }
+
+ cx.threadPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
+
+ #if __linux__
+ cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+ #else
+ // #TODO: On Windows, check if we have enough memory; if so, default to true.
+ cx.downloadDirect = true ;//false;
+ #endif
+
+ // cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
+ // if( cx.gCfg->benchmarkMode )
+ // cx.plotWriter->EnableDummyMode();
+
+ cx.plotFence = new Fence();
+
+ cx.phase2 = new CudaK32Phase2{};
+ cx.phase3 = new CudaK32Phase3{};
+
+ // #TODO: Support non-warm starting
+ Log::Line( "Allocating buffers (this may take a few seconds)..." );
+ AllocBuffers( cx );
+ InitFSEBitMask( cx );
+}
+
+//-----------------------------------------------------------
+void CudaInit( CudaK32PlotContext& cx )
+{
+ ASSERT( cx.cudaDevice == -1 );
+
+ // CUDA init
+ int deviceCount = 0;
+ CudaFatalCheckMsg( cudaGetDeviceCount( &deviceCount ), "Failed to fetch CUDA devices." );
+ FatalIf( deviceCount < 1, "No CUDA-capable devices found." );
+ FatalIf( cx.cfg.deviceIndex >= deviceCount, "CUDA device %u is out of range out of %d CUDA devices",
+ cx.cfg.deviceIndex, deviceCount );
+
+ CudaFatalCheckMsg( cudaSetDevice( (int)cx.cfg.deviceIndex ), "Failed to set cuda device at index %u", cx.cfg.deviceIndex );
+ cx.cudaDevice = (int32)cx.cfg.deviceIndex;
+
+ cudaDeviceProp* cudaDevProps = new cudaDeviceProp{};
+ CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) );
+ cx.cudaDevProps = cudaDevProps;
+
+ Log::Line( "Selected cuda device %u : %s", cx.cudaDevice, cudaDevProps->name );
+
+ // Get info & limits
+ size_t stack = 0, memFree = 0, memTotal = 0;
+ cudaMemGetInfo( &memFree, &memTotal );
+ cudaDeviceGetLimit( &stack, cudaLimitStackSize );
+
+ Log::Line( " CUDA Compute Capability : %u.%u", cudaDevProps->major, cudaDevProps->minor );
+ Log::Line( " SM count : %d", cudaDevProps->multiProcessorCount );
+ Log::Line( " Max blocks per SM : %d", cudaDevProps->maxBlocksPerMultiProcessor );
+ Log::Line( " Max threads per SM : %d", cudaDevProps->maxThreadsPerMultiProcessor );
+ Log::Line( " Async Engine Count : %d", cudaDevProps->asyncEngineCount );
+ Log::Line( " L2 cache size : %.2lf MB", (double)cudaDevProps->l2CacheSize BtoMB );
+ Log::Line( " L2 persist cache max size : %.2lf MB", (double)cudaDevProps->persistingL2CacheMaxSize BtoMB );
+ Log::Line( " Stack Size : %.2lf KB", (double)stack BtoKB );
+ Log::Line( " Memory:" );
+ Log::Line( " Total : %.2lf GB", (double)memTotal BtoGB );
+ Log::Line( " Free : %.2lf GB", (double)memFree BtoGB );
+ Log::Line( "" );
+
+ // Ensure we have the correct capabilities
+ //int supportsCoopLaunch = 0;
+ //cudaDeviceGetAttribute( &supportsCoopLaunch, cudaDevAttrCooperativeLaunch, cx.cudaDevice );
+ //FatalIf( supportsCoopLaunch != 1, "This CUDA device does not support cooperative kernel launches." );
+}
+
+
+///
+/// Plotting entry point
+///
+//-----------------------------------------------------------
+void CudaK32Plotter::Run( const PlotRequest& req )
+{
+ SysHost::InstallCrashHandler();
+
+ // Initialize if needed
+ if( _cx == nullptr )
+ Init();
+
+ auto& cx = *_cx;
+ const auto& cfg = _cfg;
+
+ // Only start profiling from here (don't profile allocations)
+ CudaErrCheck( cudaProfilerStart() );
+
+ ASSERT( cx.plotWriter == nullptr );
+ cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
+ if( cx.gCfg->benchmarkMode )
+ cx.plotWriter->EnableDummyMode();
+
+ FatalIf( !cx.plotWriter->BeginPlot( cfg.gCfg->compressionLevel > 0 ? PlotVersion::v2_0 : PlotVersion::v1_0,
+ req.outDir, req.plotFileName, req.plotId, req.memo, req.memoSize, cfg.gCfg->compressionLevel ),
+ "Failed to open plot file with error: %d", cx.plotWriter->GetError() );
+
+ cx.plotRequest = req;
+ MakePlot( cx );
+
+ cx.plotWriter->EndPlot( true );
+
+ // #TODO: Ensure the last plot ended here for now
+ {
+ const auto plotCompleteTimer = TimerBegin();
+ cx.plotWriter->WaitForPlotToComplete();
+ const double plotIOTime = TimerEnd( plotCompleteTimer );
+ Log::Line( "Completed writing plot in %.2lf seconds", plotIOTime );
+
+ cx.plotWriter->DumpTables();
+ }
+ Log::Line( "" );
+
+ delete cx.plotWriter;
+ cx.plotWriter = nullptr;
+}
+
+//-----------------------------------------------------------
+void MakePlot( CudaK32PlotContext& cx )
+{
+ memset( cx.bucketCounts , 0, sizeof( cx.bucketCounts ) );
+ memset( cx.bucketSlices , 0, sizeof( cx.bucketSlices ) );
+ memset( cx.tableEntryCounts, 0, sizeof( cx.tableEntryCounts ) );
+
+ cx.table = TableId::Table1;
+ const auto plotTimer = TimerBegin();
+ const auto p1Timer = plotTimer;
+
+ #if BBCU_DBG_SKIP_PHASE_1
+ DbgLoadContextAndPairs( cx );
+ #else
+ // F1
+ Log::Line( "Generating F1" );
+ const auto timer = TimerBegin();
+ GenF1Cuda( cx );
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Finished F1 in %.2lf seconds.", elapsed );
+
+ // Time for FP
+ for( TableId table = TableId::Table2; table <= TableId::Table7; table++ )
+ {
+ cx.table = table;
+ cx.bucket = 0;
+ FpTable( cx );
+ }
+ const auto p1Elapsed = TimerEnd( p1Timer );
+ Log::Line( "Completed Phase 1 in %.2lf seconds", p1Elapsed );
+ #endif
+
+ // Prune
+ #if !BBCU_DBG_SKIP_PHASE_2
+ const auto p2Timer = TimerBegin();
+ CudaK32PlotPhase2( cx );
+ const auto p2Elapsed = TimerEnd( p2Timer );
+ Log::Line( "Completed Phase 2 in %.2lf seconds", p2Elapsed );
+ #endif
+
+ // Compress & write plot tables
+ const auto p3Timer = TimerBegin();
+ CudaK32PlotPhase3( cx );
+ const auto p3Elapsed = TimerEnd( p3Timer );
+ Log::Line( "Completed Phase 3 in %.2lf seconds", p3Elapsed );
+
+ auto plotElapsed = TimerEnd( plotTimer );
+ Log::Line( "Completed Plot 1 in %.2lf seconds ( %.2lf minutes )", plotElapsed, plotElapsed / 60.0 );
+ Log::Line( "" );
+}
+
+//-----------------------------------------------------------
+void FpTable( CudaK32PlotContext& cx )
+{
+ memset( &cx.timings, 0, sizeof( cx.timings ) );
+ const TableId inTable = cx.table - 1;
+
+ cx.prevTablePairOffset = 0;
+
+ // Clear slice counts
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ // Load initial buckets
+ UploadBucketForTable( cx, 0 );
+
+ const auto timer = TimerBegin();
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ FpTableBucket( cx, bucket );
+ }
+
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+
+ // Copy bucket slices to host
+ cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, cx.gpuDownloadStream[0]->GetStream() );
+ CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) );
+
+ // #TODO: Don't do this copy and instead just use the hostBucketSlices one
+ const uint32 outIdx = CudaK32PlotGetOutputIndex( cx );
+ memcpy( &cx.bucketSlices[outIdx], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+
+ // #TODO: Do this on the GPU and simply copy it over
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ for( uint32 j = 0; j < BBCU_BUCKET_COUNT; j++ )
+ cx.bucketCounts[(int)cx.table][i] += cx.bucketSlices[outIdx][j][i];
+
+ cx.tableEntryCounts[(int)cx.table] = 0;
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ cx.tableEntryCounts[(int)cx.table] += cx.bucketCounts[(int)cx.table][i];
+
+ // Cap entry counts to 2^k
+ if( cx.tableEntryCounts[(int)cx.table] > BBCU_TABLE_ENTRY_COUNT )
+ {
+ const uint32 overflow = (uint32)( cx.tableEntryCounts[(int)cx.table] - BBCU_TABLE_ENTRY_COUNT );
+ cx.tableEntryCounts[(int)cx.table] = BBCU_TABLE_ENTRY_COUNT;
+ cx.bucketCounts[(int)cx.table][BBCU_BUCKET_COUNT-1] -= overflow;
+ }
+
+ cx.yOut.WaitForCompletion();
+ cx.yOut.Reset();
+
+ cx.xPairsOut.WaitForCompletion();
+ cx.xPairsOut.Reset();
+
+ cx.xPairsIn.Reset();
+
+ cx.pairsLOut.WaitForCompletion();
+ cx.pairsLOut.Reset();
+ cx.pairsROut.WaitForCompletion();
+ cx.pairsROut.Reset();
+
+ // #NOTE: Must do this to ensure the buffers are
+ // free for the next round, which uses the same underlying buffers
+ // but a different downloader object.
+ cx.sortedXPairsOut.WaitForCompletion();
+ cx.sortedXPairsOut.Reset();
+
+ cx.sortedPairsLOut.WaitForCompletion();//cx.sortedPairsLOut.WaitForCopyCompletion();
+ cx.sortedPairsLOut.Reset();
+ cx.sortedPairsROut.WaitForCompletion();//cx.sortedPairsROut.WaitForCopyCompletion();
+ cx.sortedPairsROut.Reset();
+
+
+ if( cx.table < TableId::Table7 )
+ {
+ cx.metaOut.WaitForCompletion(); cx.metaOut.Reset();
+ }
+
+ cx.yIn .Reset();
+ cx.pairsLIn.Reset();
+ cx.pairsRIn.Reset();
+ cx.metaIn .Reset();
+
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Table %u completed in %.2lf seconds with %llu entries.",
+ (uint32)cx.table+1, elapsed, cx.tableEntryCounts[(int)cx.table] );
+
+ #if DBG_BBCU_P1_WRITE_PAIRS
+ // Write them sorted, so have to wait until table 3 completes
+ if( cx.table > TableId::Table2 )
+ DbgWritePairs( cx, cx.table - 1 );
+ #endif
+
+ if( cx.table == TableId::Table7 )
+ {
+ FinalizeTable7( cx );
+
+ #if DBG_BBCU_P1_WRITE_PAIRS
+ DbgWritePairs( cx, TableId::Table7 );
+ #endif
+
+ #if DBG_BBCU_P1_WRITE_CONTEXT
+ DbgWriteContext( cx );
+ #endif
+ }
+}
+
+//-----------------------------------------------------------
+void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
+{
+ cx.bucket = bucket;
+
+ // Load next bucket in the background
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ UploadBucketForTable( cx, bucket + 1 );
+
+ const TableId inTable = cx.table - 1;
+ const uint32 entryCount = cx.bucketCounts[(int)inTable][bucket];
+
+ // #NOTE: Ensure these match the ones in UploadBucketForTable()
+ cudaStream_t mainStream = cx.computeStream;
+ cudaStream_t metaStream = cx.computeStream;//B;
+ cudaStream_t pairsStream = cx.computeStream;//C;
+
+ uint32* sortKeyIn = (uint32*)cx.devMatches;
+ uint32* sortKeyOut = cx.devSortKey;
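+ // The sort key starts out as the identity permutation; after sorting (y, key) together,
+ // the key records where each original entry went, so pairs and metadata can later be
+ // permuted to match the sorted y's.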
+ if( cx.table > TableId::Table2 )
+ {
+ // Generate a sorting key
+ CudaK32PlotGenSortKey( entryCount, sortKeyIn, mainStream );
+ }
+
+ uint32* devYUnsorted = (uint32*)cx.yIn.GetUploadedDeviceBuffer( mainStream );
+ uint32* devMetaUnsorted = nullptr;
+
+ uint32* devYSorted = cx.devYWork;
+ uint32* devMetaSorted = cx.devMetaWork;
+
+ if( cx.table == TableId::Table2 )
+ {
+ devMetaUnsorted = (uint32*)cx.metaIn.GetUploadedDeviceBuffer( mainStream );
+ sortKeyIn = devMetaUnsorted;
+ sortKeyOut = devMetaSorted;
+ }
+
+ // Sort y w/ key
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs(
+ cx.devSortTmp, cx.devSortTmpAllocSize,
+ devYUnsorted, devYSorted,
+ sortKeyIn, sortKeyOut,
+ entryCount, 0, 32, mainStream ) );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventC, mainStream ) );
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, mainStream ) );
+
+ cx.yIn.ReleaseDeviceBuffer( mainStream );
+ if( cx.table == TableId::Table2 )
+ cx.metaIn.ReleaseDeviceBuffer( mainStream );
+
+ // Sort and download prev table's pairs
+ const bool isLTableInlineable = cx.table == TableId::Table2 || (uint32)cx.table <= cx.gCfg->numDroppedTables+1;
+
+ if( !isLTableInlineable )
+ {
+ CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventC ) ); // Ensure sort key is ready
+
+ const bool isLTableInlinedPairs = (uint32)cx.table == cx.gCfg->numDroppedTables + 2;
+
+ if( isLTableInlinedPairs )
+ {
+ // Table 2's pairs are inlined x's. Treat as Pairs
+ Pair* pairsIn = (Pair*)cx.xPairsIn.GetUploadedDeviceBuffer( pairsStream );
+ Pair* sortedPairs = (Pair*)cx.sortedXPairsOut.LockDeviceBuffer( pairsStream );
+
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsIn, sortedPairs, pairsStream );
+ cx.xPairsIn.ReleaseDeviceBuffer( pairsStream );
+
+ Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)cx.table-1].left) + cx.prevTablePairOffset;
+
+ // Write sorted pairs back to host
+ cx.sortedXPairsOut.DownloadT( hostPairs, entryCount, pairsStream, cx.downloadDirect );
+ }
+ else
+ {
+ uint32* hostPairsL, *hostPairsLFinal;
+ uint16* hostPairsR, *hostPairsRFinal;
+
+ // Wait for pairs to complete loading and sort on Y (or do this before match? Giving us time to write to disk while matching?)
+ uint32* pairsLIn = (uint32*)cx.pairsLIn .GetUploadedDeviceBuffer( pairsStream );
+ uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream );
+ cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
+ hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
+ hostPairsLFinal = cx.hostBackPointers[(int)cx.table-1].left + cx.prevTablePairOffset;
+
+ cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, cx.downloadDirect );
+ // cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream );
+
+ // if( !isOutputCompressed )
+ {
+ uint16* pairsRIn = (uint16*)cx.pairsRIn .GetUploadedDeviceBuffer( pairsStream );
+ uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream );
+ cx.pairsRIn.ReleaseDeviceBuffer( pairsStream );
+ hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
+ hostPairsRFinal = cx.hostBackPointers[(int)cx.table-1].right + cx.prevTablePairOffset;
+
+ cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, cx.downloadDirect );
+ // cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream );
+ }
+ }
+ }
+
+ // Match pairs
+ CudaMatchBucketizedK32( cx, devYSorted, mainStream, nullptr );
+
+ // Inline input x's or compressed x's
+ if( isLTableInlineable )
+ {
+ uint32* inlineInput = devMetaSorted;
+
+ if( cx.table > TableId::Table2 )
+ {
+ uint32* pairsLIn = (uint32*)cx.pairsLIn.GetUploadedDeviceBuffer( pairsStream );
+ inlineInput = cx.devXInlineInput;
+
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, inlineInput, pairsStream );
+ cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
+ }
+
+ // Inline x values into our new pairs (merge L table into R table)
+ InlineTable( cx, inlineInput, mainStream );
+ }
+
+ // Upload and sort metadata
+ if( cx.table > TableId::Table2 )
+ {
+ const uint32 metaMultiplier = GetTableMetaMultiplier( cx.table - 1 );
+
+ // Wait for meta to complete loading, and sort on Y
+ devMetaUnsorted = (uint32*)cx.metaIn.GetUploadedDeviceBuffer( metaStream );
+
+ // Ensure the sort key is ready
+ CudaErrCheck( cudaStreamWaitEvent( metaStream, cx.computeEventA ) );
+
+ switch( metaMultiplier )
+ {
+ case 2: CudaK32PlotSortByKey( entryCount, sortKeyOut, (K32Meta2*)devMetaUnsorted, (K32Meta2*)devMetaSorted, metaStream ); break;
+ case 3: CudaK32PlotSortByKey( entryCount, sortKeyOut, (K32Meta3*)devMetaUnsorted, (K32Meta3*)devMetaSorted, metaStream ); break;
+ case 4: CudaK32PlotSortByKey( entryCount, sortKeyOut, (K32Meta4*)devMetaUnsorted, (K32Meta4*)devMetaSorted, metaStream ); break;
+ default: ASSERT( 0 ); break;
+ }
+ cx.metaIn.ReleaseDeviceBuffer( metaStream );
+ CudaErrCheck( cudaEventRecord( cx.computeEventB, metaStream ) );
+ }
+
+ // Ensure metadata is sorted
+ CudaErrCheck( cudaStreamWaitEvent( mainStream, cx.computeEventB ) );
+
+ // Compute Fx
+ GenFx( cx, devYSorted, devMetaSorted, mainStream );
+
+ CudaK32PlotDownloadBucket( cx );
+
+ cx.prevTablePairOffset += entryCount;
+}
+
+//-----------------------------------------------------------
+void FinalizeTable7( CudaK32PlotContext& cx )
+{
+ Log::Line( "Finalizing Table 7" );
+
+ const auto timer = TimerBegin();
+
+ cx.table = TableId::Table7+1; // Set a false table
+ cx.prevTablePairOffset = 0;
+
+ // Upload initial bucket
+ UploadBucketForTable( cx, 0 );
+
+
+ // Prepare C1 & 2 tables
+ const uint32 c1Interval = kCheckpoint1Interval;
+ const uint32 c2Interval = kCheckpoint1Interval * kCheckpoint2Interval;
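+ // C1 stores every c1Interval-th f7 (the first f7 of each C3 park) and C2 stores every
+ // c2Interval-th f7, giving a two-level index over the C3 parks.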
+
+ const uint64 tableLength = cx.tableEntryCounts[(int)TableId::Table7];
+ const uint32 c1TotalEntries = (uint32)CDiv( tableLength, (int)c1Interval ) + 1; // +1 because chiapos adds an extra '0' entry at the end
+ const uint32 c2TotalEntries = (uint32)CDiv( tableLength, (int)c2Interval ) + 1; // +1 because we add a short-circuit entry to prevent C2 lookup overflows
+
+ const size_t c1TableSizeBytes = c1TotalEntries * sizeof( uint32 );
+ const size_t c2TableSizeBytes = c2TotalEntries * sizeof( uint32 );
+
+
+ // Prepare host allocations
+ constexpr size_t c3ParkSize = CalculateC3Size();
+
+ const uint64 totalParkSize = CDivT( tableLength, (uint64)kCheckpoint1Interval ) * c3ParkSize;
+
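+ // Table 7 produces no metadata, so the host meta buffer is free at this point and is
+ // reused here as scratch for the C tables and compressed parks.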
+ StackAllocator hostAlloc( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 );
+ uint32* hostC1Buffer = hostAlloc.CAlloc<uint32>( c1TotalEntries );
+ uint32* hostC2Buffer = hostAlloc.CAlloc<uint32>( c2TotalEntries );
+ uint32* hostLastParkEntries = hostAlloc.CAlloc<uint32>( kCheckpoint1Interval );
+ byte* hostLastParkBuffer = (byte*)hostAlloc.CAlloc<uint32>( kCheckpoint1Interval );
+ byte* hostCompressedParks = hostAlloc.AllocT<byte>( totalParkSize );
+
+ byte* hostParkWriter = hostCompressedParks;
+ uint32* hostC1Writer = hostC1Buffer;
+
+ // Prepare device allocations
+ constexpr size_t devAllocatorSize = BBCU_BUCKET_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER * sizeof( uint32 );
+ StackAllocator devAlloc( cx.devMetaWork, devAllocatorSize );
+
+ constexpr uint32 maxParksPerBucket = CuCDiv( BBCU_BUCKET_ENTRY_COUNT, kCheckpoint1Interval ) + 1;
+ static_assert( maxParksPerBucket * c3ParkSize < devAllocatorSize );
+
+ uint32* devC1Buffer = devAlloc.CAlloc<uint32>( c1TotalEntries );
+ uint32* devC1Writer = devC1Buffer;
+
+ const size_t parkBufferSize = kCheckpoint1Interval * sizeof( uint32 );
+
+ GpuDownloadBuffer& parkDownloader = cx.metaOut;
+
+ cudaStream_t mainStream = cx.computeStream;
+ cudaStream_t metaStream = cx.computeStream;//B;
+ cudaStream_t pairsStream = cx.computeStream;//C;
+ cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream();
+
+ // Load CTable
+ FSE_CTable* devCTable = devAlloc.AllocT<FSE_CTable>( sizeof( CTable_C3 ), sizeof( uint64 ) );
+ CudaErrCheck( cudaMemcpyAsync( devCTable, CTable_C3, sizeof( CTable_C3 ), cudaMemcpyHostToDevice, cx.computeStream ) );
+
+
+ // Prepare plot tables
+ cx.plotWriter->ReserveTableSize( PlotTable::C1, c1TableSizeBytes );
+ cx.plotWriter->ReserveTableSize( PlotTable::C2, c2TableSizeBytes );
+ cx.plotWriter->BeginTable( PlotTable::C3 );
+
+ // Reserve space before the start of the f7 buffer so entries retained from the previous bucket can be prepended for the next park.
+ uint32 retainedC3EntryCount = 0;
+ uint32* devYSorted = cx.devYWork + kCheckpoint1Interval;
+
+
+ uint32* sortKeyIn = (uint32*)cx.devMatches;
+ uint32* sortKeyOut = cx.devSortKey;
+
+ // Compress parks
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ // Upload next bucket
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ UploadBucketForTable( cx, bucket+1 );
+
+ const uint32 entryCount = cx.bucketCounts[(int)TableId::Table7][bucket];
+ ASSERT( entryCount > kCheckpoint1Interval );
+
+
+ // Generate a sorting key
+ CudaK32PlotGenSortKey( entryCount, sortKeyIn, mainStream );
+
+ // Sort y w/ key
+ uint32* devYUnsorted = (uint32*)cx.yIn.GetUploadedDeviceBuffer( mainStream );
+
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs(
+ cx.devSortTmp, cx.devSortTmpAllocSize,
+ devYUnsorted, devYSorted,
+ sortKeyIn, sortKeyOut,
+ entryCount, 0, 32, mainStream ) );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, mainStream ) );
+ cx.yIn.ReleaseDeviceBuffer( mainStream ); devYUnsorted = nullptr;
+
+ // Sort pairs
+ {
+ CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventA ) ); // Wait for the sort key to be ready
+
+ uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream );
+ uint32* pairsLIn = (uint32*)cx.pairsLIn.GetUploadedDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream );
+ cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
+
+ uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream );
+ uint16* pairsRIn = (uint16*)cx.pairsRIn.GetUploadedDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream );
+ cx.pairsRIn.ReleaseDeviceBuffer( pairsStream );
+
+
+ // Download sorted pairs back to host
+ // uint32* hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
+ // uint16* hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
+ uint32* hostPairsLFinal = cx.hostBackPointers[(int)TableId::Table7].left + cx.prevTablePairOffset;
+ uint16* hostPairsRFinal = cx.hostBackPointers[(int)TableId::Table7].right + cx.prevTablePairOffset;
+
+ // cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream );
+ // cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream );
+ cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, true );
+ cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, true );
+
+ cx.prevTablePairOffset += entryCount;
+ }
+
+
+ // If we previously had entries retained, adjust our buffer and counts accordingly
+ uint32* devF7Entries = devYSorted - retainedC3EntryCount;
+ uint32 f7EntryCount = entryCount + retainedC3EntryCount;
+
+ const uint32 parkCount = f7EntryCount / kCheckpoint1Interval;
+
+ // Copy C1 entries
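+ // (A strided 2D copy with a 4-byte row width and a source pitch of c1Interval entries grabs just the first f7 of every park.)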
+ CudaErrCheck( cudaMemcpy2DAsync( devC1Writer, sizeof( uint32 ), devF7Entries, sizeof( uint32 ) * c1Interval,
+ sizeof( uint32 ), parkCount, cudaMemcpyDeviceToDevice, mainStream ) );
+ devC1Writer += parkCount;
+
+ // Compress C tables
+ // This action mutates the f7 buffer in-place, so ensure the C1 copies happen before this call
+ byte* devParkBuffer = (byte*)parkDownloader.LockDeviceBuffer( mainStream );
+ CompressC3ParksInGPU( parkCount, devF7Entries, devParkBuffer, c3ParkSize, devCTable, mainStream );
+
+ // Retain any new f7 entries for the next bucket, if needed
+ retainedC3EntryCount = f7EntryCount - (parkCount * kCheckpoint1Interval);
+ if( retainedC3EntryCount > 0 )
+ {
+ // Last bucket?
+ const bool isLastBucket = bucket + 1 == BBCU_BUCKET_COUNT;
+
+ const uint32 compressedEntryCount = parkCount * kCheckpoint1Interval;
+ const uint32* copySource = devF7Entries + compressedEntryCount;
+ const size_t copySize = sizeof( uint32 ) * retainedC3EntryCount;
+
+ if( !isLastBucket )
+ {
+ // Not the last bucket, so retain entries for the next GPU compression bucket
+ CudaErrCheck( cudaMemcpyAsync( devYSorted - retainedC3EntryCount, copySource, copySize,
+ cudaMemcpyDeviceToDevice, mainStream ) );
+ }
+ else
+ {
+ // No more buckets so we have to compress this last park on the CPU
+ CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+ }
+ }
+
+ // Download compressed parks to host
+ const size_t parkDownloadSize = c3ParkSize * parkCount;
+ parkDownloader.DownloadWithCallback( hostParkWriter, parkDownloadSize,
+ []( void* parksBuffer, size_t size, void* userData ) {
+
+ auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+ cx.plotWriter->WriteTableData( parksBuffer, size );
+ }, &cx, mainStream );
+ hostParkWriter += parkDownloadSize;
+ }
+
+ // Download c1 entries
+ const size_t devC1EntryCount = (size_t)(uintptr_t)(devC1Writer - devC1Buffer);
+ CudaErrCheck( cudaMemcpyAsync( hostC1Buffer, devC1Buffer, sizeof( uint32 ) * devC1EntryCount, cudaMemcpyDeviceToHost, downloadStream ) );
+ hostC1Writer += devC1EntryCount;
+
+ // Wait for parks to finish downloading
+ parkDownloader.WaitForCompletion();
+ parkDownloader.Reset();
+
+ // Was there a left-over park?
+ if( retainedC3EntryCount > 0 )
+ {
+ // Copy c1 entry
+ *hostC1Writer++ = hostLastParkEntries[0];
+ ASSERT( hostC1Writer - hostC1Buffer == c1TotalEntries - 1 );
+
+ // Serialize the trailing park and submit it to the plot
+ if( retainedC3EntryCount > 1 )
+ {
+ TableWriter::WriteC3Park( retainedC3EntryCount - 1, hostLastParkEntries, hostLastParkBuffer );
+ cx.plotWriter->WriteTableData( hostLastParkBuffer, c3ParkSize );
+ }
+ }
+
+ // Write final empty C entries
+ hostC1Buffer[c1TotalEntries-1] = 0;
+ hostC2Buffer[c2TotalEntries-1] = 0;
+
+ // Byte-swap C1
+ for( uint32 i = 0; i < c1TotalEntries-1; i++ )
+ hostC1Buffer[i] = Swap32( hostC1Buffer[i] );
+
+ // Calculate C2 entries
+ for( uint32 i = 0; i < c2TotalEntries-1; i++ )
+ {
+ ASSERT( i * kCheckpoint2Interval < c1TotalEntries - 1 );
+ hostC2Buffer[i] = hostC1Buffer[i * kCheckpoint2Interval];
+ }
+
+ // End C3 table & write C1 & C2 tables
+ cx.plotWriter->EndTable();
+ cx.plotWriter->WriteReservedTable( PlotTable::C1, hostC1Buffer );
+ cx.plotWriter->WriteReservedTable( PlotTable::C2, hostC2Buffer );
+ cx.plotWriter->SignalFence( *cx.plotFence ); // Signal the fence for the start of Phase 3 when we have to use our tmp2 host buffer again
+
+
+ // Cleanup
+ // cx.sortedPairsLOut.WaitForCopyCompletion();
+ // cx.sortedPairsROut.WaitForCopyCompletion();
+ cx.sortedPairsLOut.WaitForCompletion();
+ cx.sortedPairsROut.WaitForCompletion();
+ cx.sortedPairsLOut.Reset();
+ cx.sortedPairsROut.Reset();
+
+ cx.prevTablePairOffset = 0;
+
+ auto elapsed = TimerEnd( timer );
+ Log::Line( "Finalized Table 7 in %.2lf seconds.", elapsed );
+}
+
+//-----------------------------------------------------------
+__global__ void CudaInlineTable( const uint32* entryCount, const uint32* inX, const Pair* matches, Pair* inlinedPairs, uint32 entryBits = 0 )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if( gid >= *entryCount )
+ return;
+
+ const Pair pair = matches[gid];
+
+ Pair inlined;
+ inlined.left = inX[pair.left ];
+ inlined.right = inX[pair.right];
+
+ CUDA_ASSERT( inlined.left || inlined.right );
+
+ inlinedPairs[gid] = inlined;
+}
+
+//-----------------------------------------------------------
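+// For compressed plots the dropped table's x values are bit-reduced and either packed
+// side-by-side into a single 32-bit word, or converted to a (truncated) line point,
+// depending on the UseLP template parameter.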
+template<bool UseLP>
+__global__ void CudaCompressTable( const uint32* entryCount, const uint32* inLEntries, const Pair* matches, uint32* outREntries, const uint32 bitShift )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if( gid >= *entryCount )
+ return;
+
+ const Pair pair = matches[gid];
+
+ const uint32 x0 = inLEntries[pair.left ];
+ const uint32 x1 = inLEntries[pair.right];
+
+ // Convert to linepoint
+ if constexpr ( UseLP )
+ outREntries[gid] = (uint32)CudaSquareToLinePoint64( x1 >> bitShift, x0 >> bitShift );
+ else
+ outREntries[gid] = ((x1 >> bitShift) << (32-bitShift) ) | (x0 >> bitShift);
+}
+
+//-----------------------------------------------------------
+void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t stream )
+{
+ static_assert( alignof( Pair ) == sizeof( uint32 ) );
+
+ const bool isCompressedInput = cx.gCfg->compressionLevel > 0 && (uint32)cx.table <= cx.gCfg->numDroppedTables;
+
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)kthreads );
+
+ if( isCompressedInput )
+ {
+ const bool isFinalTable = cx.table == TableId::Table1 + (TableId)cx.gCfg->numDroppedTables;
+ const uint32 bitShift = ( isFinalTable && cx.gCfg->numDroppedTables > 1 ) ? 0 : BBCU_K - cx.gCfg->compressedEntryBits;
+
+ if( isFinalTable )
+ CudaCompressTable<true><<<kblocks, kthreads, 0, stream>>>( cx.devMatchCount, devInX, cx.devMatches, cx.devCompressedXs, bitShift );
+ else
+ CudaCompressTable<false><<<kblocks, kthreads, 0, stream>>>( cx.devMatchCount, devInX, cx.devMatches, cx.devCompressedXs, bitShift );
+ }
+ else
+ {
+ CudaInlineTable<<<kblocks, kthreads, 0, stream>>>( cx.devMatchCount, devInX, cx.devMatches, cx.devInlinedXs );
+ }
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
+{
+ const bool writeVertical = CudaK32PlotIsOutputInterleaved( cx );
+ const size_t metaMultiplier = GetTableMetaMultiplier( cx.table );
+
+ const bool downloadCompressed = cx.table > TableId::Table1 && (uint32)cx.table <= cx.gCfg->numDroppedTables;
+ const bool downloadInlinedPairs = !downloadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+1;
+
+ uint32* hostY = cx.hostY;
+ uint32* hostMeta = cx.hostMeta;
+
+ uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
+ uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+ Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left;
+
+ const size_t startOffset = cx.bucket * ( writeVertical ? BBCU_MAX_SLICE_ENTRY_COUNT : BBCU_BUCKET_ALLOC_ENTRY_COUNT ); // vertical: offset to starting col. horizontal: to starting row
+ const size_t width = BBCU_MAX_SLICE_ENTRY_COUNT;
+ const size_t height = BBCU_BUCKET_COUNT;
+ const size_t dstStride = writeVertical ? BBCU_BUCKET_ALLOC_ENTRY_COUNT : BBCU_MAX_SLICE_ENTRY_COUNT;
+ const size_t srcStride = BBCU_MAX_SLICE_ENTRY_COUNT;
+
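+ // When writing vertically, each of this bucket's slices goes to a different destination
+ // bucket's row at column offset bucket * BBCU_MAX_SLICE_ENTRY_COUNT; when writing
+ // horizontally, the slices are packed contiguously into this bucket's own row.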
+ cx.yOut.Download2DT( hostY + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+
+ // Metadata
+ if( metaMultiplier > 0 )
+ {
+ const size_t metaSizeMultiplier = metaMultiplier == 3 ? 4 : metaMultiplier;
+ const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
+
+ const size_t metaSrcStride = srcStride * metaSize;
+ const size_t metaDstStride = dstStride * sizeof( K32Meta4 );
+ const size_t metaWidth = width * metaSize;
+ uint32* meta = hostMeta + startOffset * 4;
+
+ cx.metaOut.Download2D( meta, metaWidth, height, metaDstStride, metaSrcStride, cx.computeStream );
+ }
+
+ if( cx.table > TableId::Table1 )
+ {
+ if( downloadInlinedPairs )
+ {
+ cx.xPairsOut.Download2DT( t2HostPairs + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+ }
+ else
+ {
+ cx.pairsLOut.Download2DT( hostPairsL + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+
+ if( !downloadCompressed )
+ cx.pairsROut.Download2DT( hostPairsR + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+ }
+ }
+}
+
+//-----------------------------------------------------------
+void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
+{
+ const TableId rTable = cx.table;
+ const TableId inTable = rTable - 1;
+
+ uint32 metaMultiplier = GetTableMetaMultiplier( inTable );
+
+ const uint32 inIdx = CudaK32PlotGetInputIndex( cx );
+ const bool readVertical = CudaK32PlotIsOutputInterleaved( cx );
+
+ const uint32* hostY = cx.hostY;
+ const uint32* hostMeta = cx.hostMeta;
+ const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
+ const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+
+ const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
+ const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2;
+ const Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left; // Table 2 will use table 5, and overflow onto 6
+
+ uint32 stride = BBCU_BUCKET_ALLOC_ENTRY_COUNT; // Start as vertical
+ size_t offset = (size_t)bucket * BBCU_MAX_SLICE_ENTRY_COUNT;
+
+ if( !readVertical )
+ {
+ // Adjust to starting row
+ stride = BBCU_MAX_SLICE_ENTRY_COUNT;
+ offset = (size_t)bucket * BBCU_BUCKET_ALLOC_ENTRY_COUNT;
+ }
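+ // The upload gathers this bucket's BBCU_BUCKET_COUNT slices (one per source bucket) using
+ // the per-slice counts below; stride and offset select column-major or row-major access
+ // depending on how the previous table was written.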
+
+ cudaStream_t mainStream = cx.computeStream;
+ cudaStream_t metaStream = cx.computeStream;//B;
+ cudaStream_t pairsStream = cx.computeStream;//C;
+
+ const uint32* counts = &cx.bucketSlices[inIdx][0][bucket];
+
+ cx.yIn.UploadArrayT( hostY + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, cx.computeStream );
+
+ // Upload pairs, also
+ if( cx.table > TableId::Table2 )
+ {
+ if( uploadInlinedPairs )
+ {
+ cx.xPairsIn.UploadArrayT( t2HostPairs + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
+ }
+ else
+ {
+ cx.pairsLIn.UploadArrayT( hostPairsL + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
+
+ if( !uploadCompressed )
+ cx.pairsRIn.UploadArrayT( hostPairsR + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
+ }
+ }
+
+ // Meta
+ if( metaMultiplier > 0 )
+ {
+ const size_t metaSizeMultiplier = metaMultiplier == 3 ? 4 : metaMultiplier;
+ const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
+
+ auto actualMetaStream = inTable == TableId::Table1 ? cx.computeStream : metaStream;
+ cx.metaIn.UploadArray( hostMeta + offset * 4, BBCU_BUCKET_COUNT, metaSize, stride * sizeof( K32Meta4 ), BBCU_BUCKET_COUNT, counts, actualMetaStream );
+ }
+}
+
+
+///
+/// Allocations
+///
+//-----------------------------------------------------------
+void AllocBuffers( CudaK32PlotContext& cx )
+{
+ // Determine initially the largest required size
+
+ const size_t alignment = bbclamp<size_t>( SysHost::GetPageSize(), sizeof( K32Meta4 ), 4096 );
+ cx.allocAlignment = alignment;
+ cx.pinnedAllocSize = 0;
+ cx.hostTableAllocSize = 0;
+ cx.hostTempAllocSize = 0;
+ cx.devAllocSize = 0;
+
+ // Gather the size needed first
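+ // Dry-run each phase's allocations with dummy allocators to find the peak size per memory
+ // kind, then allocate once below and replay the allocations with real stack allocators.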
+ {
+ CudaK32AllocContext acx = {};
+
+ acx.alignment = alignment;
+ acx.dryRun = true;
+
+ DummyAllocator pinnedAllocator;
+ DummyAllocator hostTableAllocator;
+ DummyAllocator hostTempAllocator;
+ DummyAllocator devAllocator;
+
+ acx.pinnedAllocator = &pinnedAllocator;
+ acx.hostTableAllocator = &hostTableAllocator;
+ acx.hostTempAllocator = &hostTempAllocator;
+ acx.devAllocator = &devAllocator;
+
+ AllocateP1Buffers( cx, acx );
+
+ cx.pinnedAllocSize = pinnedAllocator .Size();
+ cx.hostTableAllocSize = hostTableAllocator.Size();
+ cx.hostTempAllocSize = hostTempAllocator .Size();
+ cx.devAllocSize = devAllocator .Size();
+
+ /// Phase 2
+ pinnedAllocator = {};
+ hostTableAllocator = {};
+ hostTempAllocator = {};
+ devAllocator = {};
+
+ CudaK32PlotPhase2AllocateBuffers( cx, acx );
+
+ cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
+ cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
+ cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
+ cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() );
+
+ /// Phase 3
+ pinnedAllocator = {};
+ hostTableAllocator = {};
+ hostTempAllocator = {};
+ devAllocator = {};
+
+ CudaK32PlotPhase3AllocateBuffers( cx, acx );
+
+ cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
+ cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
+ cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
+ cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() );
+ }
+
+ size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize;
+ size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize;
+ Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
+ (double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
+
+ Log::Line( "Intermediate RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.pinnedAllocSize,
+ (double)cx.pinnedAllocSize BtoMB, (double)cx.pinnedAllocSize BtoGB );
+
+ Log::Line( "Host RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.hostTableAllocSize,
+ (double)cx.hostTableAllocSize BtoMB, (double)cx.hostTableAllocSize BtoGB );
+
+ Log::Line( "Total Host RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalHostSize,
+ (double)totalHostSize BtoMB, (double)totalHostSize BtoGB );
+
+ Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
+ (double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );
+
+ Log::Line( "Allocating buffers" );
+ // Now actually allocate the buffers
+ CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );
+
+ #if _DEBUG
+ cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
+ #else
+ #if !_WIN32
+ // if( cx.downloadDirect )
+ CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
+ // else
+ // {
+ // // #TODO: On windows, first check if we have enough shared memory (512G)?
+ // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers.
+ #else
+ cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
+ #endif
+ // }
+ #endif
+
+ //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
+
+ cx.hostBufferTemp = nullptr;
+#if _DEBUG
+ cx.hostBufferTemp = bbvirtallocboundednuma<byte>( cx.hostTempAllocSize );
+#endif
+ if( cx.hostBufferTemp == nullptr )
+ CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) );
+
+ CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) );
+
+ // Warm start
+ if( true )
+ {
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer, cx.pinnedAllocSize );
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize );
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize );
+ }
+
+ {
+ CudaK32AllocContext acx = {};
+
+ acx.alignment = alignment;
+ acx.dryRun = false;
+
+ StackAllocator pinnedAllocator ( cx.pinnedBuffer , cx.pinnedAllocSize );
+ StackAllocator hostTableAllocator( cx.hostBufferTables, cx.hostTableAllocSize );
+ StackAllocator hostTempAllocator ( cx.hostBufferTemp , cx.hostTempAllocSize );
+ StackAllocator devAllocator ( cx.deviceBuffer , cx.devAllocSize );
+
+ acx.pinnedAllocator = &pinnedAllocator;
+ acx.hostTableAllocator = &hostTableAllocator;
+ acx.hostTempAllocator = &hostTempAllocator;
+ acx.devAllocator = &devAllocator;
+ AllocateP1Buffers( cx, acx );
+
+ pinnedAllocator .PopToMarker( 0 );
+ hostTableAllocator.PopToMarker( 0 );
+ hostTempAllocator .PopToMarker( 0 );
+ devAllocator .PopToMarker( 0 );
+ CudaK32PlotPhase2AllocateBuffers( cx, acx );
+
+ pinnedAllocator .PopToMarker( 0 );
+ hostTableAllocator.PopToMarker( 0 );
+ hostTempAllocator .PopToMarker( 0 );
+ devAllocator .PopToMarker( 0 );
+ CudaK32PlotPhase3AllocateBuffers( cx, acx );
+ }
+}
+
+//-----------------------------------------------------------
+void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ const size_t alignment = acx.alignment;
+
+ const bool isCompressed = cx.gCfg->compressionLevel > 0;
+
+ // #TODO: Re-optimize usage here again for windows running 256G
+ /// Host allocations
+ {
+ // Temp allocations are pinned host buffers that can be re-used for other purposes across phases.
+ // They are roughly equivalent to the temp2 dir during disk plotting.
+ cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
+
+ const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
+
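+ // Marking tables are bitfields that flag which entries survive pruning. The first slot is
+ // never needed, and compressed plots drop one more table, so its slot is skipped as well.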
+ cx.hostMarkingTables[0] = nullptr;
+ cx.hostMarkingTables[1] = isCompressed ? nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+
+
+ // #NOTE: The first table has its values inlined into the backpointers of the next table.
+ cx.hostBackPointers[0] = {};
+
+ const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
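+ // Compressed plots drop their first table(s), so backpointer storage starts at a later
+ // table. The first stored table holds inlined x pairs rather than L/R backpointers, which
+ // is why it is allocated as a single Pair buffer below.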
+
+ Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr };
+
+ for( TableId table = firstTable + 1; table <= TableId::Table7; table++ )
+ cx.hostBackPointers[(int)table] = { acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ), acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ) };
+
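+ // Convenience aliases into the last two tables' backpointer buffers; the sorted variants
+ // re-use the previous table's storage rather than getting separate allocations.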
+ cx.hostTableL = cx.hostBackPointers[6].left; // Also used for Table 7
+ cx.hostTableR = cx.hostBackPointers[6].right;
+ cx.hostTableSortedL = cx.hostBackPointers[5].left;
+ cx.hostTableSortedR = cx.hostBackPointers[5].right;
+ }
+
+ /// Device & Pinned allocations
+ {
+ // #NOTE: The R pairs are allocated as uint32 because, for table 2, we want to download
+ //        them as inlined x's, so we need 2 separate uint32 buffers.
+ cx.yOut    = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ cx.metaOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
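+ // Direct download buffers take no pinned allocator: they appear to download straight into
+ // their final host destination instead of staging through an intermediate pinned buffer.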
+
+ // These download buffers share the same backing buffers
+ {
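+ // xPairsOut overlaps pairsLOut/pairsROut: save the allocator watermarks, allocate one set,
+ // pop back to the marks, then allocate the other set over the same region.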
+ const size_t devMarker = acx.devAllocator->Size();
+ const size_t pinnedMarker = acx.pinnedAllocator->Size();
+
+ cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ cx.pairsROut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ acx.devAllocator->PopToMarker( devMarker );
+ acx.pinnedAllocator->PopToMarker( pinnedMarker );
+
+ // Allocate the Pair buffer last, to ensure the allocator's final offset covers the larger of the overlapping allocations
+ cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ }
+
+ // These download buffers share the same backing buffers
+ {
+ const size_t devMarker = acx.devAllocator->Size();
+ const size_t pinnedMarker = acx.pinnedAllocator->Size();
+
+ cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ acx.devAllocator->PopToMarker( devMarker );
+ acx.pinnedAllocator->PopToMarker( pinnedMarker );
+
+ // Allocate the Pair buffer last, to ensure the allocator's final offset covers the larger of the overlapping allocations
+ cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT