diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index 54a1aefd..94a25f7f 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -2,13 +2,5 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
deleted file mode 100644
index 8ae95aee..00000000
--- a/.vscode/c_cpp_properties.json
+++ /dev/null
@@ -1,43 +0,0 @@
-{
- "configurations": [
- {
- "name": "Win32",
- "includePath": [
- "${workspaceFolder}/**"
- ],
- "defines": [
- "_DEBUG",
- "UNICODE",
- "_UNICODE"
- ],
- "windowsSdkVersion": "10.0.19041.0",
- "compilerPath": "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\MSVC\\14.29.30133\\bin\\Hostx64\\x64\\cl.exe",
- "cStandard": "c17",
- "cppStandard": "c++17",
- "intelliSenseMode": "windows-msvc-x64",
- "configurationProvider": "ms-vscode.cmake-tools",
- "forcedInclude": [
- "src/pch.h"
- ]
- },
- {
- "name": "macOS",
- "includePath": [
- "${workspaceFolder}/**"
- ],
- "defines": [],
- "macFrameworkPath": [
- "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks"
- ],
- "compilerPath": "/usr/bin/clang",
- "cStandard": "c17",
- "cppStandard": "c++17",
- "intelliSenseMode": "macos-clang-arm64",
- "configurationProvider": "ms-vscode.cmake-tools",
- "forcedInclude": [
- "src/pch.h"
- ]
- }
- ],
- "version": 4
-}
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 6957af27..bb356736 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -131,19 +131,25 @@
"preLaunchTask" : "build_cuda_debug",
"program": "${workspaceFolder}/build/bladebit_cuda",
-
+
// "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
// "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
// "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
+
+ "args":
+ "-w -n 1 -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --check 100 --check-threshold 2 /home/harold/plot",
+
+ // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
+ // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
+ // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
+ "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
- "args":
- // "-w --compress 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
- "-w --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot",
"windows": {
"type": "cppvsdbg",
"program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
- "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ // "args": "--benchmark -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ "args": "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot -t2 D:/chia_test_plots D:/chia_test_plots",
}
},
@@ -236,7 +242,7 @@
{
"name" : "Tests",
-
+
"type" : "cppdbg",
"osx": {
"MIMode": "lldb",
@@ -245,7 +251,7 @@
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
"preLaunchTask" : "build_tests_debug",
- "console" : "internalConsole",
+ // "console" : "internalConsole",
"program": "${workspaceRoot}/build/tests",
@@ -260,6 +266,8 @@
// { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c06-2023-02-14-21-43-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
{ "name": "bb_clevel" , "value": "1" },
{ "name": "bb_end_clevel" , "value": "1" },
+
+ { "name": "bb_queue_path" , "value": "/home/ubuntu/plot" },
],
"args": [
@@ -273,7 +281,10 @@
// "line-point-deltas"
// "compressed-plot-proof"
// "compressed-plot-qualities"
- "macos-threads"
+ // "macos-threads"
+ // "disk-slices"
+ // "disk-buckets"
+ "[disk-queue]"
]
}
@@ -285,10 +296,16 @@
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
"preLaunchTask" : "build_debug",
- "console" : "internalConsole",
"program": "${workspaceFolder}/build/bladebit",
-
+ // "program": "${workspaceFolder}/build/bladebit_cuda",
+
+ "linux": {
+ "MIMode": "gdb",
+ "miDebuggerPath": "/usr/bin/gdb",
+ "program": "${workspaceFolder}/build/bladebit"
+ },
+
"windows": {
"type" : "cppvsdbg",
"program": "${workspaceFolder}/build/debug/bladebit.exe"
@@ -301,6 +318,11 @@
// "-t", "48",
// "-t", "1",
+ // "validate", "--f7", "2",
+ // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "/home/harold/plot/plot-k32-c01-2023-08-03-04-57-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+
// "-t", "1", "validate", "--f7", "324", "~/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "validate", "--f7", "7", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
// "validate", "--cuda", "--f7", "4", "~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
@@ -322,8 +344,8 @@
// // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-23-15-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
// Simulation
- "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
- "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
+ // "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "-t", "30", "simulate", "-p", "2", "-n", "600", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"
// "-m",
@@ -335,11 +357,18 @@
// "--f7", "3983284117", "/home/harito/plot/tmp/gpu_1.plot",
/// Compare
- // "plotcmp",
- // "/home/harito/plot/tmp/gpu_1.plot.old",
- // "/home/harold/plot-tmpfs/gpu_1.plot",
- // "/home/harito/plot/tmp/gpu_1.plot",
- // "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ "plotcmp",
+ "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ // "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+
+ // Check
+ // "check",
+ // "-n", "100", "--seed", "dc471c4d905ba3a65c6cecb46d97b132c0c98f51d416db5ec5cbdbe95ef2832f",
+ // "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/jm.plot"
]
},
diff --git a/.vscode/settings.json b/.vscode/settings.json
index c6c5274d..6c2da21b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,16 +4,16 @@
"nominmax"
],
"files.associations": {
+ "*.sd": "yaml",
+ "*.userprefs": "xml",
+ "*.make": "makefile",
"Fastfile": "ruby",
"*.plist": "xml",
- "*.sd": "yaml",
"*.json": "jsonc",
"*.ir": "llvm",
"*.qs": "javascript",
"*.ac": "shellscript",
"player": "json",
- "*.userprefs": "xml",
- "*.make": "makefile",
"memory": "cpp",
"cstddef": "cpp",
"string": "cpp",
@@ -113,7 +113,18 @@
"filesystem": "cpp",
"__bits": "cpp",
"csignal": "cpp",
- "cfenv": "cpp"
+ "cfenv": "cpp",
+ "ranges": "cpp",
+ "xhash": "cpp",
+ "xmemory": "cpp",
+ "xstddef": "cpp",
+ "xstring": "cpp",
+ "xtr1common": "cpp",
+ "xtree": "cpp",
+ "xutility": "cpp",
+ "__assert": "cpp",
+ "*.inc": "cpp",
+ "xiosbase": "cpp"
},
"cSpell.words": [
"Ryzen"
@@ -124,7 +135,13 @@
"cmake.preferredGenerators": [
"Unix Makefiles",
"Visual Studio 17 2022"
- ]
+ ],
+ // "cmake.buildArgs": [],
+ "cmake.configureSettings": {
+ "BB_ENABLE_TESTS": "ON",
+ "BB_CUDA_USE_NATIVE": "ON"
+ },
+ "C_Cpp.dimInactiveRegions": false,
// "cmake.generator": "Unix Makefiles"
// "cmake.generator": "Visual Studio 17 2022"
diff --git a/Bladebit.cmake b/Bladebit.cmake
index 6ce0ad97..ffd03d67 100644
--- a/Bladebit.cmake
+++ b/Bladebit.cmake
@@ -227,6 +227,8 @@ set(src_bladebit
src/plotting/PlotWriter.cpp
src/plotting/PlotWriter.h
src/plotting/Tables.h
+ src/plotting/BufferChain.h
+ src/plotting/BufferChain.cpp
src/plotting/f1/F1Gen.h
src/plotting/f1/F1Gen.cpp
@@ -258,6 +260,7 @@ set(src_bladebit
src/tools/PlotReader.cpp
src/tools/PlotReader.h
src/tools/PlotValidator.cpp
+ src/tools/PlotChecker.cpp
src/util/Array.h
src/util/Array.inl
@@ -289,6 +292,18 @@ set(src_bladebit
src/harvesting/GreenReaper.h
src/harvesting/GreenReaperInternal.h
src/harvesting/Thresher.h
+
+ src/plotting/DiskQueue.h
+ src/plotting/DiskQueue.cpp
+ src/plotting/DiskBuffer.h
+ src/plotting/DiskBuffer.cpp
+ src/plotting/DiskBucketBuffer.h
+ src/plotting/DiskBucketBuffer.cpp
+ src/plotting/DiskBufferBase.h
+ src/plotting/DiskBufferBase.cpp
+
+ src/util/MPMCQueue.h
+ src/util/CommandQueue.h
)
target_sources(bladebit_core PUBLIC ${src_bladebit})
diff --git a/BladebitCUDA.cmake b/BladebitCUDA.cmake
index 1fc668fa..8b140c2f 100644
--- a/BladebitCUDA.cmake
+++ b/BladebitCUDA.cmake
@@ -22,6 +22,9 @@ add_executable(bladebit_cuda
cuda/CudaPlotUtil.cu
cuda/GpuStreams.h
cuda/GpuStreams.cu
+ cuda/GpuDownloadStream.cu
+ cuda/GpuQueue.h
+ cuda/GpuQueue.cu
# Harvester
cuda/harvesting/CudaThresher.cu
@@ -42,7 +45,7 @@ target_compile_options(bladebit_cuda PRIVATE
>
$<${is_cuda_debug}:
- -G
+ # -G
>
)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56595d7c..8f72155c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CONFIGURATION_TYPES Release Debug)
@@ -9,7 +10,7 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release"
CACHE STRING "Possible values are: Release, Debug"
FORCE
- )
+ )
endif()
# Allows for CMAKE_MSVC_RUNTIME_LIBRARY
@@ -17,7 +18,7 @@ if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()
-set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
+set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" CACHE STRING "MSVC Runtime Library")
project(bladebit LANGUAGES C CXX ASM)
@@ -83,10 +84,10 @@ endif()
# NOTE: These are mostly sandbox test environment, not proper tests
option(BB_ENABLE_TESTS "Enable tests." OFF)
option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
-option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
+option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF)
option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)
-
+option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." OFF)
#
# Dependencies
@@ -103,7 +104,7 @@ if(NOT ${BB_HARVESTER_ONLY})
GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
GIT_TAG 2.0.2
EXCLUDE_FROM_ALL ${BB_IS_DEPENDENCY}
- )
+)
set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
set(BUILD_BLS_TESTS "0" CACHE STRING "")
@@ -130,6 +131,7 @@ set(is_x86 $,$,$>)
set(is_msvc_c_cpp $>)
+
if(CUDAToolkit_FOUND AND NOT ${NO_CUDA_HARVESTER})
set(have_cuda $)
else()
@@ -143,7 +145,7 @@ endif()
include(Config.cmake)
if(NOT ${BB_HARVESTER_ONLY})
- if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
+ if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION))
include(cmake_modules/EmbedVersion.cmake)
endif()
diff --git a/Config.cmake b/Config.cmake
index 4139b4a9..f3481d6b 100644
--- a/Config.cmake
+++ b/Config.cmake
@@ -1,6 +1,11 @@
# Base interface configuration project
add_library(bladebit_config INTERFACE)
+target_include_directories(bladebit_config INTERFACE
+ ${INCLUDE_DIRECTORIES}
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
target_compile_definitions(bladebit_config INTERFACE
$<${is_release}:
_NDEBUG=1
@@ -22,32 +27,34 @@ target_compile_definitions(bladebit_config INTERFACE
target_compile_options(bladebit_config INTERFACE
- # GCC or Clang
- $<$:
- -Wall
- -Wno-comment
- -Wno-unknown-pragmas
- -g
-
- $<${is_release}:
- -O3
+ $<${is_c_cpp}:
+ # GCC or Clang
+ $<$:
+ -Wall
+ -Wno-comment
+ -Wno-unknown-pragmas
+ -g
+
+ $<${is_release}:
+ -O3
+ >
+
+ $<${is_debug}:
+ -O0
+ >
>
- $<${is_debug}:
- -O0
+ # GCC
+ $<$:
+ -fmax-errors=5
>
- >
-
- # GCC
- $<$:
- -fmax-errors=5
- >
- # Clang
- $<$:
- -ferror-limit=5
- -fdeclspec
- -Wno-empty-body
+ # Clang
+ $<$:
+ -ferror-limit=5
+ -fdeclspec
+ -Wno-empty-body
+ >
>
# MSVC
@@ -129,43 +136,36 @@ cmake_policy(SET CMP0105 NEW)
set(cuda_archs
$<${is_cuda_release}:
-## Maxwell
- ## Tesla/Quadro M series
- -gencode=arch=compute_50,code=sm_50
- ## Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
- -gencode=arch=compute_52,code=sm_52
- ## Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
- -gencode=arch=compute_53,code=sm_53
-## Pascal
- ## GeForce 1000 series
- -gencode=arch=compute_60,code=sm_60
- ## GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
- -gencode=arch=compute_61,code=sm_61
- ## Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
- -gencode=arch=compute_62,code=sm_62
-## Volta
- ## GV100, Tesla V100, Titan V
- -gencode=arch=compute_70,code=sm_70
- ## Tesla V100
- -gencode=arch=compute_72,code=sm_72
- ## Turing
- -gencode=arch=compute_75,code=sm_75
-## Ampere
- ## NVIDIA A100, DGX-A100
- -gencode=arch=compute_80,code=sm_80
- ## GeForce RTX 3000 series, NVIDIA A100
- -gencode=arch=compute_86,code=sm_86
- ## Jetson Orin
- -gencode=arch=compute_87,code=sm_87
-## Lovelace
- ## NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
- -gencode=arch=compute_89,code=sm_89
- ## Future proofing
- -gencode=arch=compute_89,code=compute_89
-## Hopper
- ## NVIDIA H100 (GH100)
- # -gencode=arch=compute_90,code=sm_90
- # -gencode=arch=compute_90a,code=sm_90a
+ $<$:
+ -arch=native
+ >
+
+ $<$>:
+
+ # Maxwell
+ -gencode=arch=compute_50,code=sm_50 # Tesla/Quadro M series
+ -gencode=arch=compute_52,code=sm_52 # Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
+ -gencode=arch=compute_53,code=sm_53 # Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
+
+ # Pascal
+ -gencode=arch=compute_60,code=sm_60 # GeForce 1000 series
+ -gencode=arch=compute_61,code=sm_61 # GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
+ -gencode=arch=compute_62,code=sm_62 # Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
+
+ # Volta
+ -gencode=arch=compute_70,code=sm_70 # GV100, Tesla V100, Titan V
+ -gencode=arch=compute_72,code=sm_72 # Tesla V100
+ -gencode=arch=compute_75,code=sm_75 # Turing
+
+ # Ampere
+ -gencode=arch=compute_80,code=sm_80 # NVIDIA A100, DGX-A100
+ -gencode=arch=compute_86,code=sm_86 # GeForce RTX 3000 series, NVIDIA A100
+ -gencode=arch=compute_87,code=sm_87 # Jetson Orin
+
+ # Lovelace
+ -gencode=arch=compute_89,code=sm_89 # NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
+ -gencode=arch=compute_89,code=compute_89 # Future proofing
+ >
>
$<${is_cuda_debug}:
diff --git a/Harvester.cmake b/Harvester.cmake
index d853a2db..692daa80 100644
--- a/Harvester.cmake
+++ b/Harvester.cmake
@@ -1,5 +1,5 @@
if(NOT ${BB_HARVESTER_STATIC})
- add_library(bladebit_harvester SHARED)
+ add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp)
else()
add_library(bladebit_harvester STATIC)
endif()
@@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE
cuda/CudaF1.cu
cuda/CudaMatch.cu
cuda/CudaPlotUtil.cu
+ cuda/GpuQueue.cu
- # TODO: Remove this, ought not be needed in harvester
+ # TODO: Does this have to be here?
cuda/GpuStreams.cu
+ cuda/GpuDownloadStream.cu
+ src/plotting/DiskBuffer.cpp
+ src/plotting/DiskBucketBuffer.cpp
+ src/plotting/DiskBufferBase.cpp
+ src/plotting/DiskQueue.cpp
>
$<$:
@@ -159,7 +165,7 @@ if(CUDAToolkit_FOUND)
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
- # CUDA_ARCHITECTURES OFF
+ CUDA_ARCHITECTURES OFF
)
endif()
diff --git a/README.md b/README.md
index 9197014e..24d50f30 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,71 @@
-# BladeBit Chia Plotter
+# Bladebit Chia Plotter
[![Release Builds](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml/badge.svg?branch=master&event=push)](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml)
-A high-performance **k32-only**, Chia (XCH) plotter supporting in-RAM and disk-based plotting.
+A high-performance **k32-only**, Chia (XCH) plotter.
+
+Bladebit supports 3 plotting modes:
+ - Fully In-RAM (no drives required), CPU-based mode.
+ - GPU (CUDA-based) mode. Can run fully in-RAM or in disk-hybrid mode.
+ - Disk-based mode
+
+## Usage
+Run `bladebit --help` to see general help. For command-specific help, use `bladebit help <command>`.
+
+## Requirements
+
+**CUDA**
+
+An NVIDIA GPU is required for this mode. It is exposed via the `cudaplot` command in a separate executable, `bladebit_cuda`, and has mainly been tested on consumer cards from the **10xx** series and up.
+
+| Mode | OS | DRAM | VRAM | CUDA capability
+|--------------------------------|----------------|------|------|----------------
+| In-RAM | Linux, Windows | 256G | 8G | 5.2 and up
+| Disk-hybrid 128G | Linux, Windows | 128G | 8G | 5.2 and up
+| Disk-hybrid 16G (WIP) | Linux | 16G | 8G | 5.2 and up
+
+> *NOTE: 16G mode is currently a work in progress; at this stage it only works on Linux, and direct I/O is unavailable in this mode.*
+
+
+**CPU RAM-Only**
+
+Available on Linux, Windows and macOS.
+Requires at least **416G** of system DRAM.
+
+
+**Disk**
+
+Available on Linux, Windows and macOS.
+
+A minimum of **4 GiB of RAM** is required, with lower bucket counts requiring up to 12 GiB of RAM. Roughly **480 GiB of disk space** is required in the default mode, or around **390 GiB of disk space** with `--alternate` mode enabled.
+
+The exact amounts of RAM and disk space required may vary slightly depending on the system's page size and the target disk file system block size (block-alignment is required for direct I/O).
+
+SSDs are highly recommended for disk-based plotting.
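+
+As an illustrative sketch only (keys and paths are placeholders; see the Disk-Based usage section further down for the full option list):
+
+```bash
+# Default disk mode (roughly 480 GiB of temp space)
+./bladebit -f <farmer_public_key> -c <pool_contract_address> diskplot -t1 <temp_dir> <out_dir>
+
+# Alternate mode (roughly 390 GiB of temp space)
+./bladebit -f <farmer_public_key> -c <pool_contract_address> diskplot --alternate -t1 <temp_dir> <out_dir>
+```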
+
+
+## Compressed Plots
+
+Compressed plots are supported in CUDA mode and in RAM-only mode. CPU Disk-based mode does **NOT** currently support compressed plots.
+
+Compressed plots are currently supported for compression levels from **C1** to **C7**. Note that bladebit compression levels are not compatible with other plotters' compression levels. These compression levels are based on the *number of bits dropped from an entry, excluding the minimum bits required to fully drop a table*. At `k=32` the first table is fully excluded from the plot at 16 bits dropped.
+
+> *NOTE: Although higher compression levels are available, support for farming them has not yet been implemented, so they are currently disabled. Support will be added in the future.*
+
+Compression levels are currently roughly equivalent to the following plot sizes.
+
+| Compression Level | Plot Size
+|-------------------|-------------
+| C1 | 87.5 GiB
+| C2 | 86.0 GiB
+| C3 | 84.4 GiB
+| C4 | 82.8 GiB
+| C5 | 81.2 GiB
+| C6 | 79.6 GiB
+| C7 | 78.0 GiB
+
+These sizes may shrink in the future with further compression optimizations.
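+
+For example, a C5 plot (roughly 81.2 GiB per the table above) can be created by passing `-z 5`; the keys and output directory below are placeholders:
+
+```bash
+./bladebit_cuda -z 5 -f <farmer_public_key> -c <pool_contract_address> cudaplot <out_dir>
+```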
+
## Requirements
@@ -39,7 +102,7 @@ SSDs are highly recommended for disk-based plotting.
## Prerequisites
-Linux, Windows and MacOS (both intel and ARM (Apple Silicon)) are supported.
+Linux, Windows and macOS (both Intel and ARM) are supported.
### Linux
@@ -83,8 +146,12 @@ cmake --build . --target bladebit --config Release
The resulting binary will be found under the `build/` directory.
On Windows it will be under `build/Release/`.
+For **bladebit_cuda**, the CUDA toolkit must be installed. The target name is `bladebit_cuda`.
+
+For simplicity, the `build.sh` or `build-cuda.sh` scripts can be used to build. On Windows this requires Git Bash or a similar bash-based shell to run.
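+
+A minimal sketch of the equivalent manual steps for the CUDA target (the same steps `build-cuda.sh` performs):
+
+```bash
+mkdir -p build-release && cd build-release
+cmake .. -DCMAKE_BUILD_TYPE=Release
+cmake --build . --target bladebit_cuda --config Release
+```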
+
## Usage
-Run **bladebit** with the `-h` for complete usage and command line options:
+Run **bladebit** (or **bladebit_cuda**) with the `-h` flag for complete usage and command-line options:
```bash
# Linux & macOS
@@ -93,18 +160,33 @@ build/bladebit -h
# Windows
build/Release/bladebit.exe -h
```
+The bladebit CLI uses the format `bladebit <GLOBAL_OPTIONS> <command> <COMMAND_OPTIONS>`.
-
-The bladebit CLI uses the format `bladebit `.
-
-Use the aforementioned `-h` parameter to get the full list of sub-commands and `GLOBAL_OPTIONS`.
-The `sub_command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub command with the desired command as the parameter:
+Use the aforementioned `-h` parameter to get the full list of commands and `GLOBAL_OPTIONS`.
+The `command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub-command with the desired command as the parameter:
```bash
+bladebit help cudaplot
bladebit help ramplot
bladebit help diskplot
```
+### CUDA
+Basic `cudaplot` usage:
+```bash
+# OG plots
+./bladebit_cuda -f <farmer_public_key> -p <pool_public_key> cudaplot <out_dir>
+
+# Portable plots
+./bladebit_cuda -f <farmer_public_key> -c <pool_contract_address> cudaplot <out_dir>
+
+# Compressed plots
+./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> cudaplot <out_dir>
+
+# 128G disk-hybrid mode
+./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> cudaplot --disk-128 -t1 <temp_dir> <out_dir>
+```
+
### In-RAM
Basic `ramplot` usage:
```bash
@@ -113,6 +195,9 @@ Basic `ramplot` usage:
# Portable plots
./bladebit -f <farmer_public_key> -c <pool_contract_address> ramplot <out_dir>
+
+# Compressed plots
+./bladebit -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> ramplot <out_dir>
```
### Disk-Based
diff --git a/Tests.cmake b/Tests.cmake
index 577e541c..aaba51df 100644
--- a/Tests.cmake
+++ b/Tests.cmake
@@ -1,10 +1,15 @@
include(cmake_modules/FindCatch2.cmake)
-add_executable(tests ${src_bladebit})
+add_executable(tests ${src_bladebit}
+ cuda/harvesting/CudaThresherDummy.cpp
+ tests/TestUtil.h
+ tests/TestDiskQueue.cpp
+)
+
target_compile_definitions(tests PRIVATE
BB_TEST_MODE=1
)
-target_link_libraries(tests PRIVATE bladebit_config Catch2::Catch2WithMain)
+target_link_libraries(tests PRIVATE bladebit_config bladebit_core Catch2::Catch2WithMain)
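+
+# Example of building and running the new disk-queue tests (a sketch; assumes a ./build
+# output directory and uses Catch2's tag filter syntax):
+#   cmake --build build --target tests
+#   ./build/tests "[disk-queue]"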
set_target_properties(tests PROPERTIES
EXCLUDE_FROM_ALL ON
diff --git a/VERSION b/VERSION
index 4a36342f..0c6173b5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1,2 @@
-3.0.0
+3.1.0
+
diff --git a/build-cuda.sh b/build-cuda.sh
new file mode 100755
index 00000000..d7a10154
--- /dev/null
+++ b/build-cuda.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -e
+_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd $_dir
+
+build_dir=build-release
+mkdir -p ${build_dir}
+cd ${build_dir}
+
+cmake .. -DCMAKE_BUILD_TYPE=Release
+cmake --build . --target bladebit_cuda --config Release --clean-first -j24
diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index 6ec042c0..1c346632 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -2,18 +2,25 @@
if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
message("Embedding local build version")
- set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
-
- set(cmd_ver bash)
+ set(cmd_shell bash)
+ set(cmd_ext sh)
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
- set(cmd_ver bash.exe)
+
+ find_program(bash_path NAMES bash.exe NO_CACHE)
+
+ if(${bash_path} MATCHES "-NOTFOUND")
+ set(cmd_shell powershell)
+ set(cmd_ext ps1)
+ else()
+ set(cmd_shell "${bash_path}")
+ endif()
endif()
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
# Remove trailing whitespace incurred in windows gitbash
string(STRIP "${bb_ver_maj}" bb_ver_maj)
@@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI})
add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
endif()
+
+set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
\ No newline at end of file
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
index 80721e9f..a9afd81f 100644
--- a/cuda/CudaPlotConfig.h
+++ b/cuda/CudaPlotConfig.h
@@ -19,7 +19,7 @@
#define BBCU_TABLE_ENTRY_COUNT (1ull<<32)
#define BBCU_BUCKET_ENTRY_COUNT (BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT)
//#define BBCU_XTRA_ENTRIES_PER_SLICE (1024u*64u)
-#define BBCU_XTRA_ENTRIES_PER_SLICE (4096u*1u)
+#define BBCU_XTRA_ENTRIES_PER_SLICE (4096+1024)
#define BBCU_MAX_SLICE_ENTRY_COUNT ((BBCU_BUCKET_ENTRY_COUNT/BBCU_BUCKET_COUNT)+BBCU_XTRA_ENTRIES_PER_SLICE)
#define BBCU_BUCKET_ALLOC_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_BUCKET_COUNT)
#define BBCU_TABLE_ALLOC_ENTRY_COUNT (((uint64)BBCU_BUCKET_ALLOC_ENTRY_COUNT)*BBCU_BUCKET_COUNT)
@@ -42,12 +42,12 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
#ifdef _WIN32
#define DBG_BBCU_DBG_DIR "D:/dbg/cuda/"
#else
- // #define DBG_BBCU_DBG_DIR "/home/harold/plot/dbg/cuda/"
- #define DBG_BBCU_DBG_DIR "/home/harito/plot/dbg/cuda/"
+ #define DBG_BBCU_DBG_DIR "/home/harold/plotdisk/dbg/cuda/"
+ // #define DBG_BBCU_DBG_DIR "/home/harito/plots/dbg/cuda/"
#endif
- // #define DBG_BBCU_REF_DIR "/home/harold/plot/ref/"
+ // #define DBG_BBCU_REF_DIR "/home/harold/plots/ref/"
+
-
// #define BBCU_DBG_SKIP_PHASE_1 1 // Skip phase 1 and load pairs from disk
// #define BBCU_DBG_SKIP_PHASE_2 1 // Skip phase 1 and 2 and load pairs and marks from disk
@@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
// #define DBG_BBCU_P2_WRITE_MARKS 1
// #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+ // #define DBG_BBCU_KEEP_TEMP_FILES 1
#define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index f4e8d909..fc5884b3 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -7,11 +7,16 @@
#include "plotting/PlotTypes.h"
#include "plotting/PlotWriter.h"
#include "GpuStreams.h"
+#include "GpuQueue.h"
#include "util/StackAllocator.h"
#include "fse/fse.h"
#include "threading/Fence.h"
#include "plotting/GlobalPlotConfig.h"
#include "threading/ThreadPool.h"
+#include "plotting/BufferChain.h"
+#include "plotting/DiskBuffer.h"
+#include "plotting/DiskBucketBuffer.h"
+#include
#include "cub/device/device_radix_sort.cuh"
// #include
@@ -29,7 +34,51 @@ using namespace cooperative_groups;
#endif
+struct CudaK32ParkContext
+{
+ Span table7Memory; // Memory buffer reserved for finalizing table7 and writing C parks
+ BufferChain* parkBufferChain;
+ uint32 maxParkBuffers; // Maximum number of park buffers
+ uint64* hostRetainedLinePoints;
+};
+
+struct CudaK32HybridMode
+{
+ // For clarity, these are the file names for the disk buffers
+ // whose disk space will be shared for temp data in both phase 1 and phase 3.
+ // The name indicates their usage and in which phase.
+ static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME = "p1y-p3index.tmp";
+ static constexpr std::string_view META_DISK_BUFFER_FILE_NAME = "p1meta-p3rmap.tmp";
+ static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";
+
+ static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME = META_DISK_BUFFER_FILE_NAME;
+ static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME = Y_DISK_BUFFER_FILE_NAME;
+ static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;
+
+ DiskQueue* temp1Queue; // Tables Queue
+ DiskQueue* temp2Queue; // Metadata Queue (could be the same as temp1Queue)
+ DiskBucketBuffer* metaBuffer; // Enabled in < 128G mode
+ DiskBucketBuffer* yBuffer; // Enabled in < 128G mode
+ DiskBucketBuffer* unsortedL; // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
+ DiskBucketBuffer* unsortedR; // Unsorted R pairs in < 128G mode
+
+ DiskBuffer* tablesL[7];
+ DiskBuffer* tablesR[7];
+
+ GpuDownloadBuffer _tablesL[7];
+ GpuDownloadBuffer _tablesR[7];
+
+ struct
+ {
+ // #NOTE: These buffers share the same file-backed storage
+ // as other buffers in phase 1.
+ DiskBucketBuffer* rMapBuffer; // Step 1
+ DiskBucketBuffer* indexBuffer; // X-step/Step 2
+ DiskBucketBuffer* lpAndLMapBuffer; // X-step/Step 2 (LP) | Step 3 (LMap)
+
+ } phase3;
+};
struct CudaK32Phase2
{
@@ -64,11 +113,12 @@ struct CudaK32Phase3
};
uint64 pairsLoadOffset;
-
+
+ // Device buffers
uint32* devBucketCounts;
uint32* devPrunedEntryCount;
-
+ // Host buffers
union {
RMap* hostRMap;
uint32* hostIndices;
@@ -79,12 +129,6 @@ struct CudaK32Phase3
uint64* hostLinePoints;
};
- // #TODO: Remove this when we sort-out all of the buffer usage
- // uint64* hostMarkingTables[6]; // Set by Phase 2
-
-
- // uint32* hostBucketCounts;
-
uint32 prunedBucketCounts[7][BBCU_BUCKET_COUNT];
uint64 prunedTableEntryCounts[7];
@@ -111,9 +155,10 @@ struct CudaK32Phase3
// Step 2
struct {
GpuUploadBuffer rMapIn; // RMap from step 1
- GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or during L table 1, it is inlined x values
+ GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or, when the L table is the first stored table, it contains inlined x values
GpuDownloadBuffer lpOut; // Output line points (uint64)
GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
+ GpuDownloadBuffer parksOut; // Output P7 parks on the last table
uint32* devLTable[2]; // Unpacked L table bucket
uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
@@ -123,7 +168,7 @@ struct CudaK32Phase3
struct {
GpuUploadBuffer lpIn; // Line points from step 2
GpuUploadBuffer indexIn; // Indices from step 2
- GpuDownloadBuffer mapOut; // lTable for next step 1
+ GpuDownloadBuffer mapOut; // lTable for next step 2
GpuDownloadBuffer parksOut; // Downloads park buffers to host
uint32* hostParkOverrunCount;
@@ -137,7 +182,6 @@ struct CudaK32Phase3
FSE_CTable* devCTable;
uint32* devParkOverrunCount;
- Fence* parkFence;
std::atomic parkBucket;
uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
@@ -178,8 +222,9 @@ struct CudaK32PlotContext
int32 cudaDevice = -1;
cudaDeviceProp* cudaDevProps = nullptr;
bool downloadDirect = false;
+ TableId firstStoredTable = TableId::Table2; // First non-dropped table that has back pointers
ThreadPool* threadPool = nullptr;
-
+
TableId table = TableId::Table1; // Current table being generated
uint32 bucket = 0; // Current bucket being processed
@@ -192,6 +237,7 @@ struct CudaK32PlotContext
PlotRequest plotRequest;
PlotWriter* plotWriter = nullptr;
Fence* plotFence = nullptr;
+ Fence* parkFence = nullptr;
// Root allocations
size_t allocAlignment = 0;
@@ -263,8 +309,6 @@ struct CudaK32PlotContext
uint32* hostBucketSlices = nullptr;
uint32* hostTableL = nullptr;
uint16* hostTableR = nullptr;
- uint32* hostTableSortedL = nullptr;
- uint16* hostTableSortedR = nullptr;
union {
uint32* hostMatchCount = nullptr;
@@ -279,6 +323,14 @@ struct CudaK32PlotContext
CudaK32Phase2* phase2 = nullptr;
CudaK32Phase3* phase3 = nullptr;
+ CudaK32HybridMode* diskContext = nullptr;
+ CudaK32ParkContext* parkContext = nullptr;
+ bool useParkContext = false;
+
+ // Used when '--check' is enabled
+ struct GreenReaperContext* grCheckContext = nullptr;
+ class PlotChecker* plotChecker = nullptr;
+
struct
{
Duration uploadTime = Duration::zero(); // Host-to-device wait time
@@ -359,7 +411,7 @@ inline uint32 CudaK32PlotGetOutputIndex( CudaK32PlotContext& cx )
}
//-----------------------------------------------------------
-inline bool CudaK32PlotIsOutputInterleaved( CudaK32PlotContext& cx )
+inline bool CudaK32PlotIsOutputVertical( CudaK32PlotContext& cx )
{
return CudaK32PlotGetOutputIndex( cx ) == 0;
}
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
index 93099d86..8d2d5094 100644
--- a/cuda/CudaPlotPhase2.cu
+++ b/cuda/CudaPlotPhase2.cu
@@ -20,8 +20,7 @@
static void CudaK32PlotAllocateBuffersTest( CudaK32PlotContext& cx );
#define MARK_TABLE_BLOCK_THREADS 128
-#define P2_BUCKET_COUNT BBCU_BUCKET_COUNT
-#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull<
-__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, byte* marks, const uint64* rTableMarks, const uint32 rOffset )
+__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs,
+ byte* marks, const uint64* rTableMarks, const uint32 rOffset )
{
const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -39,11 +39,11 @@ __global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, c
return;
if constexpr ( useRMarks )
- {
+ {
if( !CuBitFieldGet( rTableMarks, rOffset + gid ) )
return;
}
-
+
const uint32 l = lPairs[gid];
const uint32 r = l + rPairs[gid];
@@ -117,12 +117,12 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,
ASSERT( (uint64)blockCount * blockThreadCount * 64 == tableEntryCount );
-#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
#define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
-#else
+ #else
#define G_PRUNED_COUNTS
-#endif
+ #endif
ASSERT_DOES_NOT_OVERLAP2( bitfield, bytefield, GetMarkingTableBitFieldSize(), GetMarkingTableByteSize() );
@@ -131,8 +131,11 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,
void LoadPairs( CudaK32PlotContext& cx, CudaK32Phase2& p2, const TableId rTable, const uint32 bucket )
{
+ if( bucket >= BBCU_BUCKET_COUNT )
+ return;
+
const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
- const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;//(uint32)std::min( (uint64)BBCU_BUCKET_ENTRY_COUNT, tableEntryCount - p2.pairsLoadOffset );// cx.bucketCounts[(int)rTable][bucket];
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
// uint32* hostPairsL = cx.hostTableSortedL + p2.pairsLoadOffset;
// uint16* hostPairsR = cx.hostTableSortedR + p2.pairsLoadOffset;
@@ -163,42 +166,48 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
byte* devLMarks = p2.devMarkingTable;
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)rTable]->Swap();
+ cx.diskContext->tablesR[(int)rTable]->Swap();
+
+ p2.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
+ p2.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] );
+ }
+
// Zero-out marks
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
// Load first bucket's worth of pairs
LoadPairs( cx, p2, rTable, 0 );
- uint32 rOffset = 0;
- for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
- {
- const bool isLastBucket = bucket + 1 == P2_BUCKET_COUNT;
+ // Mark the table, bucket by bucket
+ uint32 rTableGlobalIndexOffset = 0;
- // Load next set of pairs in the background
- if( !isLastBucket )
- LoadPairs( cx, p2, rTable, bucket + 1 );
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ // Load next set of pairs in the background (if there is another bucket)
+ LoadPairs( cx, p2, rTable, bucket + 1 );
const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
- const uint32 entryCount = isLastBucket ? tableEntryCount - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)): BBCU_BUCKET_ENTRY_COUNT;
- // const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
// Wait for pairs to be ready
const uint32* devLPairs = p2.pairsLIn.GetUploadedDeviceBufferT( cx.computeStream );
const uint16* devRPairs = p2.pairsRIn.GetUploadedDeviceBufferT( cx.computeStream );
-
// Mark
const uint32 blockCount = (uint32)CDiv( entryCount, MARK_TABLE_BLOCK_THREADS );
if( rTable == TableId::Table7 )
CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, nullptr, 0 );
else
- CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rOffset );
-
+ CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rTableGlobalIndexOffset );
+
p2.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
p2.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );
- rOffset += entryCount;
+ rTableGlobalIndexOffset += entryCount;
}
// Convert the bytefield marking table to a bitfield
@@ -209,14 +218,14 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
// Download bitfield marks
// uint64* hostBitField = p2.hostBitFieldAllocator->AllocT( GetMarkingTableBitFieldSize() );
uint64* hostBitField = cx.hostMarkingTables[(int)lTable];
-
+
// #TODO: Do download and copy again, for now just store all of them in this pinned buffer
// cx.phase3->hostMarkingTables[(int)lTable] = hostBitField;
p2.outMarks.Download( hostBitField, GetMarkingTableBitFieldSize(), cx.computeStream );
-
+
// p2.outMarks.DownloadAndCopy( hostBitField, cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize(), cx.computeStream );
// p2.outMarks.Download( cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize() );
-
+
#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
{
@@ -370,6 +379,9 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
MarkTable( cx, p2 );
p2.outMarks.WaitForCompletion();
p2.outMarks.Reset();
+ p2.pairsLIn.Reset();
+ p2.pairsRIn.Reset();
+
const auto elapsed = TimerEnd( timer );
Log::Line( "Marked Table %u in %.2lf seconds.", rTable, elapsed );
@@ -380,7 +392,7 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
}
// Wait for everything to complete
-
+
// p2.outMarks.WaitForCopyCompletion(); // #TODO: Re-activate this when re-enabling copy
p2.outMarks.WaitForCompletion();
p2.outMarks.Reset();
@@ -392,30 +404,39 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
///
void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
- const size_t alignment = cx.allocAlignment;
+ GpuStreamDescriptor desc{};
+
+ desc.entriesPerSlice = P2_ENTRIES_PER_BUCKET;
+ desc.sliceCount = 1;
+ desc.sliceAlignment = cx.allocAlignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr; // Start in direct mode (no intermediate pinned buffers)
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize();
+ }
- IAllocator& devAllocator = *acx.devAllocator;
- IAllocator& pinnedAllocator = *acx.pinnedAllocator;
+ if( !cx.downloadDirect )
+ desc.pinnedAllocator = acx.pinnedAllocator;
CudaK32Phase2& p2 = *cx.phase2;
const size_t markingTableByteSize = GetMarkingTableByteSize();
const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
- p2.devPrunedCount = devAllocator.CAlloc( 1, alignment );
- p2.devMarkingTable = devAllocator.AllocT( markingTableByteSize, alignment );
-
- p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint32 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+ // Device buffers
+ p2.devPrunedCount = acx.devAllocator->CAlloc( 1, acx.alignment );
+ p2.devMarkingTable = acx.devAllocator->AllocT( markingTableByteSize, acx.alignment );
- p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint16 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+ // Upload/Download streams
+ p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun );
+ p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun );
- p2.outMarks = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- markingTableBitFieldSize, devAllocator, alignment, acx.dryRun );
-
- // These buffers are safe to use at this point
- // p2.hostBitFieldAllocator = new StackAllocator( cx.hostTableR, sizeof( uint32 ) * BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ desc.entriesPerSlice = markingTableBitFieldSize;
+ p2.outMarks = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun );
}
@@ -550,7 +571,7 @@ void DbgValidateTable( CudaK32PlotContext& cx )
{
{
uint64 totalCount = 0;
- for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
totalCount += cx.bucketCounts[(int)rt][bucket];
ASSERT( totalCount == cx.tableEntryCounts[(int)rt] );
@@ -562,7 +583,7 @@ void DbgValidateTable( CudaK32PlotContext& cx )
Pairs hostRTablePairs = cx.hostBackPointers[(int)rt];
- for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const uint32 rTableBucketEntryCount = cx.bucketCounts[(int)rt][bucket];
@@ -638,9 +659,13 @@ void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table )
{
char path[512];
+ std::string baseUrl = DBG_BBCU_DBG_DIR;
+ if( cx.cfg.hybrid128Mode )
+ baseUrl += "disk/";
+
Log::Line( "[DEBUG] Writing marking table %u to disk...", table+1 );
{
- sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
+ sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 );
const uint64* marks = cx.hostMarkingTables[(int)table];
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index b19d42c3..8fcdfe2a 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -53,7 +53,7 @@ __global__ void CudaConvertInlinedXsToLinePoints(
{
const Pair p = inXs[gid];
CUDA_ASSERT( p.left || p.right );
-
+
lp = CudaSquareToLinePoint64( p.left, p.right );
bucket = (uint32)(lp >> bucketShift);
offset = atomicAdd( &sharedBuckets[bucket], 1 );
@@ -79,7 +79,6 @@ __global__ void CudaConvertInlinedXsToLinePoints(
outIndices[dst] = rIndex;
}
-
//-----------------------------------------------------------
__global__ void CudaTestPrune(
const uint64 entryCount, const uint32 rOffset, const uint64* rTableMarks, uint32* gPrunedEntryCount )
@@ -236,6 +235,14 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
}
#endif
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.rMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ }
+
+
const uint32 compressionLevel = cx.gCfg->compressionLevel;
// Special case with the starting table, since it has the values inlined already
@@ -259,11 +266,11 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
elapsed = TimerEnd( timer );
Log::Line( " Step 2 completed step in %.2lf seconds.", elapsed );
-
const uint64 baseEntryCount = cx.tableEntryCounts[(int)cx.table];
const uint64 prunedEntryCount = cx.phase3->prunedTableEntryCounts[(int)cx.table];
Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).",
cx.table, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 );
+
}
// else if( compressionLevel > 0 )
// {
@@ -286,7 +293,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 );
cx.table = rTable;
-
+
#if BBCU_DBG_SKIP_PHASE_2
if( rTable < TableId::Table7 )
DbgLoadTablePairs( cx, rTable+1, false );
@@ -340,26 +347,22 @@ void Step1( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s1 = p3.step1;
- const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;
+ if( bucket == 0 && cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)rTable]->Swap();
+ cx.diskContext->tablesR[(int)rTable]->Swap();
+
+ s1.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
+ s1.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] );
+ }
+
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; //BBCU_BUCKET_ENTRY_COUNT;
- // uint32* hostPairsL = cx.hostTableSortedL + p3.pairsLoadOffset;
- // uint16* hostPairsR = cx.hostTableSortedR + p3.pairsLoadOffset;
uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p3.pairsLoadOffset;
uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p3.pairsLoadOffset;
- // if( rTable < TableId::Table7 )
- // {
- // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable + 1].left + p3.pairsLoadOffset;
- // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable + 1].right + p3.pairsLoadOffset;
-
- // s1.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount );
- // s1.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount );
- // }
- // else
- {
- s1.pairsLIn.UploadT( hostPairsL, entryCount );
- s1.pairsRIn.UploadT( hostPairsR, entryCount );
- }
+ s1.pairsLIn.UploadT( hostPairsL, entryCount );
+ s1.pairsRIn.UploadT( hostPairsR, entryCount );
p3.pairsLoadOffset += entryCount;
};
@@ -384,7 +387,6 @@ void Step1( CudaK32PlotContext& cx )
p3.pairsLoadOffset = 0;
LoadBucket( cx, 0 );
-
///
/// Process buckets
///
@@ -403,9 +405,9 @@ void Step1( CudaK32PlotContext& cx )
const uint32* devLPairs = (uint32*)s1.pairsLIn.GetUploadedDeviceBuffer( cx.computeStream );
const uint16* devRPairs = (uint16*)s1.pairsRIn.GetUploadedDeviceBuffer( cx.computeStream );
- const uint32 entryCount = bucket == BBCU_BUCKET_COUNT-1 ?
- ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket
- BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth.
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];// bucket == BBCU_BUCKET_COUNT-1 ?
+ // ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket
+ // BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth.
auto* devRMap = (RMap*)s1.rMapOut.LockDeviceBuffer( cx.computeStream );
@@ -430,7 +432,7 @@ void Step1( CudaK32PlotContext& cx )
s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream );
}
-
+
// Download slice counts
cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();
@@ -464,6 +466,15 @@ void Step1( CudaK32PlotContext& cx )
for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
}
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.rMapBuffer->Swap();
+ }
+
+ // #if _DEBUG
+ // DbgValidateRMap( cx );
+ // #endif
}
//-----------------------------------------------------------
@@ -478,17 +489,25 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& tx = p3.xTable;
- if( bucket == 0 )
- p3.pairsLoadOffset = 0;
-
// Load inlined x's
const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+ if( bucket == 0 )
+ {
+ p3.pairsLoadOffset = 0;
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)rTable]->Swap();
+ tx.xIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
+ }
+ }
+
const Pair* inlinedXs = ((Pair*)cx.hostBackPointers[(int)rTable].left) + p3.pairsLoadOffset;
tx.xIn.UploadT( inlinedXs, entryCount, cx.computeStream );
-
+
p3.pairsLoadOffset += entryCount;
};
@@ -511,8 +530,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
const bool isCompressed = cx.gCfg->compressionLevel > 0;
const uint32 compressedLPBits = isCompressed ? GetCompressedLPBitCount( cx.gCfg->compressionLevel ) : 0;
- const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1;
- const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+ const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1;
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
uint64 tablePrunedEntryCount = 0;
uint32 rTableOffset = 0;
@@ -556,7 +575,7 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
rTableOffset += entryCount;
}
-
+
cudaStream_t downloadStream = tx.lpOut.GetQueue()->GetStream();
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
@@ -592,11 +611,17 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
}
-#if _DEBUG
- // DbgValidateIndices( cx );
- // DbgValidateStep2Output( cx );
- // DbgDumpSortedLinePoints( cx );
-#endif
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
+// #if _DEBUG
+// DbgValidateIndices( cx );
+// // DbgValidateStep2Output( cx );
+// // DbgDumpSortedLinePoints( cx );
+// #endif
}
@@ -606,22 +631,47 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
//-----------------------------------------------------------
void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ static_assert( sizeof( LMap ) == sizeof( uint64 ) );
+
auto& p3 = *cx.phase3;
// Shared allocations
- p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
- p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment );
+ p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
+ p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment );
// Host allocations
- p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index
- p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs
-
- if( !acx.dryRun )
+ if( !cx.cfg.hybrid16Mode )
+ {
+ p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index
+ p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs
+ }
+ else if( !cx.diskContext->phase3.rMapBuffer )
{
- ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
- ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL );
+ const size_t RMAP_SLICE_SIZE = sizeof( RMap ) * P3_PRUNED_SLICE_MAX;
+ const size_t INDEX_SLICE_SIZE = sizeof( uint32 ) * P3_PRUNED_SLICE_MAX;
+ const size_t LP_AND_LMAP_SLICE_SIZE = sizeof( uint64 ) * P3_PRUNED_SLICE_MAX;
+
+ const FileFlags TMP2_QUEUE_FILE_FLAGS = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
+
+ cx.diskContext->phase3.rMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_RMAP_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+ FatalIf( !cx.diskContext->phase3.rMapBuffer, "Failed to create R Map disk buffer." );
+
+ cx.diskContext->phase3.indexBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_INDEX_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, INDEX_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+ FatalIf( !cx.diskContext->phase3.indexBuffer, "Failed to create index disk buffer." );
+
+ cx.diskContext->phase3.lpAndLMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+ FatalIf( !cx.diskContext->phase3.lpAndLMapBuffer, "Failed to create LP/LMap disk buffer." );
}
- // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
+
+ #if _DEBUG
+ if( !acx.dryRun && !cx.cfg.hybrid128Mode )
+ {
+ ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
+ }
+ #endif
if( acx.dryRun )
{
@@ -687,74 +737,156 @@ void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
//-----------------------------------------------------------
void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ GpuStreamDescriptor uploadDesc = desc;
+ if( cx.cfg.hybrid128Mode )
+ {
+ uploadDesc.pinnedAllocator = acx.pinnedAllocator;
+
+ if( cx.cfg.hybrid16Mode )
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& tx = cx.phase3->xTable;
tx.devRMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment );
- tx.xIn = cx.gpuUploadStream[0]->CreateUploadBuffer(sizeof(Pair) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, acx.alignment, acx.dryRun);
- tx.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
- tx.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
+
+    tx.xIn      = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( uploadDesc, acx.dryRun );
+    tx.lpOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
+    tx.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( desc, acx.dryRun );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ tx.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+ }
}
//-----------------------------------------------------------
void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ GpuStreamDescriptor uploadDesc = desc;
+ if( cx.cfg.hybrid128Mode )
+ {
+ uploadDesc.pinnedAllocator = acx.pinnedAllocator;
+
+ if( cx.cfg.hybrid16Mode )
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& s1 = cx.phase3->step1;
const size_t alignment = acx.alignment;
- s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
- s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
- s1.rMapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+    s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( uploadDesc, acx.dryRun );
+    s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint16>( uploadDesc, acx.dryRun );
+    s1.rMapOut  = cx.gpuDownloadStream[0]->CreateDownloadBufferT<RMap>( desc, acx.dryRun );
s1.rTableMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
+ }
}
//-----------------------------------------------------------
void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ GpuStreamDescriptor uploadDesc = desc;
+ if( cx.cfg.hybrid16Mode )
+ {
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& s2 = cx.phase3->step2;
const size_t alignment = acx.alignment;
- s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+    s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT<RMap>( desc, acx.dryRun );
+    s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT<LMap>( desc, acx.dryRun );
- s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+    s2.lpOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
+    s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( desc, acx.dryRun );
- s2.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
- s2.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
-
+ const size_t devParkAllocSize = P3_PARK_7_SIZE * P3_MAX_P7_PARKS_PER_BUCKET;
+
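+    // Parks are downloaded as one contiguous run per bucket, so use a single
+    // slice sized for the full per-bucket park allocation.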
+ GpuStreamDescriptor parksDesc = desc;
+ parksDesc.sliceCount = 1;
+ parksDesc.entriesPerSlice = devParkAllocSize;
+ parksDesc.sliceAlignment = RoundUpToNextBoundaryT( P3_PARK_7_SIZE, sizeof( uint64 ) );
+
+    s2.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( parksDesc, acx.dryRun );
+
 s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
 s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
+ s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+
+ s2.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+ }
}
//-----------------------------------------------------------
void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& s3 = cx.phase3->step3;
const size_t alignment = acx.alignment;
s3.hostParkOverrunCount = acx.pinnedAllocator->CAlloc( 1 );
- const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET;
+    s3.lpIn    = cx.gpuUploadStream[0]->CreateUploadBufferT<uint64>( desc, acx.dryRun );
+    s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( desc, acx.dryRun );
- s3.lpIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+    s3.mapOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
- s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET;
- s3.mapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ GpuStreamDescriptor parksDesc = desc;
+ parksDesc.sliceCount = 1;
+ parksDesc.entriesPerSlice = devParkAllocSize;
+ parksDesc.sliceAlignment = RoundUpToNextBoundaryT( DEV_MAX_PARK_SIZE, sizeof( uint64 ) );
- s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBuffer(devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun);
+    s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( parksDesc, acx.dryRun );
if( acx.dryRun )
{
@@ -774,11 +906,16 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
s3.devDeltaLinePoints = acx.devAllocator->CAlloc( linePointAllocCount, alignment );
s3.devIndices = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
- // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment );
- // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment );
-
s3.devCTable = acx.devAllocator->AllocT( P3_MAX_CTABLE_SIZE, alignment );
s3.devParkOverrunCount = acx.devAllocator->CAlloc( 1 );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ s3.lpIn .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+
+ s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ }
}
@@ -827,6 +964,9 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
 RMap* rMap = bbcvirtallocbounded<RMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ // blake3_hasher hasher;
+ // blake3_hasher_init( &hasher );
+
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const RMap* reader = p3.hostRMap + bucket * P3_PRUNED_BUCKET_MAX;
@@ -838,7 +978,7 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
{
const uint32 copyCount = s1.prunedBucketSlices[slice][bucket];
bbmemcpy_t( writer, reader, copyCount );
-
+
writer += copyCount;
entryCount += copyCount;
@@ -858,13 +998,18 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
const uint32 right = map.dstR - bucketOffset;
ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
- CUDA_ASSERT( left < right );
-
+ ASSERT( left < right );
}
+
+ // Hash bucket
+ // blake3_hasher_update( &hasher, rMap, sizeof( RMap ) * entryCount );
}
+ // Print hash
+ // DbgFinishAndPrintHash( hasher, "r_map", (uint)cx.table + 1 );
+
bbvirtfreebounded( rMap );
- Log::Line( "[DEBUG] CPU OK" );
+ Log::Line( " [DEBUG] CPU OK" );
}
// Validate in CUDA
@@ -899,10 +1044,12 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
p3.step2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
}
- Log::Line( "[DEBUG] CUDA OK" );
+ Log::Line( " [DEBUG] CUDA OK" );
p3.step2.lMapIn.Reset();
}
+
+ Log::Line( "[DEBUG] RMap validation OK" );
}
//-----------------------------------------------------------
@@ -922,23 +1069,45 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
const uint32* reader = p3.hostIndices;
const size_t readerStride = P3_PRUNED_SLICE_MAX * 3;
-
uint64 entryCount = 0;
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
- for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ if( cx.cfg.hybrid16Mode )
+ {
+ const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket];
+
+ cx.diskContext->phase3.indexBuffer->OverrideReadSlices( bucket, sizeof( uint32 ), sizeSlices, BBCU_BUCKET_COUNT );
+ cx.diskContext->phase3.indexBuffer->ReadNextBucket();
+    const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs<uint32>();
+ ASSERT( readBucket.Length() == p3.prunedBucketCounts[(int)cx.table][bucket] );
+
+ bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() );
+
+ idxWriter += readBucket.Length();
+ entryCount += readBucket.Length();
+ }
+ else
{
- const uint32 copyCount = s2.prunedBucketSlices[bucket][slice];
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s2.prunedBucketSlices[slice][bucket];
- bbmemcpy_t( idxWriter, reader, copyCount );
+ bbmemcpy_t( idxWriter, reader, copyCount );
- idxWriter += copyCount;
- entryCount += copyCount;
- reader += readerStride;
+ idxWriter += copyCount;
+ entryCount += copyCount;
+ reader += readerStride;
+ }
}
}
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.indexBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
RadixSort256::Sort( pool, indices, idxTmp, entryCount );
@@ -949,10 +1118,36 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
ASSERT( indices[i] > indices[i-1] );
}
+ DbgHashDataT( indices, entryCount, "indices", (uint32)cx.table+1 );
+
bbvirtfreebounded( indices );
bbvirtfreebounded( idxTmp );
- Log::Line( "[DEBUG] OK" );
+ Log::Line( "[DEBUG] Index validation OK" );
+}
+
+//-----------------------------------------------------------
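+// Hashes a contiguous buffer with BLAKE3 and logs its digest (used by the debug validators above).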
+void DbgHashData( const void* data, size_t size, const char* name, uint32 index )
+{
+ blake3_hasher hasher;
+ blake3_hasher_init( &hasher );
+ blake3_hasher_update( &hasher, data, size );
+
+ DbgFinishAndPrintHash( hasher, name, index );
+}
+
+//-----------------------------------------------------------
+void DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index )
+{
+ constexpr size_t HASH_LEN = 256/8;
+ byte output[HASH_LEN];
+ blake3_hasher_finalize( &hasher, output, HASH_LEN );
+
+ Log::Write( "[DEBUG] %s_%u hash: 0x", name, index );
+ for( uint32 i = 0; i < HASH_LEN; i++ )
+ Log::Write( "%02x", output[i] );
+
+ Log::NewLine();
}
#endif
diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h
index 1a4bd7a8..34909123 100644
--- a/cuda/CudaPlotPhase3Internal.h
+++ b/cuda/CudaPlotPhase3Internal.h
@@ -10,8 +10,18 @@
#include "plotdisk/jobs/IOJob.h"
#include "algorithm/RadixSort.h"
#include "plotmem/ParkWriter.h"
+ #include "b3/blake3.h"
void DbgValidateStep2Output( CudaK32PlotContext& cx );
+
+ void DbgHashData( const void* data, size_t size, const char* name, uint32 index );
+
+ void DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index );
+    template<typename T>
+ inline void DbgHashDataT( const T* data, uint64 count, const char* name, uint32 index )
+ {
+ DbgHashData( data, (size_t)count * sizeof( T ), name, index );
+ }
#endif
using LMap = CudaK32Phase3::LMap;
@@ -27,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) );
#define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT)
#define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2)
-static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
-
-//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1;
-//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT;
-//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS;
-
-// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT ));
-//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
- //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
-// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT;
-
-//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT;
-// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
-// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
-// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT;
+static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
+static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+static constexpr size_t P3_PARK_7_SIZE = CalculatePark7Size( BBCU_K );
+static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE );
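+// The per-bucket line point buffer (one uint64 per entry) is reused as the Park 7
+// download target, so it must be able to hold a full bucket's worth of parks.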
static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 );
static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
index ac13e915..3a7a6449 100644
--- a/cuda/CudaPlotPhase3Step2.cu
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -248,7 +248,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
s2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
};
-
+
const TableId rTable = cx.table;
const TableId lTable = rTable-1;
@@ -309,7 +309,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
const auto* rMap = (RMap*)s2.rMapIn.GetUploadedDeviceBuffer( cx.computeStream );
const uint32 rEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];
-
+
uint64* devOutLPs = (uint64*)s2.lpOut .LockDeviceBuffer( cx.computeStream );
uint32* devOutIndices = (uint32*)s2.indexOut.LockDeviceBuffer( cx.computeStream );
@@ -317,7 +317,6 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
s2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
rTableOffset += rEntryCount;
-
// Horizontal download (write 1 row)
s2.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
s2.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX*3, P3_PRUNED_SLICE_MAX, cx.computeStream );
@@ -354,7 +353,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );
-
+
memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
@@ -370,8 +369,15 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
}
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.rMapBuffer->Swap();
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
// #if _DEBUG
- // if( cx.table > TableId::Table3 )
+ // // if( cx.table > TableId::Table3 )
// {
// DbgValidateStep2Output( cx );
// }
@@ -402,23 +408,26 @@ void WritePark7( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s2 = p3.step2;
-
+
// Load initial bucket
LoadBucket( cx, 0 );
// Begin park 7 table in plot
cx.plotWriter->BeginTable( PlotTable::Table7 );
- constexpr size_t parkSize = CalculatePark7Size( BBCU_K );
+ constexpr size_t parkSize = P3_PARK_7_SIZE;
constexpr size_t parkFieldCount = parkSize / sizeof( uint64 );
static_assert( parkFieldCount * sizeof( uint64 ) == parkSize );
+ GpuDownloadBuffer& parkDownloader = cx.useParkContext ? s2.parksOut : s2.lpOut;
- GpuDownloadBuffer& parkDownloader = s2.lpOut;
-
- constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+ constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET;
static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );
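+    // When a park context is used, parks cycle through a chain of pinned buffers shared
+    // with the plot writer, so reset the chain before processing the first bucket.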
+ if( cx.useParkContext )
+ {
+ cx.parkContext->parkBufferChain->Reset();
+ }
// Host stuff
constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
@@ -427,9 +436,10 @@ void WritePark7( CudaK32PlotContext& cx )
const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table];
const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark );
-    byte*   hostParks           = hostAllocator.AllocT<byte>( totalParkCount * parkSize );
-    byte*   hostParkWriter      = hostParks;
-    uint32* hostLastParkEntries = hostAllocator.CAlloc<uint32>( kEntriesPerPark );
+    byte*   hostParks           = cx.useParkContext ? nullptr : hostAllocator.AllocT<byte>( totalParkCount * parkSize );
+    byte*   hostParksWriter     = cx.useParkContext ? nullptr : hostParks;
+    uint32* hostLastParkEntries = cx.useParkContext ? (uint32*)cx.parkContext->hostRetainedLinePoints :
+                                                      hostAllocator.CAlloc<uint32>( kEntriesPerPark );
static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 );
uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark;
@@ -479,14 +489,38 @@ void WritePark7( CudaK32PlotContext& cx )
// Download parks & write to plot
const size_t downloadSize = parkCount * parkSize;
- parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize,
+ if( cx.useParkContext )
+ {
+ ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() );
+
+ // Override the park buffer to be used when using a park context
+ hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+ // Wait for the next park buffer to be available
+ parkDownloader.HostCallback([&cx]{
+ (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+ });
+ }
+
+ parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize,
[]( void* parksBuffer, size_t size, void* userData ) {
 auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
cx.plotWriter->WriteTableData( parksBuffer, size );
+
+ // Release the buffer after the plot writer is done with it.
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
+
}, &cx, cx.computeStream );
- hostParkWriter += downloadSize;
+ hostParksWriter += downloadSize;
+ if( cx.useParkContext )
+ hostParksWriter = nullptr;
}
// Wait for parks to complete downloading
@@ -499,9 +533,19 @@ void WritePark7( CudaK32PlotContext& cx )
// Was there a left-over park?
if( retainedEntryCount > 0 )
{
+ if( cx.useParkContext )
+ hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
// Submit last park to plot
- TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
- cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+ TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter );
+ cx.plotWriter->WriteTableData( hostParksWriter, parkSize );
+
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
}
cx.plotWriter->EndTable();
@@ -534,6 +578,7 @@ void _DbgValidateOutput( CudaK32PlotContext& cx )
auto& s2 = p3.step2;
// Validate line points...
+ Log::Debug( "[DEBUG] Validating line points..." );
 uint64* refLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
 uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
 uint32* indices = bbcvirtallocboundednuma<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
@@ -614,9 +659,13 @@ void _DbgValidateOutput( CudaK32PlotContext& cx )
}
}
+ DbgHashDataT( refLinePoints, prunedEntryCount, "line_points", (uint32)cx.table+1 );
+
bbvirtfreebounded( refLinePoints );
bbvirtfreebounded( tmpLinePoints );
bbvirtfreebounded( indices );
+
+ Log::Debug( "[DEBUG] Line point validation OK" );
}
#endif
@@ -659,6 +708,8 @@ void DbgDumpSortedLinePoints( CudaK32PlotContext& cx )
ThreadPool& pool = *cx.threadPool; //DbgGetThreadPool( cx );
RadixSort256::Sort( pool, sortedLinePoints, tmpLinePoints, prunedEntryCount );
+ // DbgHashDataT( sortedLinePoints, prunedEntryCount, "sorted_line_points", (uint32)cx.table+1 );
+
// Write to disk
{
char filePath[1024] = {};
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index 3949bd8c..c8f9337b 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -52,12 +52,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Load CTable
const bool isCompressed = cx.gCfg->compressionLevel > 0 && lTable <= (TableId)cx.gCfg->numDroppedTables;
- const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.subtSizeBits;
+ const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.stubSizeBits;
const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
-
+
+ const bool isFirstSerializedTable = firstTable == rTable;
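+    // Only tables after the first serialized one have a previously written park buffer to wait on.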
+
const size_t cTableSize = !isCompressed ? sizeof( CTable_0 ) : cx.gCfg->cTableSize; ASSERT( cTableSize <= P3_MAX_CTABLE_SIZE );
const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;
-
+
// (upload must be loaded before first bucket, on the same stream)
CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
s3.lpIn.GetQueue()->GetStream() ) );
@@ -75,13 +77,32 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
const size_t hostParkSize = isCompressed ? cx.gCfg->compressionInfo.tableParkSize : CalculateParkSize( lTable );
ASSERT( DEV_MAX_PARK_SIZE >= hostParkSize );
- // #TODO: Move this allocation to the beginning
- if( s3.parkFence == nullptr )
- s3.parkFence = new Fence();
-
byte* hostParksWriter = (byte*)cx.hostBackPointers[(int)rTable].left; //(byte*)cx.hostTableL;
uint64* hostRetainedEntries = nullptr;
+ if( cx.cfg.hybrid128Mode )
+ {
+ hostParksWriter = (byte*)cx.hostTableL;
+
+ if( !isFirstSerializedTable && !cx.useParkContext )
+ {
+        // Ensure this buffer is no longer in use (the last table has finished writing to disk).
+ const bool willWaitForParkFence = cx.parkFence->Value() < BBCU_BUCKET_COUNT;
+ if( willWaitForParkFence )
+ Log::Line( " Waiting for parks buffer to become available." );
+
+ Duration parkWaitTime;
+ cx.parkFence->Wait( BBCU_BUCKET_COUNT, parkWaitTime );
+
+ if( willWaitForParkFence )
+ Log::Line( " Waited %.3lf seconds for the park buffer to be released.", TicksToSeconds( parkWaitTime ) );
+ }
+ }
+ if( cx.useParkContext )
+ {
+ cx.parkContext->parkBufferChain->Reset();
+ }
+
// if( !isCompressed && lTable == TableId::Table1 )
// hostParksWriter = (byte*)cx.hostBackPointers[(int)TableId::Table2].left;
@@ -101,7 +122,7 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Set initial event LP stream event as set.
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
- s3.parkFence->Reset( 0 );
+ cx.parkFence->Reset( 0 );
s3.parkBucket = 0;
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
@@ -200,7 +221,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// No more buckets so we have to compress this last park on the CPU
CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
- hostRetainedEntries = (uint64*)( hostParksWriter + hostParkSize * parkCount );
+ hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints :
+ (uint64*)( hostParksWriter + hostParkSize * parkCount );
CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
}
}
@@ -209,6 +231,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Download parks
+ if( cx.useParkContext )
+ {
+ ASSERT( hostParkSize * parkCount <= cx.parkContext->parkBufferChain->BufferSize() );
+
+ // Override the park buffer to be used when using a park context
+ hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+ // Wait for the next park buffer to be available
+ s3.parksOut.HostCallback([&cx]{
+ (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+ });
+ }
+
s3.parksOut.Download2DWithCallback( hostParksWriter, hostParkSize, parkCount, hostParkSize, DEV_MAX_PARK_SIZE,
[]( void* parksBuffer, size_t size, void* userData ) {
@@ -216,11 +251,22 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
auto& s3 = cx.phase3->step3;
cx.plotWriter->WriteTableData( parksBuffer, size );
- cx.plotWriter->SignalFence( *s3.parkFence, ++s3.parkBucket );
+ cx.plotWriter->SignalFence( *cx.parkFence, ++s3.parkBucket );
+
+ // Release the buffer after the plot writer is done with it.
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
}, &cx, lpStream, cx.downloadDirect );
hostParksWriter += hostParkSize * parkCount;
+
+ if( cx.useParkContext )
+ hostParksWriter = nullptr;
}
// Copy park overrun count
@@ -242,18 +288,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Was there a left-over park?
if( retainedLPCount > 0 )
{
- ASSERT( hostRetainedEntries );
-
+ if( cx.useParkContext )
+ hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
uint64 lastParkEntries[kEntriesPerPark];
bbmemcpy_t( lastParkEntries, hostRetainedEntries, retainedLPCount );
WritePark( hostParkSize, retainedLPCount, lastParkEntries, hostParksWriter, stubBitSize, hostCTable );
cx.plotWriter->WriteTableData( hostParksWriter, hostParkSize );
+
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
}
cx.plotWriter->EndTable();
- // Update buckets counts for L table
- // #TODO: These should match Step 1 pruned entry count I believe, so just copy?
memset( p3.prunedBucketCounts[(int)rTable], 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT );
for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
@@ -266,12 +318,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
s3.lpIn .Reset();
s3.indexIn.Reset();
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
// #if _DEBUG
// //if( cx.table >= TableId::Table6 )
// //{
- // DbgValidateLMap( cx );
- // DbgValidateLMapData( cx );
+ // // DbgValidateLMap( cx );
+ // // DbgValidateLMapData( cx );
+
// // DbgSaveLMap( cx );
// //}
// #endif
@@ -386,7 +445,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx )
char path[512];
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
-
+
const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
int err;
FatalIf( !IOJob::WriteToFile( path, p3.hostLMap, writeSize, err ),
@@ -399,7 +458,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx )
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
FatalIf( !IOJob::WriteToFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
"[DEBUG] Failed to write LMap buckets with error: %d", err );
-
+
Log::Line( " [DEBUG] OK" );
}
@@ -410,7 +469,7 @@ void DbgLoadLMap( CudaK32PlotContext& cx )
char path[512];
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
-
+
const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
int err;
FatalIf( !IOJob::ReadFromFile( path, p3.hostLMap, writeSize, err ),
@@ -438,10 +497,12 @@ void DbgValidateLMap( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s3 = p3.step3;
-    LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_TABLE_ENTRY_COUNT );
+    LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
-
{
+ // blake3_hasher hasher;
+ // blake3_hasher_init( &hasher );
+
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
@@ -471,14 +532,18 @@ void DbgValidateLMap( CudaK32PlotContext& cx )
ASSERT( map.sourceIndex || map.sortedIndex );
ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
}
+
+ // Hash bucket
+ // blake3_hasher_update( &hasher, lMap, sizeof( LMap ) * entryCount );
}
-
+ // Print hash
+ // DbgFinishAndPrintHash( hasher, "l_map", (uint)cx.table + 1 );
}
bbvirtfreebounded( lMap );
- Log::Line( "[DEBUG] OK" );
+ Log::Line( "[DEBUG] LMap OK" );
}
//-----------------------------------------------------------
@@ -566,7 +631,7 @@ void _DbgValidateLMapData( CudaK32PlotContext& cx )
bbvirtfreebounded( dstIndices );
bbvirtfreebounded( tmpIndices );
- Log::Line( "[DEBUG] OK" );
+ Log::Line( "[DEBUG] LMap uniqueness OK" );
}
#endif
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 8e0458dd..80ba8b0e 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -9,6 +9,10 @@
#include "plotting/CTables.h"
#include "plotting/TableWriter.h"
#include "plotting/PlotTools.h"
+#include "util/VirtualAllocator.h"
+#include "harvesting/GreenReaper.h"
+#include "tools/PlotChecker.h"
+
// TEST/DEBUG
#if _DEBUG
@@ -36,6 +40,7 @@ static void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStrea
static void AllocBuffers( CudaK32PlotContext& cx );
static void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinnedAllocator, bool dryRun );
template<typename T>
static void UploadBucketToGpu( CudaK32PlotContext& context, TableId table, const uint32* hostPtr, T* devPtr, uint64 bucket, uint64 stride );
@@ -53,11 +58,37 @@ GPU-based (CUDA) plotter
[OPTIONS]:
-h, --help : Shows this help message and exits.
-d, --device : Select the CUDA device index. (default=0)
+
+ --disk-128 : Enable hybrid disk plotting for 128G system RAM.
+                       Requires --temp1 and --temp2 to be set.
+
+ --disk-16 : (experimental) Enable hybrid disk plotting for 16G system RAM.
+                       Requires --temp1 and --temp2 to be set.
+
+ -t1, --temp1 : Temporary directory 1. Used for longer-lived, sequential writes.
+
+ -t2, --temp2           : Temporary directory 2. Used for temporary, shorter-lived reads and writes.
+ NOTE: If only one of -t1 or -t2 is specified, both will be
+ set to the same directory.
+
+ --check : Perform a plot check for proofs on the newly created plot.
+
+ --check-threshold : Proof threshold rate below which the plots that don't pass
+ the check will be deleted.
+                       That is, the number of proofs fetched / proof check count
+                       must be greater than or equal to this threshold to pass.
+ (default=0.6).
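+                       For example, --check 100 with --check-threshold 0.6 keeps the plot
+                       only if at least 60 of the 100 checked proofs are found.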
)";
///
/// CLI
///
+//-----------------------------------------------------------
+void CudaK32PlotterPrintHelp()
+{
+ Log::Line( USAGE );
+}
+
//-----------------------------------------------------------
void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
{
@@ -68,18 +99,70 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
{
if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
continue;
- if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
+ if( cli.ReadSwitch( cfg.hybrid128Mode, "--disk-128" ) )
+ continue;
+ if( cli.ReadSwitch( cfg.hybrid16Mode, "--disk-16" ) )
+ {
+ cfg.hybrid128Mode = true;
+ continue;
+ }
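+        // If only one temp directory is given, mirror it into the other so both are always set.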
+ if( cli.ReadStr( cfg.temp1Path, "-t1", "--temp1" ) )
+ {
+ if( !cfg.temp2Path )
+ cfg.temp2Path = cfg.temp1Path;
+ continue;
+ }
+ if( cli.ReadStr( cfg.temp2Path, "-t2", "--temp2" ) )
+ {
+ if( !cfg.temp1Path )
+ cfg.temp1Path = cfg.temp2Path;
+ continue;
+ }
+ if( cli.ReadUnswitch( cfg.temp1DirectIO, "--no-t1-direct" ) )
+ continue;
+ if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) )
+ continue;
+
+ if( cli.ReadU64( cfg.plotCheckCount, "--check" ) )
+ continue;
+ if( cli.ReadF64( cfg.plotCheckThreshhold, "--check-threshold" ) )
continue;
+ // if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
+ // continue;
if( cli.ArgMatch( "--help", "-h" ) )
{
- Log::Line( USAGE );
+ CudaK32PlotterPrintHelp();
exit( 0 );
}
else
break; // Let the caller handle it
}
-
 // The rest should be output directories, parsed by the global config parser.
+
+
+ if( cfg.hybrid128Mode && gCfg.compressionLevel <= 0 )
+ {
+ Log::Error( "Error: Cannot plot classic (uncompressed) plots in 128G or 64G mode." );
+ Exit( -1 );
+ }
+
+ if( cfg.hybrid16Mode )
+ {
+ #if PLATFORM_IS_WINDOWS
+ Log::Error( "16G mode is currently unsupported on Windows." );
+ Exit( -1 );
+ #else
+ Log::Line( "Warning: 16G mode is experimental and still under development." );
+ Log::Line( " Please use the --check parameter to validate plots when using this mode." );
+
+ if( cfg.temp1DirectIO || cfg.temp2DirectIO )
+ {
+ Log::Line( " Direct I/O not supported in 16G mode at the moment. Disabing it." );
+ cfg.temp1DirectIO = cfg.temp2DirectIO = false;
+ }
+
+ #endif
+ }
}
//-----------------------------------------------------------
@@ -97,10 +180,25 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
auto& cx = *new CudaK32PlotContext{};
outContext = &cx;
- cx.cfg = cfg;
- cx.gCfg = cfg.gCfg;
+ cx.cfg = cfg;
+ cx.gCfg = cfg.gCfg;
+
+ cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
Log::Line( "[Bladebit CUDA Plotter]" );
+ Log::Line( " Host RAM : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+
+ if( cx.cfg.plotCheckCount == 0 )
+ Log::Line( " Plot checks : disabled" );
+ else
+ {
+ Log::Line( " Plot checks : enabled ( %llu )", (llu)cx.cfg.plotCheckCount );
+ Log::Line( " Plot check threshold: %.3lf", cx.cfg.plotCheckThreshhold );
+ }
+
+ // Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" );
+ Log::NewLine();
+
CudaInit( cx );
CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
@@ -119,27 +217,89 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
}
cx.threadPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
+ cx.plotFence = new Fence();
+ cx.parkFence = new Fence();
- #if __linux__
- cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+ #if _WIN32
+    // #MAYBE: Add a configurable option to enable direct downloads on Windows?
+    // On Windows, always default to using intermediate pinned buffers.
+ cx.downloadDirect = false;
#else
- // #TODO: One windows, check if we have enough memory, if so, default to true.
- cx.downloadDirect = true ;//false;
+ cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
#endif
// cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
// if( cx.gCfg->benchmarkMode )
// cx.plotWriter->EnableDummyMode();
- cx.plotFence = new Fence();
+ // Need to do allocations for park serialization differently under the following conditions
+ if( cx.downloadDirect || cx.cfg.hybrid128Mode )
+ {
+ cx.parkContext = new CudaK32ParkContext{};
- cx.phase2 = new CudaK32Phase2{};
- cx.phase3 = new CudaK32Phase3{};
+ if( cx.cfg.hybrid16Mode )
+ cx.useParkContext = true;
+ }
+
+ // Check for hybrid mode
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext = new CudaK32HybridMode{};
+ cx.diskContext->temp1Queue = new DiskQueue( cx.cfg.temp1Path );
+
+ // Re-use the same queue for temp2 if temp1 and temp2 are pointing to the same path
+ auto t1Path = std::filesystem::canonical( cx.cfg.temp1Path );
+ auto t2Path = std::filesystem::canonical( cx.cfg.temp2Path );
+ if( t1Path.compare( t2Path ) == 0 )
+ cx.diskContext->temp2Queue = cx.diskContext->temp1Queue;
+ else
+ cx.diskContext->temp2Queue = new DiskQueue( cx.cfg.temp2Path );
+ }
+
+ cx.phase2 = new CudaK32Phase2{};
+ cx.phase3 = new CudaK32Phase3{};
// #TODO: Support non-warm starting
Log::Line( "Allocating buffers (this may take a few seconds)..." );
AllocBuffers( cx );
InitFSEBitMask( cx );
+ Log::Line( "Done." );
+
+
+ // Allocate GR Context if --check was specified
+ if( cfg.plotCheckCount > 0 )
+ {
+ if( cfg.gCfg->compressionLevel > 0 )
+ {
+ GreenReaperConfig grCfg{};
+ grCfg.apiVersion = GR_API_VERSION;
+ grCfg.threadCount = 1;
+ grCfg.gpuRequest = GRGpuRequestKind_ExactDevice;
+ grCfg.gpuDeviceIndex = cfg.deviceIndex;
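+            // Bind the decompression (GreenReaper) context to the same CUDA device used for plotting.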
+
+ auto grResult = grCreateContext( &cx.grCheckContext, &grCfg, sizeof( grCfg ) );
+ FatalIf( grResult != GRResult_OK, "Failed to create decompression context for plot check with error '%s' (%d).",
+ grResultToString( grResult ), (int)grResult );
+
+ grResult = grPreallocateForCompressionLevel( cx.grCheckContext, BBCU_K, cfg.gCfg->compressionLevel );
+ FatalIf( grResult != GRResult_OK, "Failed to preallocate memory for decompression context with error '%s' (%d).",
+ grResultToString( grResult ), (int)grResult );
+ }
+
+ PlotCheckerConfig checkerCfg{};
+ checkerCfg.proofCount = cfg.plotCheckCount;
+ checkerCfg.noGpu = false;
+ checkerCfg.gpuIndex = cfg.deviceIndex;
+ checkerCfg.threadCount = 1;
+ checkerCfg.disableCpuAffinity = false;
+ checkerCfg.silent = false;
+ checkerCfg.hasSeed = false;
+ checkerCfg.deletePlots = true;
+ checkerCfg.deleteThreshold = cfg.plotCheckThreshhold;
+ checkerCfg.grContext = cx.grCheckContext;
+
+ cx.plotChecker = PlotChecker::Create( checkerCfg );
+ }
}
//-----------------------------------------------------------
@@ -210,6 +370,8 @@ void CudaK32Plotter::Run( const PlotRequest& req )
cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
if( cx.gCfg->benchmarkMode )
cx.plotWriter->EnableDummyMode();
+ if( cx.plotChecker )
+ cx.plotWriter->EnablePlotChecking( *cx.plotChecker );
FatalIf( !cx.plotWriter->BeginPlot( cfg.gCfg->compressionLevel > 0 ? PlotVersion::v2_0 : PlotVersion::v1_0,
req.outDir, req.plotFileName, req.plotId, req.memo, req.memoSize, cfg.gCfg->compressionLevel ),
@@ -220,19 +382,43 @@ void CudaK32Plotter::Run( const PlotRequest& req )
cx.plotWriter->EndPlot( true );
- // #TODO: Ensure the last plot ended here for now
+ // Ensure the last plot has ended
+ // #TODO: Move it elsewhere, using different buffers for parks
+ // so that we can continue writing to disk until we get to
+ // actually writing the next plot in table 7 finalization.
{
const auto pltoCompleteTimer = TimerBegin();
cx.plotWriter->WaitForPlotToComplete();
const double plotIOTime = TimerEnd( pltoCompleteTimer );
Log::Line( "Completed writing plot in %.2lf seconds", plotIOTime );
- cx.plotWriter->DumpTables();
+ if( !cx.plotChecker || !cx.plotChecker->LastPlotDeleted() )
+ {
+ cx.plotWriter->DumpTables();
+ Log::NewLine();
+ }
}
- Log::Line( "" );
-
+
delete cx.plotWriter;
cx.plotWriter = nullptr;
+
+
+ // Delete any temporary files
+ #if !(DBG_BBCU_KEEP_TEMP_FILES)
+ if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode )
+ {
+ if( cx.diskContext->yBuffer ) delete cx.diskContext->yBuffer;
+ if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer;
+ if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL;
+ if( cx.diskContext->unsortedR ) delete cx.diskContext->unsortedR;
+
+ for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
+ {
+ if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t];
+ if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t];
+ }
+ }
+ #endif
}
//-----------------------------------------------------------
@@ -243,26 +429,51 @@ void MakePlot( CudaK32PlotContext& cx )
memset( cx.tableEntryCounts, 0, sizeof( cx.tableEntryCounts ) );
cx.table = TableId::Table1;
+
const auto plotTimer = TimerBegin();
const auto p1Timer = plotTimer;
#if BBCU_DBG_SKIP_PHASE_1
DbgLoadContextAndPairs( cx );
#else
- // F1
- Log::Line( "Generating F1" );
- const auto timer = TimerBegin();
- GenF1Cuda( cx );
- const auto elapsed = TimerEnd( timer );
- Log::Line( "Finished F1 in %.2lf seconds.", elapsed );
- // Time for FP
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.sortedXPairsOut.AssignDiskBuffer( nullptr );
+ cx.sortedPairsLOut.AssignDiskBuffer( nullptr );
+ cx.sortedPairsROut.AssignDiskBuffer( nullptr );
+
+ if( !cx.plotRequest.isFirstPlot )
+ {
+ for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
+ {
+ if( cx.diskContext->tablesL[(int)t] ) cx.diskContext->tablesL[(int)t]->Swap();
+ if( cx.diskContext->tablesR[(int)t] ) cx.diskContext->tablesR[(int)t]->Swap();
+ }
+
+ }
+ }
+
+ /// Generate F1 entries
+ {
+ Log::Line( "Generating F1" );
+ const auto timer = TimerBegin();
+
+ GenF1Cuda( cx );
+
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Finished F1 in %.2lf seconds.", elapsed );
+ }
+
+ /// Forward-propagate the rest of the tables
for( TableId table = TableId::Table2; table <= TableId::Table7; table++ )
{
cx.table = table;
cx.bucket = 0;
+
FpTable( cx );
}
+
const auto p1Elapsed = TimerEnd( p1Timer );
Log::Line( "Completed Phase 1 in %.2lf seconds", p1Elapsed );
#endif
@@ -294,6 +505,22 @@ void FpTable( CudaK32PlotContext& cx )
cx.prevTablePairOffset = 0;
+ if( cx.cfg.hybrid128Mode )
+ {
+ auto* diskBufferL = cx.diskContext->tablesL[(int)inTable];
+ auto* diskBufferR = cx.diskContext->tablesR[(int)inTable];
+
+ if( inTable == cx.firstStoredTable )
+ {
+ cx.sortedXPairsOut.AssignDiskBuffer( diskBufferL );
+ }
+ else if( inTable > cx.firstStoredTable )
+ {
+ cx.sortedPairsLOut.AssignDiskBuffer( diskBufferL );
+ cx.sortedPairsROut.AssignDiskBuffer( diskBufferR );
+ }
+ }
+
// Clear slice counts
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
@@ -358,10 +585,28 @@ void FpTable( CudaK32PlotContext& cx )
cx.sortedPairsROut.WaitForCompletion();//cx.sortedPairsROut.WaitForCopyCompletion();
cx.sortedPairsROut.Reset();
-
- if( cx.table < TableId::Table7 )
+ if( cx.cfg.hybrid128Mode && inTable >= cx.firstStoredTable )
{
+ if( cx.diskContext->tablesL[(int)inTable] ) cx.diskContext->tablesL[(int)inTable]->Swap();
+ if( cx.diskContext->tablesR[(int)inTable] ) cx.diskContext->tablesR[(int)inTable]->Swap();
+ }
+
+ if( cx.table < TableId::Table7 )
cx.metaOut.WaitForCompletion(); cx.metaOut.Reset();
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ if( cx.cfg.hybrid16Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 )
+ {
+ cx.diskContext->unsortedL->Swap();
+ }
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->yBuffer->Swap();
+ cx.diskContext->metaBuffer->Swap();
+ cx.diskContext->unsortedR->Swap();
+ }
}
cx.yIn .Reset();
@@ -373,23 +618,24 @@ void FpTable( CudaK32PlotContext& cx )
Log::Line( "Table %u completed in %.2lf seconds with %llu entries.",
(uint32)cx.table+1, elapsed, cx.tableEntryCounts[(int)cx.table] );
+ /// DEBUG
#if DBG_BBCU_P1_WRITE_PAIRS
// Write them sorted, so have to wait until table 3 completes
if( cx.table > TableId::Table2 )
DbgWritePairs( cx, cx.table - 1 );
#endif
-
+
if( cx.table == TableId::Table7 )
{
FinalizeTable7( cx );
- #if DBG_BBCU_P1_WRITE_PAIRS
+ // DEBUG
+ #if DBG_BBCU_P1_WRITE_PAIRS
DbgWritePairs( cx, TableId::Table7 );
- #endif
-
+ #endif
#if DBG_BBCU_P1_WRITE_CONTEXT
DbgWriteContext( cx );
- #endif
+ #endif
}
}
@@ -410,8 +656,8 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
cudaStream_t metaStream = cx.computeStream;//B;
cudaStream_t pairsStream = cx.computeStream;//C;
- uint32* sortKeyIn = (uint32*)cx.devMatches;
- uint32* sortKeyOut = cx.devSortKey;
+ uint32* sortKeyIn = (uint32*)cx.devMatches;
+ uint32* sortKeyOut = cx.devSortKey;
if( cx.table > TableId::Table2 )
{
// Generate a sorting key
@@ -447,7 +693,7 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
// Sort and download prev table's pairs
const bool isLTableInlineable = cx.table == TableId::Table2 || (uint32)cx.table <= cx.gCfg->numDroppedTables+1;
-
+
if( !isLTableInlineable )
{
CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventC ) ); // Ensure sort key is ready
@@ -463,35 +709,36 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsIn, sortedPairs, pairsStream );
cx.xPairsIn.ReleaseDeviceBuffer( pairsStream );
- Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)cx.table-1].left) + cx.prevTablePairOffset;
+ Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)inTable].left) + cx.prevTablePairOffset;
// Write sorted pairs back to host
cx.sortedXPairsOut.DownloadT( hostPairs, entryCount, pairsStream, cx.downloadDirect );
}
else
{
- uint32* hostPairsL, *hostPairsLFinal;
- uint16* hostPairsR, *hostPairsRFinal;
+ // uint32* hostPairsL;
+ // uint16* hostPairsR;
// Wait for pairs to complete loading and sort on Y (or do this before match? Giving us time to write to disk while matching?)
uint32* pairsLIn = (uint32*)cx.pairsLIn .GetUploadedDeviceBuffer( pairsStream );
uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream );
CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream );
cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
- hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
- hostPairsLFinal = cx.hostBackPointers[(int)cx.table-1].left + cx.prevTablePairOffset;
+ // hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
+ uint32* hostPairsLFinal = cx.hostBackPointers[(int)inTable].left + cx.prevTablePairOffset;
cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, cx.downloadDirect );
// cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream );
-
+
// if( !isOutputCompressed )
{
uint16* pairsRIn = (uint16*)cx.pairsRIn .GetUploadedDeviceBuffer( pairsStream );
uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream );
CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream );
cx.pairsRIn.ReleaseDeviceBuffer( pairsStream );
- hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
- hostPairsRFinal = cx.hostBackPointers[(int)cx.table-1].right + cx.prevTablePairOffset;
+ // hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
+
+ uint16* hostPairsRFinal = cx.hostBackPointers[(int)inTable].right + cx.prevTablePairOffset;
cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, cx.downloadDirect );
// cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream );
@@ -557,7 +804,7 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
void FinalizeTable7( CudaK32PlotContext& cx )
{
Log::Line( "Finalizing Table 7" );
-
+
const auto timer = TimerBegin();
cx.table = TableId::Table7+1; // Set a false table
@@ -578,19 +825,41 @@ void FinalizeTable7( CudaK32PlotContext& cx )
const size_t c1TableSizeBytes = c1TotalEntries * sizeof( uint32 );
const size_t c2TableSizeBytes = c2TotalEntries * sizeof( uint32 );
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.sortedPairsLOut.AssignDiskBuffer( cx.diskContext->tablesL[(int)TableId::Table7] );
+ cx.sortedPairsROut.AssignDiskBuffer( cx.diskContext->tablesR[(int)TableId::Table7] );
+ }
+
+
+ // Re-use meta GPU downloader to download parks
+ GpuDownloadBuffer& parkDownloader = cx.metaOut;
+
+ // Store disk buffer temporarily, if there is one, since we don't want to write to meta now
+ DiskBufferBase* metaDiskBuffer = parkDownloader.GetDiskBuffer();
+
+ // Reset park buffer chain, if we're using it
+ if( cx.parkContext )
+ {
+ cx.parkContext->parkBufferChain->Reset();
+ parkDownloader.AssignDiskBuffer( nullptr ); // We want direct downloads to the park buffers, which are pinned already
+ }
// Prepare host allocations
constexpr size_t c3ParkSize = CalculateC3Size();
const uint64 totalParkSize = CDivT( tableLength, (uint64)kCheckpoint1Interval ) * c3ParkSize;
- StackAllocator hostAlloc( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 );
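+    // With a park context, table 7 serialization uses its dedicated pinned block
+    // instead of borrowing the meta buffer.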
+ StackAllocator hostAlloc = cx.parkContext
+ ? StackAllocator( cx.parkContext->table7Memory.Ptr(), cx.parkContext->table7Memory.Length() )
+ : StackAllocator( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 );
+
 uint32* hostC1Buffer        = hostAlloc.CAlloc<uint32>( c1TotalEntries );
 uint32* hostC2Buffer        = hostAlloc.CAlloc<uint32>( c2TotalEntries );
 uint32* hostLastParkEntries = hostAlloc.CAlloc<uint32>( kCheckpoint1Interval );
byte* hostLastParkBuffer = (byte*)hostAlloc.CAlloc( kCheckpoint1Interval );
-    byte* hostCompressedParks = hostAlloc.AllocT<byte>( totalParkSize );
-
+    byte* hostCompressedParks = cx.parkContext ? nullptr : hostAlloc.AllocT<byte>( totalParkSize );
+
byte* hostParkWriter = hostCompressedParks;
uint32* hostC1Writer = hostC1Buffer;
@@ -606,8 +875,6 @@ void FinalizeTable7( CudaK32PlotContext& cx )
const size_t parkBufferSize = kCheckpoint1Interval * sizeof( uint32 );
- GpuDownloadBuffer& parkDownloader = cx.metaOut;
-
cudaStream_t mainStream = cx.computeStream;
cudaStream_t metaStream = cx.computeStream;//B;
cudaStream_t pairsStream = cx.computeStream;//C;
@@ -616,7 +883,7 @@ void FinalizeTable7( CudaK32PlotContext& cx )
// Load CTable
 FSE_CTable* devCTable = devAlloc.AllocT<FSE_CTable>( sizeof( CTable_C3 ), sizeof( uint64 ) );
CudaErrCheck( cudaMemcpyAsync( devCTable, CTable_C3, sizeof( CTable_C3 ), cudaMemcpyHostToDevice, cx.computeStream ) );
-
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
// Prepare plot tables
cx.plotWriter->ReserveTableSize( PlotTable::C1, c1TableSizeBytes );
@@ -627,7 +894,6 @@ void FinalizeTable7( CudaK32PlotContext& cx )
uint32 retainedC3EntryCount = 0;
uint32* devYSorted = cx.devYWork + kCheckpoint1Interval;
-
uint32* sortKeyIn = (uint32*)cx.devMatches;
uint32* sortKeyOut = cx.devSortKey;
@@ -732,13 +998,42 @@ void FinalizeTable7( CudaK32PlotContext& cx )
// Download compressed parks to host
const size_t parkDownloadSize = c3ParkSize * parkCount;
+
+ if( cx.parkContext )
+ {
+ ASSERT( parkDownloadSize <= cx.parkContext->parkBufferChain->BufferSize() );
+
+ // Override the park buffer to be used when using a park context
+ hostParkWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+ // Wait for the next park buffer to be available to be used for download
+ parkDownloader.HostCallback([&cx]{
+ (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+ });
+ }
+
+ const bool directOverride = cx.parkContext != nullptr;
+
parkDownloader.DownloadWithCallback( hostParkWriter, parkDownloadSize,
[]( void* parksBuffer, size_t size, void* userData ) {
 auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+
cx.plotWriter->WriteTableData( parksBuffer, size );
- }, &cx, mainStream );
+
+ // Release the buffer after the plot writer is done with it.
+ if( cx.parkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
+
+ }, &cx, mainStream, directOverride );
hostParkWriter += parkDownloadSize;
+
+ if( cx.parkContext )
+ hostParkWriter = nullptr;
}
// Download c1 entries
@@ -788,8 +1083,6 @@ void FinalizeTable7( CudaK32PlotContext& cx )
// Cleanup
- // cx.sortedPairsLOut.WaitForCopyCompletion();
- // cx.sortedPairsROut.WaitForCopyCompletion();
cx.sortedPairsLOut.WaitForCompletion();
cx.sortedPairsROut.WaitForCompletion();
cx.sortedPairsLOut.Reset();
@@ -797,6 +1090,18 @@ void FinalizeTable7( CudaK32PlotContext& cx )
cx.prevTablePairOffset = 0;
+ // Restore disk buffer on repurposed meta download stream
+ parkDownloader.AssignDiskBuffer( metaDiskBuffer );
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)TableId::Table7]->Swap();
+ cx.diskContext->tablesR[(int)TableId::Table7]->Swap();
+
+ if( cx.cfg.hybrid16Mode )
+ cx.diskContext->yBuffer->Swap();
+ }
+
auto elapsed = TimerEnd( timer );
Log::Line( "Finalized Table 7 in %.2lf seconds.", elapsed );
}
@@ -834,7 +1139,7 @@ __global__ void CudaCompressTable( const uint32* entryCount, const uint32* inLEn
const uint32 x0 = inLEntries[pair.left ];
const uint32 x1 = inLEntries[pair.right];
- // Convert to linepoint
+ // Convert to linepoint
if constexpr ( UseLP )
outREntries[gid] = (uint32)CudaSquareToLinePoint64( x1 >> bitShift, x0 >> bitShift );
else
@@ -850,7 +1155,7 @@ void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t str
const uint32 kthreads = 256;
const uint32 kblocks = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)kthreads );
-
+
if( isCompressedInput )
{
const bool isFinalTable = cx.table == TableId::Table1 + (TableId)cx.gCfg->numDroppedTables;
@@ -870,7 +1175,7 @@ void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t str
//-----------------------------------------------------------
void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
{
- const bool writeVertical = CudaK32PlotIsOutputInterleaved( cx );
+ const bool writeVertical = CudaK32PlotIsOutputVertical( cx );
const size_t metaMultiplier = GetTableMetaMultiplier( cx.table );
const bool downloadCompressed = cx.table > TableId::Table1 && (uint32)cx.table <= cx.gCfg->numDroppedTables;
@@ -879,8 +1184,8 @@ void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
uint32* hostY = cx.hostY;
uint32* hostMeta = cx.hostMeta;
- uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
- uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+ uint32* hostPairsL = cx.hostTableL;
+ uint16* hostPairsR = cx.hostTableR;
Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left;
const size_t startOffset = cx.bucket * ( writeVertical ? BBCU_MAX_SLICE_ENTRY_COUNT : BBCU_BUCKET_ALLOC_ENTRY_COUNT ); // vertical: offset to starting col. horizontal: to starting row
@@ -896,7 +1201,7 @@ void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
{
const size_t metaSizeMultiplier = metaMultiplier == 3 ? 4 : metaMultiplier;
const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
-
+
const size_t metaSrcStride = srcStride * metaSize;
const size_t metaDstStride = dstStride * sizeof( K32Meta4 );
const size_t metaWidth = width * metaSize;
@@ -927,20 +1232,23 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
const TableId rTable = cx.table;
const TableId inTable = rTable - 1;
- uint32 metaMultiplier = GetTableMetaMultiplier( inTable );
+ const uint32 metaMultiplier = GetTableMetaMultiplier( inTable );
const uint32 inIdx = CudaK32PlotGetInputIndex( cx );
- const bool readVertical = CudaK32PlotIsOutputInterleaved( cx );
+ const bool readVertical = CudaK32PlotIsOutputVertical( cx );
const uint32* hostY = cx.hostY;
const uint32* hostMeta = cx.hostMeta;
- const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
- const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+ const uint32* hostPairsL = cx.hostTableL;
+ const uint16* hostPairsR = cx.hostTableR;
const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2;
const Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left; // Table 2 will use table 5, and overflow onto 6
+ if( cx.cfg.hybrid128Mode )
+ t2HostPairs = (Pair*)hostPairsL;
+
uint32 stride = BBCU_BUCKET_ALLOC_ENTRY_COUNT; // Start as vertical
size_t offset = (size_t)bucket * BBCU_MAX_SLICE_ENTRY_COUNT;
@@ -974,7 +1282,7 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
cx.pairsRIn.UploadArrayT( hostPairsR + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
}
}
-
+
// Meta
if( metaMultiplier > 0 )
{
@@ -982,11 +1290,13 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
auto actualMetaStream = inTable == TableId::Table1 ? cx.computeStream : metaStream;
+
cx.metaIn.UploadArray( hostMeta + offset * 4, BBCU_BUCKET_COUNT, metaSize, stride * sizeof( K32Meta4 ), BBCU_BUCKET_COUNT, counts, actualMetaStream );
}
}
+
///
/// Allocations
///
@@ -1002,13 +1312,19 @@ void AllocBuffers( CudaK32PlotContext& cx )
cx.hostTempAllocSize = 0;
cx.devAllocSize = 0;
+    // If in <= 128G mode or not using direct downloads,
+    // we need to use a separate buffer for downloading parks, instead of re-using existing ones.
+ const bool allocateParkBuffers = cx.downloadDirect || cx.cfg.hybrid128Mode;
+ size_t parksPinnedSize = 0;
+
// Gather the size needed first
{
CudaK32AllocContext acx = {};
acx.alignment = alignment;
acx.dryRun = true;
-
+
DummyAllocator pinnedAllocator;
DummyAllocator hostTableAllocator;
DummyAllocator hostTempAllocator;
@@ -1020,7 +1336,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
acx.devAllocator = &devAllocator;
AllocateP1Buffers( cx, acx );
-
cx.pinnedAllocSize = pinnedAllocator .Size();
cx.hostTableAllocSize = hostTableAllocator.Size();
cx.hostTempAllocSize = hostTempAllocator .Size();
@@ -1033,7 +1348,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
devAllocator = {};
CudaK32PlotPhase2AllocateBuffers( cx, acx );
-
cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1046,15 +1360,23 @@ void AllocBuffers( CudaK32PlotContext& cx )
devAllocator = {};
CudaK32PlotPhase3AllocateBuffers( cx, acx );
-
cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() );
+
+        // May need to allocate extra pinned memory for the park serialization buffers
+ if( allocateParkBuffers )
+ {
+ pinnedAllocator = {};
+ AllocateParkSerializationBuffers( cx, *acx.pinnedAllocator, acx.dryRun );
+ parksPinnedSize = pinnedAllocator.Size();
+ }
}
- size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize;
- size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize;
+
+ const size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
+ const size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize;
Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
(double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
@@ -1070,43 +1392,46 @@ void AllocBuffers( CudaK32PlotContext& cx )
Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
(double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );
- Log::Line( "Allocating buffers" );
// Now actually allocate the buffers
+ Log::Line( "Allocating buffers..." );
CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );
#if _DEBUG
cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize );
#else
- #if !_WIN32
- // if( cx.downloadDirect )
+
+ bool allocateHostTablesPinned = cx.downloadDirect;
+ #if _WIN32
+        // On Windows we always force the use of intermediate pinned buffers, so the host tables are allocated as regular (non-pinned) host memory
+ allocateHostTablesPinned = false;
+ #endif
+
+ // Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
+ if( allocateHostTablesPinned )
CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
- // else
- // {
- // // #TODO: On windows, first check if we have enough shared memory (512G)?
- // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers.
- #else
+ else
cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize );
- #endif
- // }
#endif
- //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-
cx.hostBufferTemp = nullptr;
-#if _DEBUG
- cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize );
-#endif
- if( cx.hostBufferTemp == nullptr )
+ #if _DEBUG || _WIN32
+ if( cx.hostTempAllocSize )
+ cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize );
+ #endif
+
+ if( cx.hostBufferTemp == nullptr && cx.hostTempAllocSize )
CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) );
CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) );
// Warm start
- if( true )
+ if( true )// cx.gCfg->warmStart )
{
- FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer, cx.pinnedAllocSize );
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer , cx.pinnedAllocSize );
FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize );
- FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize );
+
+ if( cx.hostTempAllocSize )
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize );
}
{
@@ -1114,7 +1439,7 @@ void AllocBuffers( CudaK32PlotContext& cx )
acx.alignment = alignment;
acx.dryRun = false;
-
+
StackAllocator pinnedAllocator ( cx.pinnedBuffer , cx.pinnedAllocSize );
StackAllocator hostTableAllocator( cx.hostBufferTables, cx.hostTableAllocSize );
StackAllocator hostTempAllocator ( cx.hostBufferTemp , cx.hostTempAllocSize );
@@ -1137,106 +1462,254 @@ void AllocBuffers( CudaK32PlotContext& cx )
hostTempAllocator .PopToMarker( 0 );
devAllocator .PopToMarker( 0 );
CudaK32PlotPhase3AllocateBuffers( cx, acx );
+
+ if( allocateParkBuffers )
+ {
+ // Fine to leak. App-lifetime buffer
+ void* parksBuffer = nullptr;
+ CudaErrCheck( cudaMallocHost( &parksBuffer, parksPinnedSize, cudaHostAllocDefault ) );
+ StackAllocator parkAllocator( parksBuffer, parksPinnedSize );
+ AllocateParkSerializationBuffers( cx, parkAllocator, acx.dryRun );
+ }
}
}
//-----------------------------------------------------------
void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
- const size_t alignment = acx.alignment;
+ const size_t alignment = acx.alignment;
+ const bool isCompressed = cx.gCfg->compressionLevel > 0;
+ const TableId firstTable = cx.firstStoredTable;
- const bool isCompressed = cx.gCfg->compressionLevel > 0;
+ const FileFlags tmp1FileFlags = cx.cfg.temp1DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
+ const FileFlags tmp2FileFlags = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
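+    // temp1 backs the per-table L/R pair files, while temp2 backs the transient
+    // bucketed buffers (y, metadata and unsorted pairs) used during plotting.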
- // #TODO: Re-optimize usage here again for windows running 256G
/// Host allocations
{
// Temp allocations are pinned host buffers that can be re-used for other means in different phases.
// This is roughly equivalent to temp2 dir during disk plotting.
- cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
- cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
- const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
- cx.hostMarkingTables[0] = nullptr;
- cx.hostMarkingTables[1] = isCompressed ? nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ if( !cx.cfg.hybrid16Mode )
+ {
+ cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
+ }
+ else if( !cx.diskContext->metaBuffer )
+ {
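+        // With hybrid16Mode enabled the y and metadata buffers are not kept in host RAM;
+        // back them with bucketed disk buffers on the temp2 directory instead.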
+ const size_t ySliceSize = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT;
+ const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT;
-
- // NOTE: The first table has their values inlines into the backpointers of the next table
- cx.hostBackPointers[0] = {};
+ cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::Y_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->yBuffer, "Failed to create y disk buffer." );
- const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
-
- Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
- cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr };
+ cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::META_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." );
+ }
- for( TableId table = firstTable + 1; table <= TableId::Table7; table++ )
- cx.hostBackPointers[(int)table] = { acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ), acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ) };
+ // Marking tables used to prune back pointers
+ {
+ const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
+
+ cx.hostMarkingTables[0] = nullptr;
+ cx.hostMarkingTables[1] = isCompressed ? nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ }
+
+ if( !cx.cfg.hybrid128Mode )
+ {
+        // NOTE: The first table has its values inlined into the backpointers of the next table
+ cx.hostBackPointers[0] = {};
- cx.hostTableL = cx.hostBackPointers[6].left; // Also used for Table 7
- cx.hostTableR = cx.hostBackPointers[6].right;
- cx.hostTableSortedL = cx.hostBackPointers[5].left;
- cx.hostTableSortedR = cx.hostBackPointers[5].right;
+ Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+
+ cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr };
+
+ for( TableId table = firstTable + 1; table <= TableId::Table7; table++ )
+ {
+ cx.hostBackPointers[(int)table] = {
+ acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ),
+ acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment )
+ };
+ }
+
+        // These buffers, belonging to table 7, are re-used to store
+        // the unsorted back-pointers. For this to work, table 7 must be
+        // read horizontally (see CudaK32PlotIsOutputVertical()), so that
+        // when we store the sorted pairs we don't overwrite the
+        // unsorted data from other buckets.
+ cx.hostTableL = cx.hostBackPointers[6].left;
+ cx.hostTableR = cx.hostBackPointers[6].right;
+ }
+ else
+ {
+ char tableName[] = "table_l_000.tmp";
+
+ size_t multiplier = 2; // First table entries are Pair, not uint32s...
+
+ #if BBCU_DBG_SKIP_PHASE_1
+ const FileMode fileMode = FileMode::Open;
+ #else
+ const FileMode fileMode = FileMode::Create;
+ #endif
+
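+        // In hybrid 128G mode the per-table back-pointers are stored in numbered temp1
+        // files rather than host RAM. The first stored table holds inlined Pair entries,
+        // so its L file is twice as large; later tables store split L (uint32) / R (uint16) entries.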
+ for( TableId table = firstTable; table <= TableId::Table7; table++ )
+ {
+ if( cx.diskContext->tablesL[(int)table] == nullptr )
+ {
+ sprintf( tableName, "table_l_%d.tmp", (int32)table+1 );
+ cx.diskContext->tablesL[(int)table] = DiskBuffer::Create(
+ *cx.diskContext->temp1Queue, tableName, BBCU_BUCKET_COUNT, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT * multiplier,
+ fileMode, FileAccess::ReadWrite, tmp1FileFlags );
+
+ FatalIf( !cx.diskContext->tablesL[(int)table], "Failed to create table %d L disk buffer.", (int)table+1 );
+ }
+
+ if( table > firstTable && cx.diskContext->tablesR[(int)table] == nullptr )
+ {
+ sprintf( tableName, "table_r_%d.tmp", (int32)table+1 );
+ cx.diskContext->tablesR[(int)table] = DiskBuffer::Create(
+ *cx.diskContext->temp1Queue, tableName, BBCU_BUCKET_COUNT, sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ fileMode, FileAccess::ReadWrite, tmp1FileFlags );
+
+ FatalIf( !cx.diskContext->tablesR[(int)table], "Failed to create table %d R disk buffer.", (int)table+1 );
+ }
+
+ multiplier = 1;
+ }
+
+        // When storing the unsorted inlined x's we don't have enough space in RAM, so we store them on disk instead.
+ const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair );
+ cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::LPAIRS_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." );
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp",
+ BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted R disk buffer." );
+ }
+ else
+ {
+            // In 128G mode we can store the intermediate pairs in host RAM
+ cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ }
+ }
}
/// Device & Pinned allocations
{
- // #NOTE: The R pair is allocated as uint32 because for table 2 we want to download them as inlined x's, so we need 2 uint32 buffers
- /// Device/Pinned allocations
- // cx.yOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- // cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.yOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer ( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
- cx.metaOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
-
- // These download buffers share the same backing buffers
+ GpuStreamDescriptor yDesc{};
+ yDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ yDesc.sliceCount = BBCU_BUCKET_COUNT;
+ yDesc.sliceAlignment = alignment;
+ yDesc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ yDesc.deviceAllocator = acx.devAllocator;
+        yDesc.pinnedAllocator  = nullptr;                        // Start in direct mode (no intermediate pinned buffers)
+
+ // In disk-backed mode, we always have pinned buffers,
+ // which are the same buffers used to write and read from disk.
+ GpuStreamDescriptor descTablePairs = yDesc;
+ GpuStreamDescriptor descTableSortedPairs = yDesc;
+ GpuStreamDescriptor descXPairs = yDesc;
+ GpuStreamDescriptor descMeta = yDesc;
+
+ if( cx.cfg.hybrid128Mode )
{
+ // Temp 1 Queue
+ descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+ descTableSortedPairs.sliceAlignment = cx.diskContext->temp1Queue->BlockSize();
+
+ // Temp 2 Queue
+ descXPairs.pinnedAllocator = acx.pinnedAllocator;
+ descXPairs.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ yDesc.pinnedAllocator = acx.pinnedAllocator;
+ yDesc.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+
+ descMeta.pinnedAllocator = acx.pinnedAllocator;
+ descMeta.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+
+ descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+ descTablePairs.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+ }
+ }
+
+ if( !cx.downloadDirect )
+ {
+ // Use intermediate pinned buffer for transfers to non-pinned destinations
+ yDesc.pinnedAllocator = acx.pinnedAllocator;
+ descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+ descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+ descXPairs.pinnedAllocator = acx.pinnedAllocator;
+ descMeta.pinnedAllocator = acx.pinnedAllocator;
+ }
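+        // At this point each descriptor is in one of three modes: direct (no pinned
+        // staging buffer), pinned staging (when direct downloads are disabled), or
+        // disk-backed (pinned buffers shared with the temp1/temp2 disk queues).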
+
+
+ ///
+ /// Downloads
+ ///
+ cx.yOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( yDesc, acx.dryRun );
+ cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descMeta, acx.dryRun );
+
+ {
+ // These download buffers share the same backing buffers
const size_t devMarker = acx.devAllocator->Size();
const size_t pinnedMarker = acx.pinnedAllocator->Size();
- cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
- cx.pairsROut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );
+ cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );
acx.devAllocator->PopToMarker( devMarker );
acx.pinnedAllocator->PopToMarker( pinnedMarker );
// Allocate Pair at the end, to ensure we grab the highest value
- cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
}
- // These download buffers share the same backing buffers
{
+ // These download buffers share the same backing buffers
const size_t devMarker = acx.devAllocator->Size();
const size_t pinnedMarker = acx.pinnedAllocator->Size();
- cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );
+ cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );
acx.devAllocator->PopToMarker( devMarker );
acx.pinnedAllocator->PopToMarker( pinnedMarker );
// Allocate Pair at the end, to ensure we grab the highest value
- cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
}
- cx.yIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ ///
+ /// Uploads
+ ///
+ cx.yIn = cx.gpuUploadStream[0]->CreateUploadBufferT( yDesc, acx.dryRun );
+ cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descMeta, acx.dryRun );
// These uploaded buffers share the same backing buffers
{
const size_t devMarker = acx.devAllocator->Size();
const size_t pinnedMarker = acx.pinnedAllocator->Size();
- cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );
+ cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );
acx.devAllocator->PopToMarker( devMarker );
acx.pinnedAllocator->PopToMarker( pinnedMarker );
// Allocate Pair at the end, to ensure we grab the highest value
- cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descXPairs, acx.dryRun );
}
/// Device-only allocations
@@ -1268,9 +1741,56 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
cx.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, alignment );
cx.hostBucketSlices = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, alignment );
}
+
+ /// In disk-backed mode, assign disk buffers to gpu buffers
+ if( cx.cfg.hybrid128Mode && !acx.dryRun )
+ {
+ cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+ cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+ cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+ cx.pairsROut.AssignDiskBuffer( cx.diskContext->unsortedR );
+ cx.pairsRIn .AssignDiskBuffer( cx.diskContext->unsortedR );
+
+ cx.yOut.AssignDiskBuffer( cx.diskContext->yBuffer );
+ cx.yIn .AssignDiskBuffer( cx.diskContext->yBuffer );
+
+ cx.metaOut.AssignDiskBuffer( cx.diskContext->metaBuffer );
+ cx.metaIn .AssignDiskBuffer( cx.diskContext->metaBuffer );
+ }
+ }
+}
+
+//-----------------------------------------------------------
+void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinnedAllocator, bool dryRun )
+{
+ ASSERT( cx.parkContext );
+
+ auto& pc = *cx.parkContext;
+ pc.maxParkBuffers = 3;
+
+ // Get the largest park size
+ const size_t maxParkSize = cx.cfg.gCfg->compressionLevel == 0 ?
+ CalculateParkSize( TableId::Table1 ) :
+ GetLargestCompressedParkSize();
+
+ const size_t parksPerBuffer = CDivT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+ // CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kCheckpoint1Interval ) + 1; // Need an extra park for left-over entries
+ const size_t bucketParkBufferSize = parksPerBuffer * maxParkSize;
+ const size_t alignment = 4096;
+
+ // Allocate some extra space for C tables (see FinalizeTable7)
+ pc.hostRetainedLinePoints = pinnedAllocator.CAlloc( kEntriesPerPark );
+ pc.table7Memory = pinnedAllocator.CAllocSpan( 8 MiB, alignment );
+ pc.parkBufferChain = BufferChain::Create( pinnedAllocator, pc.maxParkBuffers, bucketParkBufferSize, alignment, dryRun );
}
+
///
/// Debug
///
@@ -1278,6 +1798,9 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
void DbgWritePairs( CudaK32PlotContext& cx, const TableId table )
{
+ if( cx.cfg.hybrid128Mode )
+ return;
+
const TableId earliestTable = TableId::Table1 + (TableId)cx.gCfg->numDroppedTables+1;
if( table < earliestTable )
return;
@@ -1332,7 +1855,7 @@ void DbgWriteContext( CudaK32PlotContext& cx )
Log::Line( "[DEBUG] Writing context file." );
FileStream contxetFile;
sprintf( path, "%scontext.tmp", DBG_BBCU_DBG_DIR );
- FatalIf( !contxetFile.Open( path, FileMode::Create, FileAccess::Write ), "Failed to open context file." );
+ FatalIf( !contxetFile.Open( path, FileMode::Create, FileAccess::Write ), "Failed to open context file at '%s'.", path );
FatalIf( contxetFile.Write( &cx, sizeof( CudaK32PlotContext ) ) != (ssize_t)sizeof( CudaK32PlotContext ), "Failed to write context data." );
contxetFile.Close();
@@ -1360,7 +1883,7 @@ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables )
memcpy( cx.bucketSlices, tmpCx.bucketSlices, sizeof( tmpCx.bucketSlices ) );
memcpy( cx.tableEntryCounts, tmpCx.tableEntryCounts, sizeof( tmpCx.tableEntryCounts ) );
}
-
+
if( !loadTables )
return;
@@ -1384,8 +1907,11 @@ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables )
}
}
-void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyToPinnedBuffer )
+void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool useDiskHybridData )
{
+ if( cx.cfg.hybrid128Mode )
+ return;
+
char lPath[512];
char rPath[512];
@@ -1393,57 +1919,227 @@ void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyTo
if( table < earliestTable )
return;
- // for( TableId table = TableId::Table2; table <= TableId::Table7; table++ )
+ const uint64 entryCount = cx.tableEntryCounts[(int)table];
+ Pairs& pairs = cx.hostBackPointers[(int)table];
+
{
Log::Line( "[DEBUG] Loading table %d", (int)table + 1 );
sprintf( lPath, "%st%d.l.tmp", DBG_BBCU_DBG_DIR, (int)table + 1 );
sprintf( rPath, "%st%d.r.tmp", DBG_BBCU_DBG_DIR, (int)table + 1 );
- const uint64 entryCount = cx.tableEntryCounts[(int)table];
// cx.hostBackPointers[(int)table].left = bbcvirtallocbounded( entryCount );
// cx.hostBackPointers[(int)table].right = bbcvirtallocbounded( entryCount );
- Pairs& pairs = cx.hostBackPointers[(int)table];
int err;
- if( table == earliestTable )
+ static DiskQueue* diskQueue = nullptr;
+
+ // Load disk-hybrid tables
+        // #NOTE: Enable this (and disable the block below it) to load tables from
+ // the disk-hybrid output. Also adjust path in the DiskQueue below.
+
+ // useDiskHybridData = true;
+ if( useDiskHybridData )
{
- FatalIf( !IOJob::ReadFromFile( lPath, pairs.left, entryCount * sizeof( Pair ), err ), "Failed to read table X pairs: %d", err );
+ if( diskQueue == nullptr )
+ diskQueue = new DiskQueue( "/home/harold/plotdisk" );
+
+ char lname[64] = {};
+ sprintf( lname, "table_l_%d.tmp", (int)table + 1 );
+
+ if( table == earliestTable )
+ {
+ DiskBuffer* buf = DiskBuffer::Create( *diskQueue, lname, BBCU_BUCKET_COUNT, sizeof( Pair ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering );
+ PanicIf( !buf, "No table file" );
+
+ VirtualAllocator valloc;
+ buf->ReserveBuffers( valloc );
+
+ Span pairsWriter( (Pair*)pairs.left, BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ buf->ReadNextBucket();
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const size_t bucketLength = cx.bucketCounts[(int)table][bucket];
+
+ buf->TryReadNextBucket();
+ auto entries = buf->GetNextReadBufferAs().SliceSize( bucketLength );
+
+ entries.CopyTo( pairsWriter );
+ pairsWriter = pairsWriter.Slice( entries.Length() );
+ }
+
+ delete buf;
+ }
+ else
+ {
+ char rname[64] = {};
+ sprintf( rname, "table_r_%d.tmp", (int)table + 1 );
+
+ DiskBuffer* lBuf = DiskBuffer::Create( *diskQueue, lname, BBCU_BUCKET_COUNT, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering );
+ DiskBuffer* rBuf = DiskBuffer::Create( *diskQueue, rname, BBCU_BUCKET_COUNT, sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering );
+ PanicIf( !lBuf, "No table L file" );
+ PanicIf( !rBuf, "No table R file" );
+
+ VirtualAllocator valloc;
+ lBuf->ReserveBuffers( valloc );
+ rBuf->ReserveBuffers( valloc );
+
+ Span lWriter( pairs.left , BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ Span rWriter( pairs.right, BBCU_TABLE_ALLOC_ENTRY_COUNT );
+
+ lBuf->ReadNextBucket();
+ rBuf->ReadNextBucket();
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const size_t bucketLength = cx.bucketCounts[(int)table][bucket];
+
+ lBuf->TryReadNextBucket();
+ rBuf->TryReadNextBucket();
+
+ auto lEntries = lBuf->GetNextReadBufferAs().SliceSize( bucketLength );
+ lEntries.CopyTo( lWriter );
+
+ auto rEntries = rBuf->GetNextReadBufferAs().SliceSize( bucketLength );
+ rEntries.CopyTo( rWriter );
+
+ lWriter = lWriter.Slice( lEntries.Length() );
+ rWriter = rWriter.Slice( rEntries.Length() );
+ }
+
+ delete lBuf;
+ delete rBuf;
+ }
}
else
{
- FatalIf( !IOJob::ReadFromFile( lPath, pairs.left , entryCount * sizeof( uint32 ), err ), "Failed to read table L pairs: %d", err );
-
- // if( (uint32)table > cx.gCfg->numDroppedTables )
- FatalIf( !IOJob::ReadFromFile( rPath, pairs.right, entryCount * sizeof( uint16 ), err ), "Failed to read table R pairs: %d", err );
- }
-
- // We expect table 7 to also be found in these buffers, so copy it
- // if( table == TableId::Table7 )
- if( copyToPinnedBuffer )
- {
- bbmemcpy_t( cx.hostTableSortedL, pairs.left , entryCount );
- bbmemcpy_t( cx.hostTableSortedR, pairs.right, entryCount );
+ if( table == earliestTable )
+ {
+ FatalIf( !IOJob::ReadFromFile( lPath, pairs.left, entryCount * sizeof( Pair ), err ), "Failed to read table X pairs: %d", err );
+ }
+ else
+ {
+ FatalIf( !IOJob::ReadFromFile( lPath, pairs.left , entryCount * sizeof( uint32 ), err ), "Failed to read table L pairs: %d", err );
+
+ // if( (uint32)table > cx.gCfg->numDroppedTables )
+ FatalIf( !IOJob::ReadFromFile( rPath, pairs.right, entryCount * sizeof( uint16 ), err ), "Failed to read table R pairs: %d", err );
+ }
}
}
+
+ // if( table == earliestTable && !useDiskHybridData )
+ // {
+ // uint64* tmpBucket = bbcvirtallocboundednuma( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+
+ // std::vector hashesRam{};
+ // std::vector hashesDisk{};
+
+ // byte hash[32];
+ // char hashstr[sizeof(hash)*2+1] = {};
+
+ // for( uint32 run = 0; run < 2; run++ )
+ // {
+ // auto& hashes = run == 0 ? hashesRam : hashesDisk;
+
+ // uint64* xs = (uint64*)pairs.left;
+
+ // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ )
+ // {
+ // const uint64 bucketEntryCount = cx.bucketCounts[(int)table][b];
+
+ // RadixSort256::Sort( DbgGetThreadPool( cx ), xs, tmpBucket, bucketEntryCount );
+
+ // // Hash
+ // {
+ // blake3_hasher hasher;
+ // blake3_hasher_init( &hasher );
+ // blake3_hasher_update( &hasher, xs, bucketEntryCount * sizeof( uint64 ) );
+ // blake3_hasher_finalize( &hasher, hash, sizeof( hash ) );
+
+ // size_t _;
+ // BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ );
+ // Log::Line( "[%3u] : 0x%s", b, hashstr );
+
+ // hashes.push_back( hashstr );
+
+ // // DbgPrintHash( " :", xs, sizeof( uint64 ) * bucketEntryCount );
+ // }
+
+ // xs += bucketEntryCount;
+ // }
+
+ // if( run == 0 )
+ // {
+ // DbgLoadTablePairs( cx, table, true );
+ // }
+ // }
+
+ // // Compare hashes
+ // {
+ // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ )
+ // {
+ // if( hashesRam[b] != hashesDisk[b] )
+ // {
+ // Panic( "Hash mismatch at bucket %u. %s != %s", b, hashesRam[b].c_str(), hashesDisk[b].c_str() );
+ // }
+ // }
+ // Log::Line( "All hashes match!" );
+ // }
+
+
+ // // DbgPrintHash( "Inlined X Table", cx.hostBackPointers[(int)table].left, sizeof( Pair ) * cx.tableEntryCounts[(int)table] );
+ // Log::Line( "" );
+ // bbvirtfreebounded( tmpBucket );
+ // Exit( 0 );
+ // }
+ // else
+ // {
+ // // DbgPrintHash( "L Table", cx.hostBackPointers[(int)table].left, sizeof( uint32 ) * cx.tableEntryCounts[(int)table] );
+ // // DbgPrintHash( "R Table", cx.hostBackPointers[(int)table].right, sizeof( uint16 ) * cx.tableEntryCounts[(int)table] );
+ // // Log::Line( "" );
+ // }
+
+ // Sort inlined xs
+ // if( table == earliestTable )
+ // {
+ // uint64* tmpBucket = bbcvirtallocboundednuma( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ // uint64* xs = (uint64*)pairs.left;
+
+ // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ )
+ // {
+ // const uint64 bucketEntryCount = cx.bucketCounts[(int)table][b];
+ // RadixSort256::Sort( DbgGetThreadPool( cx ), xs, tmpBucket, bucketEntryCount );
+ // xs += bucketEntryCount;
+ // }
+
+ // DbgPrintHash( "pre_sorted_xs", pairs.left, sizeof( uint64 ) * entryCount );
+ // }
+
Log::Line( "[DEBUG] Done." );
}
-
void DbgLoadMarks( CudaK32PlotContext& cx )
{
char path[512];
+ std::string baseUrl = DBG_BBCU_DBG_DIR;
+ if( cx.cfg.hybrid128Mode )
+ baseUrl += "disk/";
+
// const size_t tableSize = ((1ull << BBCU_K) / 64) * sizeof(uint64);
Log::Line( "[DEBUG] Loadinging marking tables" );
- const TableId startTable = TableId::Table2 + cx.gCfg->numDroppedTables;
+ const TableId startTable = cx.firstStoredTable;
for( TableId table = startTable; table < TableId::Table7; table++ )
{
- sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
+ sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 );
int err = 0;
cx.hostMarkingTables[(int)table] = (uint64*)IOJob::ReadAllBytesDirect( path, err );
diff --git a/cuda/CudaPlotter.h b/cuda/CudaPlotter.h
index ebe30f67..ddcbfed2 100644
--- a/cuda/CudaPlotter.h
+++ b/cuda/CudaPlotter.h
@@ -9,10 +9,22 @@ struct CudaK32PlotConfig
{
const GlobalPlotConfig* gCfg = nullptr;
- uint32 deviceIndex = 0; // Which CUDA device to use when plotting//
- bool disableDirectDownloads = false; // Don't allocate host tables using pinned buffers, instead
- // download to intermediate pinned buffers then copy to the final host buffer.
- // May be necessarry on Windows because of shared memory limitations (usual 50% of system memory)
+    uint32  deviceIndex            = 0;        // Which CUDA device to use when plotting
+    bool    disableDirectDownloads = false;    // Don't allocate host tables using pinned buffers; instead,
+                                               // download to intermediate pinned buffers, then copy to the final host buffer.
+                                               // May be necessary on Windows because of shared memory limitations (usually 50% of system memory).
+
+ bool hybrid128Mode = false; // Enable hybrid disk-offload w/ 128G of RAM.
+ bool hybrid16Mode = false; // Enable hybrid disk-offload w/ 64G of RAM.
+
+ const char* temp1Path = nullptr; // For 128G RAM mode
+ const char* temp2Path = nullptr; // For 64G RAM mode
+
+ bool temp1DirectIO = true; // Use direct I/O for temp1 files
+ bool temp2DirectIO = true; // Use direct I/O for temp2 files
+
+    uint64  plotCheckCount      = 0;           // Number of proofs to check with the plot check command after plotting
+    double  plotCheckThreshhold = 0.6;         // Proof/check threshold below which plots will be deleted
};
class CudaK32Plotter : public IPlotter
@@ -28,4 +40,6 @@ class CudaK32Plotter : public IPlotter
private:
CudaK32PlotConfig _cfg = {};
struct CudaK32PlotContext* _cx = nullptr;;
-};
\ No newline at end of file
+};
+
+void CudaK32PlotterPrintHelp();
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
new file mode 100644
index 00000000..3d06973c
--- /dev/null
+++ b/cuda/GpuDownloadStream.cu
@@ -0,0 +1,385 @@
+#include "GpuStreams.h"
+#include "GpuQueue.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
+
+
+///
+/// DownloadBuffer
+///
+void* GpuDownloadBuffer::GetDeviceBuffer()
+{
+ const uint32 index = self->outgoingSequence % self->bufferCount;
+
+ CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
+
+ return self->deviceBuffer[index];
+}
+
+void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream )
+{
+ ASSERT( self->lockSequence >= self->outgoingSequence );
+ ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount );
+
+ const uint32 index = self->lockSequence % self->bufferCount;
+ self->lockSequence++;
+
+ // Wait for the device buffer to be free to be used by kernels
+ CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
+ return self->deviceBuffer[index];
+}
+
+void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size )
+{
+ Download2D( hostBuffer, size, 1, size, size );
+}
+
+void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride )
+{
+ Download2D( hostBuffer, size, 1, size, size, workStream, directOverride );
+}
+
+void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
+{
+ Panic( "Unavailable" );
+ // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
+ // ASSERT( hostBuffer );
+ // ASSERT( workStream );
+ // ASSERT( self->lockSequence > 0 );
+ // ASSERT( self->outgoingSequence < self->lockSequence );
+ // ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
+
+ // auto& cpy = self->copies[self->outgoingSequence];
+ // cpy.self = self;
+ // cpy.sequence = self->outgoingSequence;
+ // cpy.copy.hostBuffer = finalBuffer;
+ // cpy.copy.srcBuffer = hostBuffer;
+ // cpy.copy.size = size;
+
+
+ // const uint32 index = self->outgoingSequence % self->bufferCount;
+ // self->outgoingSequence++;
+
+ // void* pinnedBuffer = self->pinnedBuffer[index];
+ // const void* devBuffer = self->deviceBuffer[index];
+
+ // // Signal from the work stream when it has finished doing kernel work with the device buffer
+ // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
+
+
+ // // Ensure the work stream has completed writing data to the device buffer
+ // cudaStream_t stream = self->queue->_stream;
+
+ // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
+
+ // // Copy
+ // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) );
+
+ // // Signal that the device buffer is free to be re-used
+ // CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+
+ // // Launch copy command
+ // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+
+ // const CopyInfo& c = *reinterpret_cast( userData );
+ // IGpuBuffer* self = c.self;
+
+ // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
+ // cmd.copy.info = &c;
+
+ // self->queue->SubmitCommands();
+
+ // // Signal the download completed
+ // self->fence.Signal( ++self->completedSequence );
+ // }, &cpy ) );
+}
+
+void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
+{
+ Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride );
+}
+
+void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, cudaStream_t workStream, bool directOverride )
+{
+ Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride );
+}
+
+void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
+ GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
+{
+ PerformDownload2D( hostBuffer, width, height, dstStride, srcStride,
+ callback, userData,
+ workStream, directOverride );
+}
+
+void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
+ GpuDownloadCallback postCallback, void* postUserData,
+ cudaStream_t workStream, bool directOverride )
+{
+ PanicIf( !(hostBuffer || self->pinnedBuffer[0] ), "" );
+ ASSERT( workStream );
+ ASSERT( self->lockSequence > 0 );
+ ASSERT( self->outgoingSequence < self->lockSequence );
+ ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
+
+ const uint32 index = self->outgoingSequence++ % self->bufferCount;
+
+ void* pinnedBuffer = self->pinnedBuffer[index];
+ void* finalHostBuffer = hostBuffer;
+ const void* devBuffer = self->deviceBuffer[index];
+
+    const bool   isDirect         = (directOverride || self->pinnedBuffer[0] == nullptr) && !self->diskBuffer;
+    ASSERT( isDirect || self->pinnedBuffer[0] );
+ const bool isSequentialCopy = dstStride == srcStride;
+ const size_t totalSize = height * width;
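+
+    // Direct mode copies straight from the device into the caller's host buffer;
+    // otherwise the transfer is staged through our pinned buffer (and, when a disk
+    // buffer is assigned, written out to disk from there).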
+
+
+ // Signal from the work stream when it has finished doing kernel work with the device buffer
+ CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) );
+
+ // From the download stream, wait for the work stream to finish
+ cudaStream_t downloadStream = self->queue->_stream;
+ CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) );
+
+
+ if( self->diskBuffer )
+ {
+ // Wait until the next disk buffer is ready for use.
+ // This also signals that the pinned buffer is ready for re-use
+ CallHostFunctionOnStream( downloadStream, [this](){
+ self->diskBuffer->GetNextWriteBuffer();
+ });
+
+ pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 );
+ }
+
+ if( !isDirect )
+ {
+ // Ensure that the pinned buffer is ready for use
+ // (we signal pinned buffers are ready when using disks without events)
+ if( !self->diskBuffer )
+ CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->pinnedEvent[index] ) );
+
+ // Set host buffer as the pinned buffer
+ hostBuffer = pinnedBuffer;
+ }
+
+
+ // Copy from device to host buffer
+ // #NOTE: Since the pinned buffer is simply the same size (a full bucket) as the device buffer
+ // we also always copy as 1D if we're copying to our pinned buffer.
+ ASSERT( hostBuffer );
+ if( isSequentialCopy || hostBuffer == pinnedBuffer )
+ CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, downloadStream ) );
+ else
+ CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, downloadStream ) );
+
+ // Dispatch a host callback if one was set
+ if( postCallback )
+ {
+ CallHostFunctionOnStream( downloadStream, [=](){
+ (*postCallback)( finalHostBuffer, totalSize, postUserData );
+ });
+ }
+
+
+ // Signal that the device buffer is free to be re-used
+ CudaErrCheck( cudaEventRecord( self->deviceEvents[index], downloadStream ) );
+
+ if( self->diskBuffer )
+ {
+ // If it's a disk-based copy, then write the pinned buffer to disk
+ CallHostFunctionOnStream( downloadStream, [=]() {
+
+ auto* diskBucketBuffer = dynamic_cast( self->diskBuffer );
+ if( diskBucketBuffer != nullptr )
+ diskBucketBuffer->Submit( srcStride );
+ else
+ static_cast( self->diskBuffer )->Submit( totalSize );
+ });
+
+ // #NOTE: We don't need to signal that the pinned buffer is ready for re-use here as
+ // we do that implicitly with DiskBuffer::GetNextWriteBuffer (see above).
+ }
+ else if( !isDirect )
+ {
+ // #TODO: Do this in a different host copy stream, and signal from there.
+ // #MAYBE: Perhaps use multiple host threads/streams to do host-to-host copies.
+        //         For now we do it on the same download stream, which blocks it
+        //         unless other buffers use their own download streams.
+
+
+ ASSERT( hostBuffer == pinnedBuffer );
+ if( isSequentialCopy )
+ CudaErrCheck( cudaMemcpyAsync( finalHostBuffer, hostBuffer, totalSize, cudaMemcpyHostToHost, downloadStream ) );
+ else
+ CudaErrCheck( cudaMemcpy2DAsync( finalHostBuffer, dstStride, hostBuffer, srcStride, width, height, cudaMemcpyHostToHost, downloadStream ) );
+
+ // Signal the pinned buffer is free to be re-used
+ CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], downloadStream ) );
+ }
+}
+
+void GpuDownloadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function func )
+{
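+    // Copy the callable to the heap; the CUDA host-function callback frees it after running.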
+ auto* fnCpy = new std::function( std::move( func ) );
+ CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {
+
+ auto& fn = *reinterpret_cast*>( userData );
+ fn();
+ delete& fn;
+
+ }, fnCpy ) );
+}
+
+void GpuDownloadBuffer::HostCallback( std::function func )
+{
+ CallHostFunctionOnStream( self->queue->GetStream(), func );
+}
+
+void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
+ uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData )
+{
+ ASSERT( width );
+ ASSERT( height );
+ ASSERT( hostBuffer );
+
+ const uint32 index = self->outgoingSequence % self->bufferCount;
+
+ // We need to block until the pinned buffer is available.
+ if( self->outgoingSequence > self->bufferCount-1 )
+ self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
+
+ void* pinnedBuffer = self->pinnedBuffer[index];
+ const void* devBuffer = self->deviceBuffer[index];
+
+ //auto& cmd = self->commands[index];
+ //cmd.type = GpuQueue::CommandType::Copy2D;
+ //cmd.sequenceId = self->outgoingSequence++;
+ //cmd.finishedSignal = &self->fence;
+ //cmd.dstBuffer = hostBuffer;
+ //cmd.srcBuffer = pinnedBuffer;
+ //cmd.copy2d.width = width;
+ //cmd.copy2d.height = height;
+ //cmd.copy2d.dstStride = dstStride;
+ //cmd.copy2d.srcStride = srcStride;
+ //cmd.copy2d.callback = callback;
+ //cmd.copy2d.userData = userData;
+
+ outIndex = index;
+ outPinnedBuffer = pinnedBuffer;
+ outDevBuffer = devBuffer;
+}
+
+
+void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize )
+{
+ ASSERT( length );
+ ASSERT( elementSize );
+ ASSERT( counts );
+
+ uint32 totalElements = 0;
+ for( uint32 i = 0; i < length; i++ )
+ totalElements += counts[i];
+
+ const size_t totalSize = (size_t)totalElements * elementSize;
+
+ uint32 index;
+ void* pinnedBuffer;
+ const void* devBuffer;
+ GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer );
+
+
+ srcStride *= elementSize;
+
+ byte* dst = (byte*)pinnedBuffer;
+ const byte* src = (byte*)devBuffer;
+
+ cudaStream_t stream = self->queue->_stream;
+
+ // Copy all buffers from device to pinned buffer
+ for( uint32 i = 0; i < length; i++ )
+ {
+ const size_t copySize = counts[i] * (size_t)elementSize;
+
+ // #TODO: Determine if there's a cuda (jagged) array copy
+ CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) );
+
+ src += srcStride;
+ dst += copySize;
+ }
+
+ // Signal that the device buffer is free
+ CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+
+ // Submit command to do the final copy from pinned to host
+ CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) );
+}
+
+void GpuDownloadBuffer::WaitForCompletion()
+{
+ if( self->outgoingSequence > 0 )
+ {
+ //const uint32 index = (self->outgoingSequence - 1) % self->bufferCount;
+
+ // cudaEvent_t event = self->completedEvents[index];
+ //const cudaError_t r = cudaEventQuery( event );
+
+ //if( r == cudaSuccess )
+ // return;
+
+ //if( r != cudaErrorNotReady )
+ // CudaErrCheck( r );
+
+ //CudaErrCheck( cudaEventSynchronize( event ) );
+
+
+ cudaStream_t downloadStream = self->queue->_stream;
+ // this->self->fence.Reset( 0 );
+ CallHostFunctionOnStream( downloadStream, [this](){
+ this->self->fence.Signal( this->self->outgoingSequence );
+ });
+ self->fence.Wait( self->outgoingSequence );
+
+ }
+}
+
+void GpuDownloadBuffer::WaitForCopyCompletion()
+{
+ if( self->outgoingSequence > 0 )
+ {
+ self->copyFence.Wait( self->outgoingSequence );
+ }
+}
+
+void GpuDownloadBuffer::Reset()
+{
+ self->lockSequence = 0;
+ self->outgoingSequence = 0;
+ self->completedSequence = 0;
+ self->copySequence = 0;
+ self->fence.Reset( 0 );
+ self->copyFence.Reset( 0 );
+}
+
+GpuQueue* GpuDownloadBuffer::GetQueue() const
+{
+ return self->queue;
+}
+
+void GpuDownloadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer )
+{
+ // ASSERT( self->pinnedBuffer[0] );
+
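+    // The pinned staging buffers double as the disk buffer's write buffers, so detach
+    // them from any previously assigned disk buffer before handing them to the new one.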
+ void* nullBuffers[2] = { nullptr, nullptr };
+ if( self->diskBuffer )
+ self->diskBuffer->AssignWriteBuffers( nullBuffers );
+
+ self->diskBuffer = diskBuffer;
+ if( self->diskBuffer )
+ self->diskBuffer->AssignWriteBuffers( self->pinnedBuffer );
+}
+
+DiskBufferBase* GpuDownloadBuffer::GetDiskBuffer() const
+{
+ return self->diskBuffer;
+}
diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu
new file mode 100644
index 00000000..399a0fbf
--- /dev/null
+++ b/cuda/GpuQueue.cu
@@ -0,0 +1,432 @@
+#include "GpuQueue.h"
+#include "util/IAllocator.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
+
+///
+/// Shared GpuStream Interface
+///
+GpuQueue::GpuQueue( Kind kind ) : _kind( kind )
+ , _bufferReadySignal( BBCU_BUCKET_COUNT )
+{
+ CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) );
+
+ _queueThread.Run( QueueThreadEntryPoint, this );
+}
+
+GpuQueue::~GpuQueue()
+{
+ _exitQueueThread.store( true, std::memory_order_release );
+ _bufferReadySignal.Release();
+ _waitForExitSignal.Wait();
+
+
+ if( _stream ) cudaStreamDestroy( _stream );
+ if( _preloadStream ) cudaStreamDestroy( _preloadStream );
+ if( _callbackStream ) cudaStreamDestroy( _callbackStream );
+
+ _stream = nullptr;
+ _preloadStream = nullptr;
+ _callbackStream = nullptr;
+}
+
+GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue." );
+ GpuDownloadBuffer r = { CreateGpuBuffer( desc, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuDownloadBuffer GpuQueue::CreateDirectDownloadBuffer( const size_t size, IAllocator& devAllocator, const size_t alignment, const bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
+
+ ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one.
+ GpuStreamDescriptor desc{};
+ desc.entrySize = 1;
+ desc.entriesPerSlice = 1;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = alignment;
+ desc.bufferCount = 2;
+ desc.deviceAllocator = &devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ return CreateDownloadBuffer( desc, dryRun );
+}
+
+GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
+ GpuDownloadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, const uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
+
+ ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one.
+ GpuStreamDescriptor desc{};
+ desc.entrySize = 1;
+ desc.entriesPerSlice = 1;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = alignment;
+ desc.bufferCount = bufferCount;
+ desc.deviceAllocator = &devAllocator;
+ desc.pinnedAllocator = &pinnedAllocator;
+
+ GpuDownloadBuffer r = { CreateGpuBuffer( desc, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuUploadBuffer GpuQueue::CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ Panic( "Deprecated" );
+    FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on a DownloadQueue" );
+
+ GpuUploadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuUploadBuffer GpuQueue::CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun )
+{
+    FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on a DownloadQueue." );
+
+ GpuUploadBuffer r = { CreateGpuBuffer( desc, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+
+
+struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ Panic( "Deprecated" );
+ // ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one.
+ GpuStreamDescriptor desc{};
+ desc.entrySize = 1;
+ desc.entriesPerSlice = size;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = alignment;
+ desc.bufferCount = 2;
+ desc.deviceAllocator = &devAllocator;
+ desc.pinnedAllocator = &pinnedAllocator;
+
+ return CreateGpuBuffer( desc, dryRun );
+}
+
+struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun )
+{
+ PanicIf( desc.bufferCount > BBCU_GPU_BUFFER_MAX_COUNT || !desc.bufferCount, "Invalid GPUBuffer buffer count." );
+ PanicIf( !desc.deviceAllocator, "Null device allocator." );
+ PanicIf( !desc.entrySize, "Invalid entry size." );
+ PanicIf( !desc.entriesPerSlice, "Invalid entries per slice." );
+ PanicIf( !desc.sliceCount || desc.sliceCount > BBCU_BUCKET_COUNT, "Invalid slice count." );
+ PanicIf( !desc.sliceAlignment, "Invalid slice alignment." );
+ PanicIf( desc.diskQueue && (!desc.diskFileName || !*desc.diskFileName), "Invalid disk offload config." );
+ PanicIf( desc.diskQueue && !desc.pinnedAllocator, "A pinned allocator must be set in disk offload mode." );
+
+ const size_t allocSize = CalculateBufferSizeFromDescriptor( desc );
+
+ void* devBuffers [BBCU_GPU_BUFFER_MAX_COUNT] = {};
+ void* pinnedBuffers[BBCU_GPU_BUFFER_MAX_COUNT] = {};
+
+ for( int32 i = 0; i < desc.bufferCount; i++ )
+ {
+ devBuffers[i] = desc.deviceAllocator->Alloc( allocSize, desc.sliceAlignment );
+
+ if( desc.pinnedAllocator )
+ pinnedBuffers[i] = desc.pinnedAllocator->Alloc( allocSize, desc.sliceAlignment );
+ }
+
+ struct IGpuBuffer* buf = nullptr;
+
+ if( !dryRun )
+ {
+ buf = new IGpuBuffer{};
+
+ for( int32 i = 0; i < desc.bufferCount; i++ )
+ {
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) );
+ // CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) );
+
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) );
+
+ buf->deviceBuffer[i] = devBuffers[i];
+ buf->pinnedBuffer[i] = pinnedBuffers[i];
+ }
+
+ buf->size = allocSize;
+ buf->bufferCount = desc.bufferCount;
+ buf->queue = this;
+ }
+
+ // Disk offload mode?
+ if( desc.diskQueue )
+ {
+ const size_t sliceSize = CalculateSliceSizeFromDescriptor( desc );
+
+ if( !dryRun )
+ {
+ if( desc.bucketedDiskBuffer )
+ {
+ buf->diskBuffer = DiskBucketBuffer::Create(
+ *desc.diskQueue, desc.diskFileName,
+ desc.sliceCount, sliceSize,
+ FileMode::Create, FileAccess::ReadWrite,
+ desc.directIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::None );
+ }
+ else
+ {
+ buf->diskBuffer = DiskBuffer::Create(
+ *desc.diskQueue, desc.diskFileName,
+ desc.sliceCount, allocSize,
+ FileMode::Create, FileAccess::ReadWrite,
+ desc.directIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::None );
+ }
+
+ PanicIf( !buf->diskBuffer, "Failed to create DiskBuffer for GpuBuffer." );
+
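+            // The pinned buffers act as the disk buffer's write buffers;
+            // no read buffers are assigned at creation time.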
+ void* readBuffers [2] = { nullptr, nullptr };
+ void* writeBuffers[2] = { pinnedBuffers[0], pinnedBuffers[1] };
+
+ buf->diskBuffer->AssignBuffers( readBuffers, writeBuffers );
+ }
+ else
+ {
+ size_t diskAllocSize = 0;
+ if( desc.bucketedDiskBuffer )
+ {
+ diskAllocSize = DiskBucketBuffer::GetReserveAllocSize( *desc.diskQueue, desc.sliceCount, sliceSize );
+ }
+ else
+ {
+ diskAllocSize = DiskBuffer::GetReserveAllocSize( *desc.diskQueue, allocSize );
+ }
+
+ ASSERT( diskAllocSize == allocSize * 4 );
+ }
+ }
+
+ return buf;
+}
+
+void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent )
+{
+ // #MAYBE: Perhaps support having multiple callback streams, and multiple copy streams.
+
+ // Signal from the work stream into the callback stream that we are ready for callback
+ CudaErrCheck( cudaEventRecord( lockEvent, stream ) );
+
+    // Wait on the callback stream until it's ready to dispatch
+ CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) );
+
+ // #MAYBE: Use a bump allocator perhaps later to avoid locking here by new/delete if needed for performance.
+ auto* fnCpy = new std::function( std::move( func ) );
+ CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){
+
+ auto& fn = *reinterpret_cast*>( userData );
+ fn();
+ delete &fn;
+
+ }, fnCpy ) );
+
+ // Signal from the callback stream that the callback finished
+ CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) );
+
+ // Wait on work stream for the callback to complete
+ CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) );
+}
+
+size_t GpuQueue::CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc )
+{
+ const size_t alignment = desc.diskQueue ? desc.diskQueue->BlockSize() : desc.sliceAlignment;
+ return RoundUpToNextBoundaryT( desc.entrySize * desc.entriesPerSlice, alignment );
+}
+
+size_t GpuQueue::CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc )
+{
+ return CalculateSliceSizeFromDescriptor( desc ) * desc.sliceCount;
+}
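+
+// Worked example (hypothetical values, for illustration only): with entrySize = 4,
+// entriesPerSlice = 10000 and sliceAlignment = 4096 (no disk queue), the slice size
+// is RoundUpToNextBoundaryT( 40000, 4096 ) = 40960 bytes; with sliceCount = 64 the
+// full buffer size is 40960 * 64 = 2621440 bytes.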
+
+void GpuQueue::CopyPendingDownloadStream( void* userData )
+{
+ auto* buf = reinterpret_cast<IGpuBuffer*>( userData );
+
+ GpuQueue* queue = buf->queue;
+
+ //const uint32 index = buf->completedSequence % buf->bufferCount;
+ buf->completedSequence++;
+
+ //queue->GetCommand( CommandType::Download2D ) = buf->commands[index];
+ queue->SubmitCommands();
+}
+
+void GpuQueue::SubmitCommands()
+{
+ const uint64 ticket = _commitTicketOut++;
+
+ // Wait for our ticket to come up
+ while( _commitTicketIn.load( std::memory_order_relaxed ) != ticket );
+
+ _queue.Commit();
+ _bufferReadySignal.Release();
+ //_bufferReadySignal.Signal();
+
+ // Use our ticket
+ _commitTicketIn.store( ticket+1, std::memory_order_release );
+}
+
+GpuQueue::Command& GpuQueue::GetCommand( CommandType type )
+{
+ const uint64 ticket = _cmdTicketOut++;
+
+ // Wait for our ticket to come up
+ while( _cmdTicketIn.load( std::memory_order_relaxed ) != ticket );
+
+ Command* cmd;
+ while( !_queue.Write( cmd ) )
+ {
+ Log::Line( "[GpuQueue] Queue is depleted. Waiting for copies to complete." );
+ auto waitTimer = TimerBegin();
+
+ // Block and wait until we have commands free in the buffer
+ _bufferCopiedSignal.Wait();
+
+ Log::Line( "[GpuQueue] Waited %.6lf seconds for availability.", TimerEnd( waitTimer ) );
+ }
+
+ // Use our ticket
+ _cmdTicketIn.store( ticket+1, std::memory_order_release );
+
+ ZeroMem( cmd );
+ cmd->type = type;
+
+ return *cmd;
+}
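+
+// Illustrative producer-side usage (hypothetical call site; in this codebase the
+// callers are friend types such as GpuDownloadBuffer): acquire a command slot,
+// fill it in, then commit it for the queue thread to execute.
+//
+//   auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Callback );
+//   cmd.callback.callback  = userCallback;   // hypothetical GpuDownloadCallback
+//   cmd.callback.dstbuffer = hostBuffer;
+//   cmd.callback.copySize  = copySize;
+//   cmd.callback.userData  = userData;
+//   self->queue->SubmitCommands();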
+
+///
+/// Command thread
+///
+void GpuQueue::QueueThreadEntryPoint( GpuQueue* self )
+{
+ ASSERT( self );
+ self->QueueThreadMain();
+ self->_waitForExitSignal.Signal();
+}
+
+void GpuQueue::QueueThreadMain()
+{
+ const int32 CMD_BUF_SIZE = 256;
+ Command buffers[CMD_BUF_SIZE];
+
+ for( ;; )
+ {
+ _bufferReadySignal.Wait();
+
+ if( ShouldExitQueueThread() )
+ return;
+
+ // 1 command per semaphore release
+ int32 bufCount;
+ while( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
+ // if( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
+ {
+ ASSERT( bufCount <= CMD_BUF_SIZE );
+ _bufferCopiedSignal.Signal();
+
+ for( int i = 0; i < bufCount; i++ )
+ ExecuteCommand( buffers[i] );
+ }
+ }
+}
+
+void GpuQueue::ExecuteCommand( const Command& cmd )
+{
+
+ // const uint32 index = cmd.sequenceId % BBCU_GPU_BUFFER_MAX_COUNT;
+
+ if( cmd.type == CommandType::Copy )
+ {
+ auto& cpy = *cmd.copy;
+
+ const bool isSequentialCopy = cpy.dstStride == cpy.srcStride;
+ const size_t totalSize = cpy.height * cpy.width;
+
+ byte* dst = (byte*)cpy.dstBuffer;
+ const byte* src = (byte*)cpy.srcBuffer;
+
+ if( isSequentialCopy )
+ memcpy( dst, src, totalSize );
+ else
+ {
+ for( size_t i = 0; i < cpy.height; i++ )
+ {
+ memcpy( dst, src, cpy.width );
+
+ dst += cpy.dstStride;
+ src += cpy.srcStride;
+ }
+ }
+
+ cpy.self->fence.Signal( cpy.sequence+1 );
+ cpy.self->copyFence.Signal( cpy.sequence+1 );
+
+ if( cpy.callback )
+ cpy.callback( cpy.dstBuffer, totalSize, cpy.userData );
+ }
+ else if( cmd.type == CommandType::CopyArray )
+ {
+
+ }
+ else if( cmd.type == CommandType::Callback )
+ {
+ cmd.callback.callback( cmd.callback.dstbuffer, cmd.callback.copySize, cmd.callback.userData );
+ }
+ // else if( cmd.type == CommandType::Sync )
+ // {
+ // _syncFence.Signal();
+ // return;
+ // }
+ else
+ {
+ ASSERT( 0 );
+ }
+
+ // Signal that the pinned buffer is free
+ //cpy.finishedSignal->Signal( cpy.sequenceId + 1 );
+}
+
+inline bool GpuQueue::ShouldExitQueueThread()
+{
+ return _exitQueueThread.load( std::memory_order_acquire );
+}
+
diff --git a/cuda/GpuQueue.h b/cuda/GpuQueue.h
new file mode 100644
index 00000000..8adf41e5
--- /dev/null
+++ b/cuda/GpuQueue.h
@@ -0,0 +1,188 @@
+#pragma once
+
+#include "GpuStreams.h"
+#include <functional>
+
+class DiskQueue;
+
+struct GpuStreamDescriptor
+{
+ size_t entrySize;
+ size_t entriesPerSlice;
+ uint32 sliceCount;
+ uint32 sliceAlignment;
+ uint32 bufferCount;
+ IAllocator* deviceAllocator;
+ IAllocator* pinnedAllocator;
+ DiskQueue* diskQueue; // DiskQueue to use when disk offload mode is enabled.
+ const char* diskFileName; // File name to use when disk offload mode is enabled. The diskQueue must be set.
+ bool bucketedDiskBuffer; // If true, a DiskBucketBuffer will be used instead of a DiskBuffer.
+ bool directIO; // If true, direct I/O will be used when using disk offload mode.
+};
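+
+// Illustrative setup (hypothetical values; a dry-run pass can be used first so the
+// allocators can account for the required sizes before the real allocation pass):
+//
+//   GpuStreamDescriptor desc{};
+//   desc.entrySize       = sizeof( uint32 );
+//   desc.entriesPerSlice = entriesPerSlice;            // hypothetical count
+//   desc.sliceCount      = BBCU_BUCKET_COUNT;
+//   desc.sliceAlignment  = sizeof( uint32 );           // hypothetical alignment
+//   desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+//   desc.deviceAllocator = &devAllocator;
+//   desc.pinnedAllocator = &pinnedAllocator;           // may be null for direct mode
+//
+//   GpuDownloadBuffer downloadBuf = queue->CreateDownloadBufferT<uint32>( desc );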
+
+typedef std::function<void()> GpuCallbackDispath;
+
+class GpuQueue
+{
+ friend struct IGpuBuffer;
+ friend struct GpuDownloadBuffer;
+ friend struct GpuUploadBuffer;
+
+ enum class CommandType
+ {
+ None = 0,
+ Copy,
+ CopyArray,
+ Callback,
+ };
+
+ struct Command
+ {
+ CommandType type;
+
+ union
+ {
+ struct CopyInfo* copy;
+
+ struct {
+ GpuDownloadCallback callback;
+ size_t copySize;
+ void* dstbuffer;
+ void* userData;
+ } callback;
+ };
+ };
+
+public:
+
+ enum Kind
+ {
+ Downloader,
+ Uploader
+ };
+
+ GpuQueue( Kind kind );
+ virtual ~GpuQueue();
+
+ static size_t CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc );
+ static size_t CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc );
+
+ //GpuDownloadBuffer CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
+ //GpuDownloadBuffer CreateDownloadBuffer( const size_t size, bool dryRun = false );
+ GpuDownloadBuffer CreateDirectDownloadBuffer( size_t size, IAllocator& devAllocator, size_t alignment, bool dryRun = false );
+ GpuDownloadBuffer CreateDownloadBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
+ GpuDownloadBuffer CreateDownloadBuffer( size_t size, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
+
+ GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false );
+
+ /// Create with descriptor and override entry size
+ inline GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, size_t entrySize, bool dryRun = false )
+ {
+ GpuStreamDescriptor copy = desc;
+ copy.entrySize = entrySize;
+
+ return CreateDownloadBuffer( copy, dryRun );
+ }
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDownloadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false )
+ {
+ return CreateDownloadBuffer( desc, sizeof( T ), dryRun );
+ }
+
+ /// Create upload buffer with descriptor
+ GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false );
+
+ // inline GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool size_t entrySize, bool dryRun = false )
+ // {
+ // GpuStreamDescriptor copy = desc;
+ // copy.entrySize = entrySize;
+
+ // return CreateUploadBuffer( copy, dryRun );
+ // }
+
+ template<typename T>
+ inline GpuUploadBuffer CreateUploadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false )
+ {
+ GpuStreamDescriptor copy = desc;
+ copy.entrySize = sizeof(T);
+
+ return CreateUploadBuffer( copy, dryRun );
+ // return CreateUploadBuffer( desc, sizeof( T ), dryRun );
+ }
+
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDirectDownloadBuffer( const size_t count, IAllocator& devAllocator, size_t alignment = alignof( T ), bool dryRun = false )
+ {
+ return CreateDirectDownloadBuffer( count * sizeof( T ), devAllocator, alignment, dryRun );
+ }
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
+ {
+ return CreateDownloadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
+ }
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
+ {
+ return CreateDownloadBuffer( count * sizeof( T ), bufferCount, devAllocator, pinnedAllocator, alignment, dryRun );
+ }
+
+ //GpuUploadBuffer CreateUploadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
+ //GpuUploadBuffer CreateUploadBuffer( const size_t size, bool dryRun = false );
+ GpuUploadBuffer CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
+
+ template<typename T>
+ inline GpuUploadBuffer CreateUploadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false )
+ {
+ return CreateUploadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
+ }
+
+ inline cudaStream_t GetStream() const { return _stream; }
+
+protected:
+
+ struct IGpuBuffer* CreateGpuBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun );
+ struct IGpuBuffer* CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun );
+
+ void DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent );
+
+ static void CopyPendingDownloadStream( void* userData );
+
+ [[nodiscard]]
+ Command& GetCommand( CommandType type );
+ void SubmitCommands();
+
+ // Queue thread
+ static void QueueThreadEntryPoint( GpuQueue* self );
+ void QueueThreadMain();
+
+ void ExecuteCommand( const Command& cpy );
+
+ bool ShouldExitQueueThread();
+
+protected:
+ cudaStream_t _stream = nullptr;
+ cudaStream_t _preloadStream = nullptr;
+ cudaStream_t _callbackStream = nullptr;
+
+
+ Thread _queueThread;
+ //Fence _bufferReadySignal;
+ Semaphore _bufferReadySignal;
+ Fence _bufferCopiedSignal;
+ Fence _syncFence;
+ SPCQueue _queue;
+ Kind _kind;
+
+ AutoResetSignal _waitForExitSignal;
+ std::atomic<bool> _exitQueueThread = false;
+
+ // Support multiple threads to grab commands
+ std::atomic<uint64> _cmdTicketOut = 0;
+ std::atomic<uint64> _cmdTicketIn = 0;
+ std::atomic<uint64> _commitTicketOut = 0;
+ std::atomic<uint64> _commitTicketIn = 0;
+};
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index e5dcfd66..63700c9c 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -1,137 +1,105 @@
#include "GpuStreams.h"
-#include "util/StackAllocator.h"
+#include "GpuQueue.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
-struct PackedCopy
-{
- struct IGpuBuffer* self;
- const byte* src;
- uint32 sequence;
- uint32 length;
- uint32 stride;
- uint32 elementSize;
- uint32 counts[BBCU_BUCKET_COUNT];
-};
-
-struct CopyInfo
-{
- struct IGpuBuffer* self;
- uint32 sequence;
-
- const void* srcBuffer;
- void* dstBuffer;
- size_t width;
- size_t height;
- size_t dstStride;
- size_t srcStride;
-
- // Callback data
- GpuDownloadCallback callback;
- void* userData;
-};
-
-struct IGpuBuffer
-{
- size_t size;
- uint32 bufferCount; // Number of pinned/device buffers this instance contains
- void* deviceBuffer [BBCU_GPU_BUFFER_MAX_COUNT];
- void* pinnedBuffer [BBCU_GPU_BUFFER_MAX_COUNT]; // Pinned host buffer
- cudaEvent_t events [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use
- cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the buffer is ready for consumption by the device or buffer
- cudaEvent_t readyEvents [BBCU_GPU_BUFFER_MAX_COUNT]; // User must signal this event when the device buffer is ready for download
- // GpuQueue::Command commands [BBCU_GPU_BUFFER_MAX_COUNT]; // Pending copy command for downloads
- Fence fence; // Signals the pinned buffer is ready for use
- Fence copyFence;
-
- cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT];
-
- CopyInfo copies[BBCU_BUCKET_COUNT];
- PackedCopy packedCopeis[BBCU_BUCKET_COUNT]; // For uplad buffers
- // #TODO: Remove atomic again
- uint32 lockSequence; // Index of next buffer to lock
- uint32 outgoingSequence; // Index of locked buffer that will be downoaded/uploaded
- std::atomic<uint32> completedSequence; // Index of buffer that finished downloading/uploading
- std::atomic<uint32> copySequence;
-
- GpuQueue* queue;
-};
///
-/// DownloadBuffer
+/// UploadBuffer
///
-void* GpuDownloadBuffer::GetDeviceBuffer()
+void* GpuUploadBuffer::GetNextPinnedBuffer()
{
+ // Wait for the pinned host buffer to be available
+ //if( self->outgoingSequence > self->bufferCount-1 )
+ // self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
+ //
const uint32 index = self->outgoingSequence % self->bufferCount;
- CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
+ void* pinnedBuffer = self->pinnedBuffer[index];
- return self->deviceBuffer[index];
+ return pinnedBuffer;
}
-void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream )
+void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t workStream, bool directOverride )
{
- ASSERT( self->lockSequence >= self->outgoingSequence );
- ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount );
+ ASSERT( size );
- const uint32 index = self->lockSequence % self->bufferCount;
- self->lockSequence++;
+ const bool isDirect = (!self->pinnedBuffer[0] || directOverride) && !self->diskBuffer;
+ PanicIf( isDirect && !hostBuffer, "No host buffer provided for direct upload." );
- // Wait for the device buffer to be free to be used by kernels
- CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
- return self->deviceBuffer[index];
-}
+ const uint32 index = SynchronizeOutgoingSequence();
-void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size )
-{
- Download2D( hostBuffer, size, 1, size, size );
-}
+ auto uploadStream = self->queue->GetStream();
-void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride )
-{
- Download2D( hostBuffer, size, 1, size, size, workStream, directOverride );
-}
-
-void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
-{
- ASSERT( 0 );
- // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
- // ASSERT( hostBuffer );
- // ASSERT( workStream );
- // ASSERT( self->lockSequence > 0 );
- // ASSERT( self->outgoingSequence < self->lockSequence );
- // ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
-
- // auto& cpy = self->copies[self->outgoingSequence];
- // cpy.self = self;
- // cpy.sequence = self->outgoingSequence;
- // cpy.copy.hostBuffer = finalBuffer;
- // cpy.copy.srcBuffer = hostBuffer;
- // cpy.copy.size = size;
+ DiskBuffer* diskBuffer = nullptr;
+ if( self->diskBuffer )
+ {
+ // Preload data from disk into pinned buffer
+
+ diskBuffer = dynamic_cast<DiskBuffer*>( self->diskBuffer );
+ PanicIf( !diskBuffer, "Not a DiskBuffer." );
+ ASSERT( diskBuffer->GetAlignedBufferSize() >= size );
+
+ hostBuffer = self->pinnedBuffer[index];
+ ASSERT( hostBuffer == diskBuffer->PeekReadBufferForBucket( self->outgoingSequence - 1 ) );
+ ASSERT( self->outgoingSequence <= BBCU_BUCKET_COUNT );
+
+ CallHostFunctionOnStream( uploadStream, [=](){
+ // Read on disk queue's thread
+ diskBuffer->ReadNextBucket();
+
+ // Block until the buffer is fully read from disk
+ // #TODO: Also should not do this here, but in a host-to-host background stream,
+ // so that the next I/O read can happen in the background while
+ // the previous upload to disk is happening, if needed.
+ (void)diskBuffer->GetNextReadBuffer();
+ });
+ }
+ else if( !isDirect )
+ {
+ // Copy from unpinned to pinned first
+ // #TODO: This should be done in a different background host-to-host copy stream
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
+ CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
+ hostBuffer = self->pinnedBuffer[index];
+ }
- // const uint32 index = self->outgoingSequence % self->bufferCount;
- // self->outgoingSequence++;
+ // Ensure the device buffer is ready for use
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
- // void* pinnedBuffer = self->pinnedBuffer[index];
- // const void* devBuffer = self->deviceBuffer[index];
+ // Upload to the device buffer
+ CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );
- // // Signal from the work stream when it has finished doing kernel work with the device buffer
- // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
+ if( !isDirect )
+ {
+ // Signal that the pinned buffer is ready for re-use
+ CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+ }
+ // Signal work stream that the device buffer is ready to be used
+ CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
+}
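+
+// Summary of the three paths above (descriptive note): disk offload preloads the
+// bucket from disk into the pinned buffer on the upload stream; non-direct mode
+// first stages the caller's buffer into pinned memory with a host-to-host copy;
+// direct mode uploads the caller's buffer to the device as-is.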
- // // Ensure the work stream has completed writing data to the device buffer
- // cudaStream_t stream = self->queue->_stream;
+void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize )
+{
+ ASSERT(0);
+ // ASSERT( size >= copySize );
- // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
+ // Upload( hostBuffer, size, nullptr );
- // // Copy
- // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) );
-
- // // Signal that the device buffer is free to be re-used
- // CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+ // // Add callback for copy
+ // const uint32 sequence = self->outgoingSequence - 1;
+ // auto& cpy = self->copies[sequence];
+ // cpy.self = self;
+ // cpy.sequence = sequence;
+ // cpy.copy.hostBuffer = hostBuffer;
+ // cpy.copy.srcBuffer = copyBufferSrc;
+ // cpy.copy.size = copySize;
// // Launch copy command
- // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+ // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){
// const CopyInfo& c = *reinterpret_cast( userData );
// IGpuBuffer* self = c.self;
@@ -140,438 +108,113 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co
// cmd.copy.info = &c;
// self->queue->SubmitCommands();
-
- // // Signal the download completed
- // self->fence.Signal( ++self->completedSequence );
// }, &cpy ) );
}
-void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
-{
- Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride );
-}
-
-void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, cudaStream_t workStream, bool directOverride )
-{
- Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride );
-}
-
-void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
- GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
+void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStride,
+ uint32 countStride, const uint32* counts, cudaStream_t workStream )
{
- ASSERT( hostBuffer );
- ASSERT( workStream );
- ASSERT( self->lockSequence > 0 );
- ASSERT( self->outgoingSequence < self->lockSequence );
- ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
+ const uint32 index = SynchronizeOutgoingSequence();
+ const bool isDirect = self->pinnedBuffer[0] == nullptr && !self->diskBuffer;
- void* pinnedBuffer = self->pinnedBuffer[index];
- const void* devBuffer = self->deviceBuffer[index];
+ auto uploadStream = self->queue->GetStream();
- const bool isDirect = directOverride || self->pinnedBuffer[0] == nullptr; ASSERT( isDirect || self->pinnedBuffer[0] );
+ DiskBucketBuffer* diskBuffer = nullptr;
+ size_t totalBufferSize = 0;
- // Signal from the work stream when it has finished doing kernel work with the device buffer
- CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
-
- // Ensure the work stream has completed writing data to the device buffer
- cudaStream_t stream = self->queue->_stream;
-
- CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
-
- // Ensure the pinned buffer is ready for use
- if( !isDirect )
+ if( self->diskBuffer )
{
- // CudaErrCheck( cudaStreamWaitEvent( stream, self->completedEvents[index] ) );
- CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
-
- IGpuBuffer* self = reinterpret_cast<IGpuBuffer*>( userData );
- if( self->copySequence++ > 1 )
- {
- self->copyFence.Wait( self->copySequence-1 );
- }
- }, self ) );
- }
+ diskBuffer = dynamic_cast<DiskBucketBuffer*>( self->diskBuffer );
+ PanicIf( !diskBuffer, "Not a DiskBucketBuffer" );
- // Copy from device to pinned host buffer
- const bool isSequentialCopy = dstStride == srcStride;
- const size_t totalSize = height * width;
-
- if( isDirect )
- {
- if( isSequentialCopy )
- CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, stream ) );
- else
- CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, stream ) );
+ hostBuffer = diskBuffer->PeekReadBufferForBucket( self->outgoingSequence-1 );
+ ASSERT( self->outgoingSequence <= BBCU_BUCKET_COUNT );
- // Signal direct download completed
- auto& cpy = self->copies[self->outgoingSequence];
- cpy.self = self;
- cpy.sequence = self->outgoingSequence;
- cpy.dstBuffer = hostBuffer;
- cpy.callback = callback;
- cpy.userData = userData;
- cpy.height = height;
- cpy.width = width;
+ // if( nextReadBucket < BBCU_BUCKET_COUNT )
+ {
+ // Override the input slice sizes with the correct ones (as we wrote them with fixed size)
+
+ // Preload the bucket buffer from disk
+ CallHostFunctionOnStream( uploadStream, [=](){
- CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+ const uint32 nextReadBucket = diskBuffer->GetNextReadBucketId();
+ diskBuffer->OverrideReadSlices( nextReadBucket, elementSize, counts, countStride );
- CopyInfo& cpy = *reinterpret_cast<CopyInfo*>( userData );
- IGpuBuffer* self = cpy.self; //reinterpret_cast<IGpuBuffer*>( userData );
+ // Preloads in the background
+ diskBuffer->ReadNextBucket();
- self->fence.Signal( ++self->completedSequence );
+ // Upload the next one too, if needed
+ // #NOTE: This is a hacky way to do it for now.
+ // We ought to have a synchronized, separate, disk stream later
+ // if( nextReadBucket < BBCU_BUCKET_COUNT )
+ // diskBuffer->ReadNextBucket();
+ });
+ }
- // Dispatch callback, if one was set
- if( cpy.callback )
- cpy.callback( cpy.dstBuffer, cpy.height * cpy.width, cpy.userData );
+ // Wait for disk buffer to be ready
+ CallHostFunctionOnStream( uploadStream, [diskBuffer](){
- }, &cpy ) );
+ // Wait until next buffer is ready
+ (void)diskBuffer->GetNextReadBuffer();
+ });
}
else
{
- CudaErrCheck( cudaMemcpyAsync( pinnedBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, stream ) );
- }
-
- // Signal that the device buffer is free to be re-used
- CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+ // Perform fragmented uploads
+ const auto waitEvent = isDirect ? self->deviceEvents[index] : self->pinnedEvent[index];
+ const auto copyMode = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
- // If not a direct copy, we need to do another copy from the pinned buffer to the unpinned host buffer
- if( !isDirect )
- {
- // Signal the copy stream that the pinned buffer is ready to be copied to the unpinned host buffer
- CudaErrCheck( cudaEventRecord( self->preloadEvents[index], stream ) );
+ // Wait on device or pinned buffer to be ready (depending if a direct copy or not)
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );
- // Ensure the pinned buffer is ready for use
- cudaStream_t copyStream = self->queue->_preloadStream;
-
- CudaErrCheck( cudaStreamWaitEvent( copyStream, self->preloadEvents[index] ) );
+ const byte* src = (byte*)hostBuffer;
+ byte* dst = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] );
+ const uint32* sizes = counts;
+ for( uint32 i = 0; i < length; i++ )
{
- auto& cpy = self->copies[self->outgoingSequence];
- cpy.self = self;
- cpy.sequence = self->outgoingSequence;
-
- cpy.dstBuffer = hostBuffer;
- cpy.srcBuffer = pinnedBuffer;
- cpy.width = width;
- cpy.height = height;
- cpy.srcStride = srcStride;
- cpy.dstStride = dstStride;
- cpy.callback = callback;
- cpy.userData = userData;
-
- CudaErrCheck( cudaLaunchHostFunc( copyStream, []( void* userData ){
+ const size_t size = *sizes * (size_t)elementSize;
- CopyInfo& cpy = *reinterpret_cast<CopyInfo*>( userData );
- IGpuBuffer* self = cpy.self; //reinterpret_cast<IGpuBuffer*>( userData );
+ CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );
- auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
- cmd.copy = &cpy;
- self->queue->SubmitCommands();
-
- }, &cpy ) );
+ dst += size;
+ src += srcStride;
+ sizes += countStride;
}
- // Signal the pinned buffer is free to be re-used
- // CudaErrCheck( cudaEventRecord( self->completedEvents[index], copyStream ) );
- }
-
-
- // Signal the download completed
- // {
- // auto& cpy = self->copies[self->outgoingSequence];
- // cpy.self = self;
- // cpy.sequence = self->outgoingSequence;
-
- // cpy.copy2d.dstBuffer = hostBuffer;
- // cpy.copy2d.srcBuffer = pinnedBuffer;
- // cpy.copy2d.width = width;
- // cpy.copy2d.height = height;
- // cpy.copy2d.srcStride = srcStride;
- // cpy.copy2d.dstStride = dstStride;
-
- // CudaErrCheck( cudaLaunchHostFunc( copyStream, []( void* userData ){
-
- // CopyInfo& cpy = *reinterpret_cast( userData );
- // IGpuBuffer* self = cpy.self; //reinterpret_cast( userData );
-
- // const uint32 idx = cpy.sequence & self->bufferCount;
-
- // const byte* src = (byte*)cpy.copy2d.srcBuffer;
- // byte* dst = (byte*)cpy.copy2d.dstBuffer;
-
- // const size_t width = cpy.copy2d.width;
- // const size_t height = cpy.copy2d.height;
- // const size_t dstStride = cpy.copy2d.dstStride;
- // const size_t srcStride = cpy.copy2d.srcStride;
-
- // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Download2D );
- // cmd.sequenceId = cpy.sequence;
- // cmd.srcBuffer = src;
- // cmd.dstBuffer = dst;
- // cmd.download2d.buf = self;
- // cmd.download2d.width = width;
- // cmd.download2d.height = height;
- // cmd.download2d.srcStride = srcStride;
- // cmd.download2d.dstStride = dstStride;
- // self->queue->SubmitCommands();
-
- // // for( size_t i = 0; i < height; i++ )
- // // {
- // // memcpy( dst, src, width );
-
- // // dst += dstStride;
- // // src += srcStride;
- // // }
-
- // // self->fence.Signal( ++self->completedSequence );
- // }, &cpy ) );
- // }
- // CudaErrCheck( cudaEventRecord( self->completedEvents[index], copyStream ) );
-
- // if( callback )
- // {
- // ASSERT( width <= srcStride );
- // ASSERT( width <= dstStride );
-
- // auto& cpy = self->copies[self->outgoingSequence];
- // cpy.self = self;
- // cpy.sequence = self->outgoingSequence;
- // cpy.callback.hostBuffer = hostBuffer;
- // cpy.callback.size = width * height;
- // cpy.callback.callback = callback;
- // cpy.callback.userData = userData;
-
- // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
-
- // auto& cpy = *reinterpret_cast( userData );
- // auto* self = cpy.self;
-
- // // Fire callback command
- // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Callback );
- // cmd.dstBuffer = cpy.callback.hostBuffer;
- // cmd.callback.copySize = cpy.callback.size;
- // cmd.callback.callback = cpy.callback.callback;
- // cmd.callback.userData = cpy.callback.userData;
- // self->queue->SubmitCommands();
-
- // // Signal the download completed
- // self->fence.Signal( ++self->completedSequence );
- // }, &cpy ) );
- // }
- // else
- // {
- // // Signal the download completed
- // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
-
- // IGpuBuffer* self = reinterpret_cast( userData );
- // self->fence.Signal( ++self->completedSequence );
- // }, self ) );
- // }
-
- self->outgoingSequence++;
-}
-
-void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
- uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData )
-{
- ASSERT( width );
- ASSERT( height );
- ASSERT( hostBuffer );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
-
- // We need to block until the pinned buffer is available.
- if( self->outgoingSequence > self->bufferCount-1 )
- self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
-
- void* pinnedBuffer = self->pinnedBuffer[index];
- const void* devBuffer = self->deviceBuffer[index];
-
- //auto& cmd = self->commands[index];
- //cmd.type = GpuQueue::CommandType::Copy2D;
- //cmd.sequenceId = self->outgoingSequence++;
- //cmd.finishedSignal = &self->fence;
- //cmd.dstBuffer = hostBuffer;
- //cmd.srcBuffer = pinnedBuffer;
- //cmd.copy2d.width = width;
- //cmd.copy2d.height = height;
- //cmd.copy2d.dstStride = dstStride;
- //cmd.copy2d.srcStride = srcStride;
- //cmd.copy2d.callback = callback;
- //cmd.copy2d.userData = userData;
-
- outIndex = index;
- outPinnedBuffer = pinnedBuffer;
- outDevBuffer = devBuffer;
-}
-
-
-void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize )
-{
- ASSERT( length );
- ASSERT( elementSize );
- ASSERT( counts );
-
- uint32 totalElements = 0;
- for( uint32 i = 0; i < length; i++ )
- totalElements += counts[i];
-
- const size_t totalSize = (size_t)totalElements * elementSize;
-
- uint32 index;
- void* pinnedBuffer;
- const void* devBuffer;
- GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer );
-
-
- srcStride *= elementSize;
-
- byte* dst = (byte*)pinnedBuffer;
- const byte* src = (byte*)devBuffer;
-
- cudaStream_t stream = self->queue->_stream;
-
- // Copy all buffers from device to pinned buffer
- for( uint32 i = 0; i < length; i++ )
- {
- const size_t copySize = counts[i] * (size_t)elementSize;
-
- // #TODO: Determine if there's a cuda (jagged) array copy
- CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) );
-
- src += srcStride;
- dst += copySize;
+ if( !isDirect )
+ {
+ // Set the pinned buffer as the host buffer so that we can do a sequential copy to the device now
+ hostBuffer = self->pinnedBuffer[index];
+ }
}
- // Signal that the device buffer is free
- CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
-
- // Submit command to do the final copy from pinned to host
- CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) );
-}
-
-void GpuDownloadBuffer::WaitForCompletion()
-{
- if( self->outgoingSequence > 0 )
+ // Upload to device buffer if in non-direct mode
+ if( !isDirect )
{
- //const uint32 index = (self->outgoingSequence - 1) % self->bufferCount;
-
- // cudaEvent_t event = self->completedEvents[index];
- //const cudaError_t r = cudaEventQuery( event );
-
- //if( r == cudaSuccess )
- // return;
-
- //if( r != cudaErrorNotReady )
- // CudaErrCheck( r );
+ for( uint32 i = 0; i < length; i++ )
+ {
+ ASSERT( *counts );
+ totalBufferSize += *counts * (size_t)elementSize;
+ counts += countStride;
+ }
- //CudaErrCheck( cudaEventSynchronize( event ) );
-
- self->fence.Wait( self->outgoingSequence );
- }
-}
+ // #TODO: This should be done in a copy stream to perform the copies in the background
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+ CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-void GpuDownloadBuffer::WaitForCopyCompletion()
-{
- if( self->outgoingSequence > 0 )
- {
- self->copyFence.Wait( self->outgoingSequence );
+ if( !self->diskBuffer )
+ CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
}
-}
-
-void GpuDownloadBuffer::Reset()
-{
- self->lockSequence = 0;
- self->outgoingSequence = 0;
- self->completedSequence = 0;
- self->copySequence = 0;
- self->fence.Reset( 0 );
- self->copyFence.Reset( 0 );
-}
-
-GpuQueue* GpuDownloadBuffer::GetQueue() const
-{
- return self->queue;
-}
-
-
-///
-/// UploadBuffer
-///
-void* GpuUploadBuffer::GetNextPinnedBuffer()
-{
- // Wait for the pinned host buffer to be available
- //if( self->outgoingSequence > self->bufferCount-1 )
- // self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
- //
- const uint32 index = self->outgoingSequence % self->bufferCount;
-
- void* pinnedBuffer = self->pinnedBuffer[index];
-
- return pinnedBuffer;
-}
-
-void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t workStream )
-{
- ASSERT( hostBuffer );
- ASSERT( size );
- ASSERT( self->outgoingSequence - self->lockSequence < 2 );
- // ASSERT( workStream );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
- self->outgoingSequence++;
-
- auto stream = self->queue->GetStream();
-
- // Ensure the device buffer is ready for use
- CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
-
- // Upload to device buffer
- CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, stream ) );
// Signal work stream that the device buffer is ready to be used
- CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
+ CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
}
-void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize )
-{
- ASSERT(0);
- // ASSERT( size >= copySize );
-
- // Upload( hostBuffer, size, nullptr );
-
- // // Add callback for copy
- // const uint32 sequence = self->outgoingSequence - 1;
- // auto& cpy = self->copies[sequence];
- // cpy.self = self;
- // cpy.sequence = sequence;
- // cpy.copy.hostBuffer = hostBuffer;
- // cpy.copy.srcBuffer = copyBufferSrc;
- // cpy.copy.size = copySize;
-
- // // Launch copy command
- // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){
-
- // const CopyInfo& c = *reinterpret_cast( userData );
- // IGpuBuffer* self = c.self;
-
- // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
- // cmd.copy.info = &c;
-
- // self->queue->SubmitCommands();
- // }, &cpy ) );
-}
-
-void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStride,
- uint32 countStride, const uint32* counts, cudaStream_t workStream )
+void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length,
+ uint32 elementSize, uint32 srcStride, uint32 countStride, const uint32* counts )
{
ASSERT( hostBuffer );
- ASSERT( self->outgoingSequence - self->lockSequence < 2 );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
- self->outgoingSequence++;
auto stream = self->queue->GetStream();
@@ -632,28 +275,6 @@ void* GpuUploadBuffer::GetUploadedDeviceBuffer( cudaStream_t workStream )
return self->deviceBuffer[index];
}
-void* GpuUploadBuffer::GetUploadedDeviceBuffer()
-{ASSERT(0); // Not allowed for now
- if( self->outgoingSequence < 1 )
- {
- ASSERT( 0 );
- return nullptr;
- }
- ASSERT( 0 );
- const uint32 index = self->completedSequence % self->bufferCount;
-
- // #TODO: Make this spin way.
- // #TODO: Find a better way to do this instead of having to wait on both primitives.
- // Can't check the cuda event until we're sure it's been
- // added to the stream
- self->fence.Wait( self->completedSequence + 1 );
- CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
-
- self->completedSequence++;
-
- return self->deviceBuffer[index];
-}
-
void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream )
{
ASSERT( self->outgoingSequence > self->lockSequence );
@@ -663,7 +284,7 @@ void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream )
const uint32 index = self->lockSequence % self->bufferCount;
self->lockSequence++;
- CudaErrCheck( cudaEventRecord( self->events[index], workStream ) );
+ CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) );
}
void GpuUploadBuffer::WaitForPreloadsToComplete()
@@ -674,6 +295,17 @@ void GpuUploadBuffer::WaitForPreloadsToComplete()
}
}
+uint32 GpuUploadBuffer::SynchronizeOutgoingSequence()
+{
+ PanicIf( self->outgoingSequence < self->lockSequence || self->outgoingSequence - self->lockSequence >= 2,
+ "Invalid outgoing synchronization sequence state." );
+
+ const uint32 index = self->outgoingSequence % self->bufferCount;
+ self->outgoingSequence++;
+
+ return index;
+}
+
void GpuUploadBuffer::Reset()
{
self->lockSequence = 0;
@@ -689,362 +321,32 @@ GpuQueue* GpuUploadBuffer::GetQueue() const
return self->queue;
}
-
-///
-/// Shared GpuStream Inteface
-///
-GpuQueue::GpuQueue( Kind kind ) : _kind( kind )
- , _bufferReadySignal( BBCU_BUCKET_COUNT )
-{
- CudaErrCheck( cudaStreamCreateWithFlags( &_stream, cudaStreamNonBlocking ) );
- CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream, cudaStreamNonBlocking ) );
-
- _copyThread.Run( CopyThreadEntryPoint, this );
-}
-
-GpuQueue::~GpuQueue()
-{
- _exitCopyThread.store( true, std::memory_order_release );
- _bufferReadySignal.Release();
- _waitForExitSignal.Wait();
-}
-
-//void GpuQueue::Synchronize()
-//{
-// (void)GetCommand( CommandType::Sync );
-// SubmitCommands();
-//
-// _syncFence.Wait();
-//}
-
-
-//GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size, bool dryRun )
-//{
-// FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
-// if( dryRun ) return { nullptr };
-//
-// // #TODO: Set size?
-// return { CreateGpuBuffer( dev0, dev1, pinned0, pinned1, size ) };
-//}
-
-//GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, bool dryRun )
-//{
-// FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
-// if( dryRun ) return { nullptr };
-// return { CreateGpuBuffer( size ) };
-//}
-
-GpuDownloadBuffer GpuQueue::CreateDirectDownloadBuffer( const size_t size, IAllocator& devAllocator, const size_t alignment, const bool dryRun )
-{
- FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
- GpuDownloadBuffer r = { CreateGpuBuffer( size, BBCU_DEFAULT_GPU_BUFFER_COUNT, &devAllocator, nullptr, alignment, dryRun ) };
-
- if( !dryRun )
- r.Reset();
-
- return r;
-}
-
-GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+void GpuUploadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer )
{
- FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
- GpuDownloadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
+ ASSERT( self->pinnedBuffer[0] );
- if( !dryRun )
- r.Reset();
+ void* nullBuffers[2] = { nullptr, nullptr };
+ if( self->diskBuffer )
+ self->diskBuffer->AssignReadBuffers( nullBuffers );
- return r;
+ self->diskBuffer = diskBuffer;
+ if( self->diskBuffer )
+ self->diskBuffer->AssignReadBuffers( self->pinnedBuffer );
}
-GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, const uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+DiskBufferBase* GpuUploadBuffer::GetDiskBuffer() const
{
- FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
- GpuDownloadBuffer r = { CreateGpuBuffer( size, bufferCount, &devAllocator, &pinnedAllocator, alignment, dryRun ) };
-
- if( !dryRun )
- r.Reset();
-
- return r;
+ return self->diskBuffer;
}
-GpuUploadBuffer GpuQueue::CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+void GpuUploadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function<void()> func )
{
- FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on an DownloadQueue" );
- GpuUploadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
-
- if( !dryRun )
- r.Reset();
+ auto* fnCpy = new std::function<void()>( std::move( func ) );
+ CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {
- return r;
-}
+ auto& fn = *reinterpret_cast<std::function<void()>*>( userData );
+ fn();
+ delete &fn;
-
-struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
-{
- return CreateGpuBuffer( size, BBCU_DEFAULT_GPU_BUFFER_COUNT, &devAllocator, &pinnedAllocator, alignment, dryRun );
-}
-
-struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, const uint32 bufferCount, IAllocator* devAllocator, IAllocator* pinnedAllocator, size_t alignment, bool dryRun )
-{
- FatalIf( bufferCount > BBCU_GPU_BUFFER_MAX_COUNT, "GPU Buffer count overflow." );
-
- const size_t allocSize = RoundUpToNextBoundaryT( size, alignment );
-
- void* devBuffers [BBCU_GPU_BUFFER_MAX_COUNT] = {};
- void* pinnedBuffers[BBCU_GPU_BUFFER_MAX_COUNT] = {};
-
- for( int32 i = 0; i < bufferCount; i++ )
- {
- devBuffers[i] = devAllocator->Alloc( allocSize, alignment );
-
- if( pinnedAllocator )
- pinnedBuffers[i] = pinnedAllocator->Alloc( allocSize, alignment );
- }
-
- if( dryRun ) return nullptr;
-
- struct IGpuBuffer* buf = new IGpuBuffer{};
-
- for( int32 i = 0; i < bufferCount; i++ )
- {
- CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) );
- CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
- CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) );
- CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) );
-
- buf->deviceBuffer[i] = devBuffers[i];
- buf->pinnedBuffer[i] = pinnedBuffers[i];
- // buf->commands[i] = {};
-
- // Events have to be disabled initially for uploads
- //if( _kind == Uploader )
- //{
- // CudaErrCheck( cudaEventSynchronize( buf->events[i] ) );
- // CudaErrCheck( cudaEventSynchronize( buf->completedEvents[i] ) );
- // CudaErrCheck( cudaEventSynchronize( buf->readyEvents[i] ) );
- //}
- }
-
- buf->size = size;
- buf->bufferCount = bufferCount;
- buf->queue = this;
-
- return buf;
-}
-
-//struct IGpuBuffer* GpuQueue::CreateGpuBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, const size_t size )
-//{
-// ASSERT( dev0 );
-// ASSERT( dev1 );
-// ASSERT( pinned0 );
-// ASSERT( pinned1 );
-//
-// ASSERT( dev0 != dev1 );
-// ASSERT( pinned0 != pinned1 );
-//
-//#if _DEBUG
-// if( size )
-// {
-// ASSERT_DOES_NOT_OVERLAP( dev0 , dev1 , size );
-// ASSERT_DOES_NOT_OVERLAP( dev0 , pinned0, size );
-// ASSERT_DOES_NOT_OVERLAP( dev0 , pinned1, size );
-// ASSERT_DOES_NOT_OVERLAP( dev1 , pinned0, size );
-// ASSERT_DOES_NOT_OVERLAP( dev1 , pinned1, size );
-// ASSERT_DOES_NOT_OVERLAP( pinned0, pinned1, size );
-// }
-//#endif
-//
-// struct IGpuBuffer* buf = new IGpuBuffer();
-//
-// CudaErrCheck( cudaEventCreateWithFlags( &buf->events[0], cudaEventDisableTiming ) );
-// CudaErrCheck( cudaEventCreateWithFlags( &buf->events[1], cudaEventDisableTiming ) );
-//
-// buf->deviceBuffer[0] = dev0;
-// buf->deviceBuffer[1] = dev1;
-//
-// buf->pinnedBuffer[0] = pinned0;
-// buf->pinnedBuffer[1] = pinned1;
-//
-// buf->size = size;
-// buf->fence.Reset( 0 );
-//
-// buf->commands[0] = {};
-// buf->commands[1] = {};
-//
-// buf->outgoingSequence = 0;
-// buf->completedSequence = 0;
-//
-// buf->queue = this;
-//
-// return buf;
-//}
-
-//struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size )
-//{
-// ASSERT( size );
-//
-// void* dev0;
-// void* dev1;
-// void* pinned0;
-// void* pinned1;
-//
-// CudaErrCheck( cudaMalloc( &dev0, size ) );
-// CudaErrCheck( cudaMalloc( &dev1, size ) );
-// CudaErrCheck( cudaMallocHost( &pinned0, size ) );
-// CudaErrCheck( cudaMallocHost( &pinned1, size ) );
-//
-// return CreateGpuBuffer( dev0, dev1, pinned0, pinned1, size );
-//}
-
-void GpuQueue::CopyPendingDownloadStream( void* userData )
-{
- auto* buf = reinterpret_cast<IGpuBuffer*>( userData );
-
- GpuQueue* queue = buf->queue;
-
- //const uint32 index = buf->completedSequence % buf->bufferCount;
- buf->completedSequence++;
-
- //queue->GetCommand( CommandType::Download2D ) = buf->commands[index];
- queue->SubmitCommands();
-}
-
-void GpuQueue::SubmitCommands()
-{
- const uint64 ticket = _commitTicketOut++;
-
- // Wait for our ticket to come up
- while( _commitTicketIn.load( std::memory_order_relaxed ) != ticket );
-
- _queue.Commit();
- _bufferReadySignal.Release();
- //_bufferReadySignal.Signal();
-
- // Use our ticket
- _commitTicketIn.store( ticket+1, std::memory_order_release );
-}
-
-GpuQueue::Command& GpuQueue::GetCommand( CommandType type )
-{
- const uint64 ticket = _cmdTicketOut++;
-
- // Wait for our ticket to come up
- while( _cmdTicketIn.load( std::memory_order_relaxed ) != ticket );
-
- Command* cmd;
- while( !_queue.Write( cmd ) )
- {
- Log::Line( "[GpuQueue] Queue is depleted. Waiting for copies to complete." );
- auto waitTimer = TimerBegin();
-
- // Block and wait until we have commands free in the buffer
- _bufferCopiedSignal.Wait();
-
- Log::Line( "[GpuQueue] Waited %.6lf seconds for availability.", TimerEnd( waitTimer ) );
- }
-
- // Use our ticket
- _cmdTicketIn.store( ticket+1, std::memory_order_release );
-
- ZeroMem( cmd );
- cmd->type = type;
-
- return *cmd;
-}
-
-
-///
-/// Command thread
-///
-void GpuQueue::CopyThreadEntryPoint( GpuQueue* self )
-{
- ASSERT( self );
- self->CopyThreadMain();
- self->_waitForExitSignal.Signal();
-}
-
-void GpuQueue::CopyThreadMain()
-{
- const int32 CMD_BUF_SIZE = 256;
- Command buffers[CMD_BUF_SIZE];
-
- for( ;; )
- {
- _bufferReadySignal.Wait();
-
- if( ShouldExitCopyThread() )
- return;
-
- // 1 command per semaphore release
- int32 bufCount;
- while( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
- // if( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
- {
- ASSERT( bufCount <= CMD_BUF_SIZE );
- _bufferCopiedSignal.Signal();
-
- for( int i = 0; i < bufCount; i++ )
- ExecuteCommand( buffers[i] );
- }
- }
-}
-
-void GpuQueue::ExecuteCommand( const Command& cmd )
-{
-
- // const uint32 index = cmd.sequenceId % BBCU_GPU_BUFFER_MAX_COUNT;
-
- if( cmd.type == CommandType::Copy )
- {
- auto& cpy = *cmd.copy;
-
- const bool isSequentialCopy = cpy.dstStride == cpy.srcStride;
- const size_t totalSize = cpy.height * cpy.width;
-
- byte* dst = (byte*)cpy.dstBuffer;
- const byte* src = (byte*)cpy.srcBuffer;
-
- if( isSequentialCopy )
- memcpy( cpy.dstBuffer, cpy.srcBuffer, totalSize );
- else
- {
- const byte* src = (byte*)cpy.srcBuffer;
- byte* dst = (byte*)cpy.dstBuffer;
-
- for( size_t i = 0; i < cpy.height; i++ )
- {
- memcpy( dst, src, cpy.width );
-
- dst += cpy.dstStride;
- src += cpy.srcStride;
- }
- }
-
- cpy.self->fence.Signal( cpy.sequence+1 );
- cpy.self->copyFence.Signal( cpy.sequence+1 );
-
- if( cpy.callback )
- cpy.callback( cpy.dstBuffer, totalSize, cpy.userData );
- }
- else if( cmd.type == CommandType::Callback )
- {
- cmd.callback.callback( cmd.callback.dstbuffer, cmd.callback.copySize, cmd.callback.userData );
- }
- // else if( cmd.type == CommandType::Sync )
- // {
- // _syncFence.Signal();
- // return;
- // }
- else
- {
- ASSERT( 0 );
- }
-
- // Signal that the pinned buffer is free
- //cpy.finishedSignal->Signal( cpy.sequenceId + 1 );
-}
-
-inline bool GpuQueue::ShouldExitCopyThread()
-{
- return _exitCopyThread.load( std::memory_order_acquire );
+ }, fnCpy ) );
}
diff --git a/cuda/GpuStreams.h b/cuda/GpuStreams.h
index ae1a5b63..2a310059 100644
--- a/cuda/GpuStreams.h
+++ b/cuda/GpuStreams.h
@@ -5,22 +5,127 @@
#include "threading/Fence.h"
#include "threading/Semaphore.h"
#include "util/SPCQueue.h"
+#include "util/StackAllocator.h"
+#include <functional>
-//#define GPU_BUFFER_COUNT
+class DiskBufferBase;
+class DiskBuffer;
+class DiskBucketBuffer;
+struct GpuDownloadBuffer;
+struct GpuUploadBuffer;
+struct GpuQueue;
+typedef std::function<void()> GpuStreamCallback;
+typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
+
+struct PackedCopy
+{
+ struct IGpuBuffer* self;
+ const byte* src;
+ uint32 sequence;
+ uint32 length;
+ uint32 stride;
+ uint32 elementSize;
+ uint32 counts[BBCU_BUCKET_COUNT];
+};
+
+struct DiskDataInfo
+{
+ DiskBufferBase* diskBuffer;
+
+ union {
+ struct {
+ GpuUploadBuffer* self;
+ uint32 sequence;
+ } uploadInfo;
+
+ struct {
+ size_t srcStride;
+ } download2DInfo;
+
+ struct {
+ size_t size;
+ } downloadSequentialInfo;
+ };
+};
+
+struct CopyInfo
+{
+ struct IGpuBuffer* self;
+ uint32 sequence;
+
+ const void* srcBuffer;
+ void* dstBuffer;
+ size_t width;
+ size_t height;
+ size_t dstStride;
+ size_t srcStride;
+
+ // Callback data
+ GpuDownloadCallback callback;
+ void* userData;
+};
// Represents a double-buffered device buffer, which can be used with a GpuStreamQueue to
// make fast transfers (via intermediate pinned memory)
-class IAllocator;
-
enum class GpuStreamKind : uint32
{
Download = 0,
Upload
};
-typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
+struct IGpuBuffer
+{
+ size_t size;
+ uint32 bufferCount; // Number of pinned/device buffers this instance contains
+ void* deviceBuffer[BBCU_GPU_BUFFER_MAX_COUNT];
+ void* pinnedBuffer[BBCU_GPU_BUFFER_MAX_COUNT]; // Pinned host buffer
+
+
+ cudaEvent_t pinnedEvent[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the pinned buffer is ready for use
+
+ union {
+ cudaEvent_t deviceEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the device buffer is ready for use
+ cudaEvent_t events [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use
+ };
+
+
+ union {
+ cudaEvent_t workEvent [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the work stream is done w/ the device buffer, and it's ready for use
+ cudaEvent_t readyEvents [BBCU_GPU_BUFFER_MAX_COUNT]; // User must signal this event when the device buffer is ready for download
+ };
+ cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the buffer is ready for consumption by the device or buffer
+
+ // For dispatching host callbacks.
+ // Each buffer uses its own function?
+ cudaEvent_t callbackLockEvent;
+ cudaEvent_t callbackCompletedEvent;
+
+ Fence fence; // Signals the pinned buffer is ready for use
+ Fence copyFence;
+
+ cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT];
+
+
+ CopyInfo copies[BBCU_BUCKET_COUNT];
+ // union {
+ // PackedCopy packedCopies[BBCU_BUCKET_COUNT]; // For upload buffers
+ DiskDataInfo diskData[BBCU_BUCKET_COUNT];
+ // };
+ // DiskBucketBuffer* diskBucketBuffer = nullptr;
+
+ // #TODO: Remove atomic again
+ uint32 lockSequence; // Index of next buffer to lock
+ uint32 outgoingSequence; // Index of locked buffer that will be downloaded/uploaded
+ std::atomic<uint32> completedSequence; // Index of buffer that finished downloading/uploading
+ std::atomic