diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index 54a1aefd..94a25f7f 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -2,13 +2,5 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
deleted file mode 100644
index 8ae95aee..00000000
--- a/.vscode/c_cpp_properties.json
+++ /dev/null
@@ -1,43 +0,0 @@
-{
- "configurations": [
- {
- "name": "Win32",
- "includePath": [
- "${workspaceFolder}/**"
- ],
- "defines": [
- "_DEBUG",
- "UNICODE",
- "_UNICODE"
- ],
- "windowsSdkVersion": "10.0.19041.0",
- "compilerPath": "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\MSVC\\14.29.30133\\bin\\Hostx64\\x64\\cl.exe",
- "cStandard": "c17",
- "cppStandard": "c++17",
- "intelliSenseMode": "windows-msvc-x64",
- "configurationProvider": "ms-vscode.cmake-tools",
- "forcedInclude": [
- "src/pch.h"
- ]
- },
- {
- "name": "macOS",
- "includePath": [
- "${workspaceFolder}/**"
- ],
- "defines": [],
- "macFrameworkPath": [
- "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks"
- ],
- "compilerPath": "/usr/bin/clang",
- "cStandard": "c17",
- "cppStandard": "c++17",
- "intelliSenseMode": "macos-clang-arm64",
- "configurationProvider": "ms-vscode.cmake-tools",
- "forcedInclude": [
- "src/pch.h"
- ]
- }
- ],
- "version": 4
-}
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 6957af27..bb356736 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -131,19 +131,25 @@
"preLaunchTask" : "build_cuda_debug",
"program": "${workspaceFolder}/build/bladebit_cuda",
-
+
// "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
// "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
// "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
+
+ "args":
+ "-w -n 1 -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --check 100 --check-threshold 2 /home/harold/plot",
+
+ // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
+ // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
+ // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
+ "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
- "args":
- // "-w --compress 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
- "-w --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot",
"windows": {
"type": "cppvsdbg",
"program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
- "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ // "args": "--benchmark -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ "args": "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot -t2 D:/chia_test_plots D:/chia_test_plots",
}
},
@@ -236,7 +242,7 @@
{
"name" : "Tests",
-
+
"type" : "cppdbg",
"osx": {
"MIMode": "lldb",
@@ -245,7 +251,7 @@
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
"preLaunchTask" : "build_tests_debug",
- "console" : "internalConsole",
+ // "console" : "internalConsole",
"program": "${workspaceRoot}/build/tests",
@@ -260,6 +266,8 @@
// { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c06-2023-02-14-21-43-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
{ "name": "bb_clevel" , "value": "1" },
{ "name": "bb_end_clevel" , "value": "1" },
+
+ { "name": "bb_queue_path" , "value": "/home/ubuntu/plot" },
],
"args": [
@@ -273,7 +281,10 @@
// "line-point-deltas"
// "compressed-plot-proof"
// "compressed-plot-qualities"
- "macos-threads"
+ // "macos-threads"
+ // "disk-slices"
+ // "disk-buckets"
+ "[disk-queue]"
]
}
@@ -285,10 +296,16 @@
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
"preLaunchTask" : "build_debug",
- "console" : "internalConsole",
"program": "${workspaceFolder}/build/bladebit",
-
+ // "program": "${workspaceFolder}/build/bladebit_cuda",
+
+ "linux": {
+ "MIMode": "gdb",
+ "miDebuggerPath": "/usr/bin/gdb",
+ "program": "${workspaceFolder}/build/bladebit"
+ },
+
"windows": {
"type" : "cppvsdbg",
"program": "${workspaceFolder}/build/debug/bladebit.exe"
@@ -301,6 +318,11 @@
// "-t", "48",
// "-t", "1",
+ // "validate", "--f7", "2",
+ // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "/home/harold/plot/plot-k32-c01-2023-08-03-04-57-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+
// "-t", "1", "validate", "--f7", "324", "~/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "validate", "--f7", "7", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
// "validate", "--cuda", "--f7", "4", "~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
@@ -322,8 +344,8 @@
// // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-23-15-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
// Simulation
- "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
- "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
+ // "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "-t", "30", "simulate", "-p", "2", "-n", "600", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"
// "-m",
@@ -335,11 +357,18 @@
// "--f7", "3983284117", "/home/harito/plot/tmp/gpu_1.plot",
/// Compare
- // "plotcmp",
- // "/home/harito/plot/tmp/gpu_1.plot.old",
- // "/home/harold/plot-tmpfs/gpu_1.plot",
- // "/home/harito/plot/tmp/gpu_1.plot",
- // "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ "plotcmp",
+ "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ // "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+
+ // Check
+ // "check",
+ // "-n", "100", "--seed", "dc471c4d905ba3a65c6cecb46d97b132c0c98f51d416db5ec5cbdbe95ef2832f",
+ // "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/jm.plot"
]
},
diff --git a/.vscode/settings.json b/.vscode/settings.json
index c6c5274d..6c2da21b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,16 +4,16 @@
"nominmax"
],
"files.associations": {
+ "*.sd": "yaml",
+ "*.userprefs": "xml",
+ "*.make": "makefile",
"Fastfile": "ruby",
"*.plist": "xml",
- "*.sd": "yaml",
"*.json": "jsonc",
"*.ir": "llvm",
"*.qs": "javascript",
"*.ac": "shellscript",
"player": "json",
- "*.userprefs": "xml",
- "*.make": "makefile",
"memory": "cpp",
"cstddef": "cpp",
"string": "cpp",
@@ -113,7 +113,18 @@
"filesystem": "cpp",
"__bits": "cpp",
"csignal": "cpp",
- "cfenv": "cpp"
+ "cfenv": "cpp",
+ "ranges": "cpp",
+ "xhash": "cpp",
+ "xmemory": "cpp",
+ "xstddef": "cpp",
+ "xstring": "cpp",
+ "xtr1common": "cpp",
+ "xtree": "cpp",
+ "xutility": "cpp",
+ "__assert": "cpp",
+ "*.inc": "cpp",
+ "xiosbase": "cpp"
},
"cSpell.words": [
"Ryzen"
@@ -124,7 +135,13 @@
"cmake.preferredGenerators": [
"Unix Makefiles",
"Visual Studio 17 2022"
- ]
+ ],
+ // "cmake.buildArgs": [],
+ "cmake.configureSettings": {
+ "BB_ENABLE_TESTS": "ON",
+ "BB_CUDA_USE_NATIVE": "ON"
+ },
+ "C_Cpp.dimInactiveRegions": false,
// "cmake.generator": "Unix Makefiles"
// "cmake.generator": "Visual Studio 17 2022"
diff --git a/Bladebit.cmake b/Bladebit.cmake
index 6ce0ad97..ffd03d67 100644
--- a/Bladebit.cmake
+++ b/Bladebit.cmake
@@ -227,6 +227,8 @@ set(src_bladebit
src/plotting/PlotWriter.cpp
src/plotting/PlotWriter.h
src/plotting/Tables.h
+ src/plotting/BufferChain.h
+ src/plotting/BufferChain.cpp
src/plotting/f1/F1Gen.h
src/plotting/f1/F1Gen.cpp
@@ -258,6 +260,7 @@ set(src_bladebit
src/tools/PlotReader.cpp
src/tools/PlotReader.h
src/tools/PlotValidator.cpp
+ src/tools/PlotChecker.cpp
src/util/Array.h
src/util/Array.inl
@@ -289,6 +292,18 @@ set(src_bladebit
src/harvesting/GreenReaper.h
src/harvesting/GreenReaperInternal.h
src/harvesting/Thresher.h
+
+ src/plotting/DiskQueue.h
+ src/plotting/DiskQueue.cpp
+ src/plotting/DiskBuffer.h
+ src/plotting/DiskBuffer.cpp
+ src/plotting/DiskBucketBuffer.h
+ src/plotting/DiskBucketBuffer.cpp
+ src/plotting/DiskBufferBase.h
+ src/plotting/DiskBufferBase.cpp
+
+ src/util/MPMCQueue.h
+ src/util/CommandQueue.h
)
target_sources(bladebit_core PUBLIC ${src_bladebit})
diff --git a/BladebitCUDA.cmake b/BladebitCUDA.cmake
index 1fc668fa..8b140c2f 100644
--- a/BladebitCUDA.cmake
+++ b/BladebitCUDA.cmake
@@ -22,6 +22,9 @@ add_executable(bladebit_cuda
cuda/CudaPlotUtil.cu
cuda/GpuStreams.h
cuda/GpuStreams.cu
+ cuda/GpuDownloadStream.cu
+ cuda/GpuQueue.h
+ cuda/GpuQueue.cu
# Harvester
cuda/harvesting/CudaThresher.cu
@@ -42,7 +45,7 @@ target_compile_options(bladebit_cuda PRIVATE
>
$<${is_cuda_debug}:
- -G
+ # -G
>
)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56595d7c..8f72155c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CONFIGURATION_TYPES Release Debug)
@@ -9,7 +10,7 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release"
CACHE STRING "Possible values are: Release, Debug"
FORCE
- )
+ )
endif()
# Allows for CMAKE_MSVC_RUNTIME_LIBRARY
@@ -17,7 +18,7 @@ if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()
-set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
+set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" CACHE STRING "MSVC Runtime Library")
project(bladebit LANGUAGES C CXX ASM)
@@ -83,10 +84,10 @@ endif()
# NOTE: These are mostly sandbox test environment, not proper tests
option(BB_ENABLE_TESTS "Enable tests." OFF)
option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
-option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
+option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF)
option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)
-
+option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." OFF)
#
# Dependencies
@@ -103,7 +104,7 @@ if(NOT ${BB_HARVESTER_ONLY})
GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
GIT_TAG 2.0.2
EXCLUDE_FROM_ALL ${BB_IS_DEPENDENCY}
- )
+)
set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
set(BUILD_BLS_TESTS "0" CACHE STRING "")
@@ -130,6 +131,7 @@ set(is_x86 $,$,$>)
set(is_msvc_c_cpp $>)
+
if(CUDAToolkit_FOUND AND NOT ${NO_CUDA_HARVESTER})
set(have_cuda $)
else()
@@ -143,7 +145,7 @@ endif()
include(Config.cmake)
if(NOT ${BB_HARVESTER_ONLY})
- if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
+ if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION))
include(cmake_modules/EmbedVersion.cmake)
endif()
diff --git a/Config.cmake b/Config.cmake
index 4139b4a9..f3481d6b 100644
--- a/Config.cmake
+++ b/Config.cmake
@@ -1,6 +1,11 @@
# Base interface configuration project
add_library(bladebit_config INTERFACE)
+target_include_directories(bladebit_config INTERFACE
+ ${INCLUDE_DIRECTORIES}
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
target_compile_definitions(bladebit_config INTERFACE
$<${is_release}:
_NDEBUG=1
@@ -22,32 +27,34 @@ target_compile_definitions(bladebit_config INTERFACE
target_compile_options(bladebit_config INTERFACE
- # GCC or Clang
- $<$:
- -Wall
- -Wno-comment
- -Wno-unknown-pragmas
- -g
-
- $<${is_release}:
- -O3
+ $<${is_c_cpp}:
+ # GCC or Clang
+ $<$:
+ -Wall
+ -Wno-comment
+ -Wno-unknown-pragmas
+ -g
+
+ $<${is_release}:
+ -O3
+ >
+
+ $<${is_debug}:
+ -O0
+ >
>
- $<${is_debug}:
- -O0
+ # GCC
+ $<$:
+ -fmax-errors=5
>
- >
-
- # GCC
- $<$:
- -fmax-errors=5
- >
- # Clang
- $<$:
- -ferror-limit=5
- -fdeclspec
- -Wno-empty-body
+ # Clang
+ $<$:
+ -ferror-limit=5
+ -fdeclspec
+ -Wno-empty-body
+ >
>
# MSVC
@@ -129,43 +136,36 @@ cmake_policy(SET CMP0105 NEW)
set(cuda_archs
$<${is_cuda_release}:
-## Maxwell
- ## Tesla/Quadro M series
- -gencode=arch=compute_50,code=sm_50
- ## Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
- -gencode=arch=compute_52,code=sm_52
- ## Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
- -gencode=arch=compute_53,code=sm_53
-## Pascal
- ## GeForce 1000 series
- -gencode=arch=compute_60,code=sm_60
- ## GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
- -gencode=arch=compute_61,code=sm_61
- ## Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
- -gencode=arch=compute_62,code=sm_62
-## Volta
- ## GV100, Tesla V100, Titan V
- -gencode=arch=compute_70,code=sm_70
- ## Tesla V100
- -gencode=arch=compute_72,code=sm_72
- ## Turing
- -gencode=arch=compute_75,code=sm_75
-## Ampere
- ## NVIDIA A100, DGX-A100
- -gencode=arch=compute_80,code=sm_80
- ## GeForce RTX 3000 series, NVIDIA A100
- -gencode=arch=compute_86,code=sm_86
- ## Jetson Orin
- -gencode=arch=compute_87,code=sm_87
-## Lovelace
- ## NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
- -gencode=arch=compute_89,code=sm_89
- ## Future proofing
- -gencode=arch=compute_89,code=compute_89
-## Hopper
- ## NVIDIA H100 (GH100)
- # -gencode=arch=compute_90,code=sm_90
- # -gencode=arch=compute_90a,code=sm_90a
+ $<$:
+ -arch=native
+ >
+
+ $<$>:
+
+ # Maxwell
+ -gencode=arch=compute_50,code=sm_50 # Tesla/Quadro M series
+ -gencode=arch=compute_52,code=sm_52 # Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
+ -gencode=arch=compute_53,code=sm_53 # Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
+
+ # Pascal
+ -gencode=arch=compute_60,code=sm_60 # GeForce 1000 series
+ -gencode=arch=compute_61,code=sm_61 # GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
+ -gencode=arch=compute_62,code=sm_62 # Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
+
+ # Volta
+ -gencode=arch=compute_70,code=sm_70 # GV100, Tesla V100, Titan V
+ -gencode=arch=compute_72,code=sm_72 # Tesla V100
+ -gencode=arch=compute_75,code=sm_75 # Turing
+
+ # Ampere
+ -gencode=arch=compute_80,code=sm_80 # NVIDIA A100, DGX-A100
+ -gencode=arch=compute_86,code=sm_86 # GeForce RTX 3000 series, NVIDIA A100
+ -gencode=arch=compute_87,code=sm_87 # Jetson Orin
+
+ # Lovelace
+ -gencode=arch=compute_89,code=sm_89 # NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
+ -gencode=arch=compute_89,code=compute_89 # Future proofing
+ >
>
$<${is_cuda_debug}:
diff --git a/Harvester.cmake b/Harvester.cmake
index d853a2db..692daa80 100644
--- a/Harvester.cmake
+++ b/Harvester.cmake
@@ -1,5 +1,5 @@
if(NOT ${BB_HARVESTER_STATIC})
- add_library(bladebit_harvester SHARED)
+ add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp)
else()
add_library(bladebit_harvester STATIC)
endif()
@@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE
cuda/CudaF1.cu
cuda/CudaMatch.cu
cuda/CudaPlotUtil.cu
+ cuda/GpuQueue.cu
- # TODO: Remove this, ought not be needed in harvester
+ # TODO: Does this have to be here?
cuda/GpuStreams.cu
+ cuda/GpuDownloadStream.cu
+ src/plotting/DiskBuffer.cpp
+ src/plotting/DiskBucketBuffer.cpp
+ src/plotting/DiskBufferBase.cpp
+ src/plotting/DiskQueue.cpp
>
$<$:
@@ -159,7 +165,7 @@ if(CUDAToolkit_FOUND)
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
- # CUDA_ARCHITECTURES OFF
+ CUDA_ARCHITECTURES OFF
)
endif()
diff --git a/README.md b/README.md
index 9197014e..24d50f30 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,71 @@
-# BladeBit Chia Plotter
+# Bladebit Chia Plotter
[![Release Builds](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml/badge.svg?branch=master&event=push)](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml)
-A high-performance **k32-only**, Chia (XCH) plotter supporting in-RAM and disk-based plotting.
+A high-performance **k32-only**, Chia (XCH) plotter.
+
+Bladebit supports 3 plotting modes:
+ - Fully In-RAM (no drives required), CPU-based mode.
+ - GPU (CUDA-based) mode. Can run fully in-RAM or in disk-hybrid mode.
+ - Disk-based mode
+
+## Usage
+Run `bladebit --help` to see general help. For command-specific help, use `bladebit help <command>`.
+
+## Requirements
+
+**CUDA**
+
+An NVIDIA GPU is required for this mode. It is exposed via the `cudaplot` command in a separate executable, `bladebit_cuda`, and has mainly been tested on consumer cards from the **10xx** series and up.
+
+| Mode | OS | DRAM | VRAM | CUDA capability
+|--------------------------------|----------------|------|------|----------------
+| In-RAM | Linux, Windows | 256G | 8G | 5.2 and up
+| Disk-hybrid 128G | Linux, Windows | 128G | 8G | 5.2 and up
+| Disk-hybrid 16G (WIP) | Linux | 16G | 8G | 5.2 and up
+
+> *NOTE: 16G mode is currently a work in progress; at this stage it only works on Linux, and direct I/O is unavailable in this mode.*
+
+
+**CPU RAM-Only**
+
+Available on Linux, Windows and macOS.
+Requires at least **416G** of system DRAM.
+
+
+**Disk**
+
+Available on Linux, Windows and macOS.
+
+A minimum of **4 GiB of RAM** is required, with lower bucket counts requiring up to 12 GiB of RAM. Roughly **480 GiB of disk space** is required in the default mode, or around **390 GiB of disk space** with `--alternate` mode enabled.
+
+The exact amounts of RAM and disk space required may vary slightly depending on the system's page size and the target disk file system block size (block-alignment is required for direct I/O).
+
+SSDs are highly recommended for disk-based plotting.
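+
+As an illustrative sketch only (keys and paths are placeholders; see the Disk-Based usage section further down for the full option list):
+
+```bash
+# Default disk mode (roughly 480 GiB of temp space)
+./bladebit -f <farmer_public_key> -c <pool_contract_address> diskplot -t1 <temp_dir> <out_dir>
+
+# Alternate mode (roughly 390 GiB of temp space)
+./bladebit -f <farmer_public_key> -c <pool_contract_address> diskplot --alternate -t1 <temp_dir> <out_dir>
+```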
+
+
+## Compressed Plots
+
+Compressed plots are supported in CUDA mode and in RAM-only mode. CPU Disk-based mode does **NOT** currently support compressed plots.
+
+Compressed plots are currently supported for compression levels from **C1** to **C7**. Note that bladebit compression levels are not compatible with other plotters' compression levels. These compression levels are based on the *number of bits dropped from an entry, excluding the minimum bits required to fully drop a table*. At `k=32` the first table is fully excluded from the plot at 16 bits dropped.
+
+> *NOTE: Although higher compression levels are available, support for farming them has not yet been implemented, so they are currently disabled. Support will be added in the future.*
+
+Compression levels are currently roughly equivalent to the following plot sizes.
+
+| Compression Level | Plot Size
+|-------------------|-------------
+| C1 | 87.5 GiB
+| C2 | 86.0 GiB
+| C3 | 84.4 GiB
+| C4 | 82.8 GiB
+| C5 | 81.2 GiB
+| C6 | 79.6 GiB
+| C7 | 78.0 GiB
+
+These sizes may shrink in the future with further compression optimizations.
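+
+For example, a C5 plot (roughly 81.2 GiB per the table above) can be created by passing `-z 5`; the keys and output directory below are placeholders:
+
+```bash
+./bladebit_cuda -z 5 -f <farmer_public_key> -c <pool_contract_address> cudaplot <out_dir>
+```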
+
## Requirements
@@ -39,7 +102,7 @@ SSDs are highly recommended for disk-based plotting.
## Prerequisites
-Linux, Windows and MacOS (both intel and ARM (Apple Silicon)) are supported.
+Linux, Windows and macOS (both Intel and ARM) are supported.
### Linux
@@ -83,8 +146,12 @@ cmake --build . --target bladebit --config Release
The resulting binary will be found under the `build/` directory.
On Windows it will be under `build/Release/`.
+For **bladebit_cuda**, the CUDA toolkit must be installed. The target name is `bladebit_cuda`.
+
+For simplicity, the `build.sh` or `build-cuda.sh` scripts can be used to build. On Windows this requires Git Bash or a similar bash-based shell to run.
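+
+A minimal sketch of the equivalent manual steps for the CUDA target (the same steps `build-cuda.sh` performs):
+
+```bash
+mkdir -p build-release && cd build-release
+cmake .. -DCMAKE_BUILD_TYPE=Release
+cmake --build . --target bladebit_cuda --config Release
+```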
+
## Usage
-Run **bladebit** with the `-h` for complete usage and command line options:
+Run **bladebit** (or **bladebit_cuda**) with the `-h` flag for complete usage and command-line options:
```bash
# Linux & macOS
@@ -93,18 +160,33 @@ build/bladebit -h
# Windows
build/Release/bladebit.exe -h
```
+The bladebit CLI uses the format `bladebit <GLOBAL_OPTIONS> <command> <COMMAND_OPTIONS>`.
-
-The bladebit CLI uses the format `bladebit `.
-
-Use the aforementioned `-h` parameter to get the full list of sub-commands and `GLOBAL_OPTIONS`.
-The `sub_command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub command with the desired command as the parameter:
+Use the aforementioned `-h` parameter to get the full list of commands and `GLOBAL_OPTIONS`.
+The `command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub-command with the desired command as the parameter:
```bash
+bladebit help cudaplot
bladebit help ramplot
bladebit help diskplot
```
+### CUDA
+Basic `cudaplot` usage:
+```bash
+# OG plots
+./bladebit_cuda -f <farmer_public_key> -p <pool_public_key> cudaplot <out_dir>
+
+# Portable plots
+./bladebit_cuda -f <farmer_public_key> -c <pool_contract_address> cudaplot <out_dir>
+
+# Compressed plots
+./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> cudaplot <out_dir>
+
+# 128G disk-hybrid mode
+./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> cudaplot --disk-128 -t1 <temp_dir> <out_dir>
+```
+
### In-RAM
Basic `ramplot` usage:
```bash
@@ -113,6 +195,9 @@ Basic `ramplot` usage:
# Portable plots
./bladebit -f <farmer_public_key> -c <pool_contract_address> ramplot <out_dir>
+
+# Compressed plots
+./bladebit -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> ramplot <out_dir>
```
### Disk-Based
diff --git a/Tests.cmake b/Tests.cmake
index 577e541c..aaba51df 100644
--- a/Tests.cmake
+++ b/Tests.cmake
@@ -1,10 +1,15 @@
include(cmake_modules/FindCatch2.cmake)
-add_executable(tests ${src_bladebit})
+add_executable(tests ${src_bladebit}
+ cuda/harvesting/CudaThresherDummy.cpp
+ tests/TestUtil.h
+ tests/TestDiskQueue.cpp
+)
+
target_compile_definitions(tests PRIVATE
BB_TEST_MODE=1
)
-target_link_libraries(tests PRIVATE bladebit_config Catch2::Catch2WithMain)
+target_link_libraries(tests PRIVATE bladebit_config bladebit_core Catch2::Catch2WithMain)
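+
+# Example of building and running the new disk-queue tests (a sketch; assumes a ./build
+# output directory and uses Catch2's tag filter syntax):
+#   cmake --build build --target tests
+#   ./build/tests "[disk-queue]"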
set_target_properties(tests PROPERTIES
EXCLUDE_FROM_ALL ON
diff --git a/VERSION b/VERSION
index 4a36342f..0c6173b5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1,2 @@
-3.0.0
+3.1.0
+
diff --git a/build-cuda.sh b/build-cuda.sh
new file mode 100755
index 00000000..d7a10154
--- /dev/null
+++ b/build-cuda.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -e
+_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd $_dir
+
+build_dir=build-release
+mkdir -p ${build_dir}
+cd ${build_dir}
+
+cmake .. -DCMAKE_BUILD_TYPE=Release
+cmake --build . --target bladebit_cuda --config Release --clean-first -j24
diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index 6ec042c0..1c346632 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -2,18 +2,25 @@
if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
message("Embedding local build version")
- set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
-
- set(cmd_ver bash)
+ set(cmd_shell bash)
+ set(cmd_ext sh)
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
- set(cmd_ver bash.exe)
+
+ find_program(bash_path NAMES bash.exe NO_CACHE)
+
+ if(${bash_path} MATCHES "-NOTFOUND")
+ set(cmd_shell powershell)
+ set(cmd_ext ps1)
+ else()
+ set(cmd_shell "${bash_path}")
+ endif()
endif()
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
# Remove trailing whitespace incurred in windows gitbash
string(STRIP "${bb_ver_maj}" bb_ver_maj)
@@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI})
add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
endif()
+
+set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
\ No newline at end of file
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
index 80721e9f..a9afd81f 100644
--- a/cuda/CudaPlotConfig.h
+++ b/cuda/CudaPlotConfig.h
@@ -19,7 +19,7 @@
#define BBCU_TABLE_ENTRY_COUNT (1ull<<32)
#define BBCU_BUCKET_ENTRY_COUNT (BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT)
//#define BBCU_XTRA_ENTRIES_PER_SLICE (1024u*64u)
-#define BBCU_XTRA_ENTRIES_PER_SLICE (4096u*1u)
+#define BBCU_XTRA_ENTRIES_PER_SLICE (4096+1024)
#define BBCU_MAX_SLICE_ENTRY_COUNT ((BBCU_BUCKET_ENTRY_COUNT/BBCU_BUCKET_COUNT)+BBCU_XTRA_ENTRIES_PER_SLICE)
#define BBCU_BUCKET_ALLOC_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_BUCKET_COUNT)
#define BBCU_TABLE_ALLOC_ENTRY_COUNT (((uint64)BBCU_BUCKET_ALLOC_ENTRY_COUNT)*BBCU_BUCKET_COUNT)
@@ -42,12 +42,12 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
#ifdef _WIN32
#define DBG_BBCU_DBG_DIR "D:/dbg/cuda/"
#else
- // #define DBG_BBCU_DBG_DIR "/home/harold/plot/dbg/cuda/"
- #define DBG_BBCU_DBG_DIR "/home/harito/plot/dbg/cuda/"
+ #define DBG_BBCU_DBG_DIR "/home/harold/plotdisk/dbg/cuda/"
+ // #define DBG_BBCU_DBG_DIR "/home/harito/plots/dbg/cuda/"
#endif
- // #define DBG_BBCU_REF_DIR "/home/harold/plot/ref/"
+ // #define DBG_BBCU_REF_DIR "/home/harold/plots/ref/"
+
-
// #define BBCU_DBG_SKIP_PHASE_1 1 // Skip phase 1 and load pairs from disk
// #define BBCU_DBG_SKIP_PHASE_2 1 // Skip phase 1 and 2 and load pairs and marks from disk
@@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
// #define DBG_BBCU_P2_WRITE_MARKS 1
// #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+ // #define DBG_BBCU_KEEP_TEMP_FILES 1
#define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index f4e8d909..fc5884b3 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -7,11 +7,16 @@
#include "plotting/PlotTypes.h"
#include "plotting/PlotWriter.h"
#include "GpuStreams.h"
+#include "GpuQueue.h"
#include "util/StackAllocator.h"
#include "fse/fse.h"
#include "threading/Fence.h"
#include "plotting/GlobalPlotConfig.h"
#include "threading/ThreadPool.h"
+#include "plotting/BufferChain.h"
+#include "plotting/DiskBuffer.h"
+#include "plotting/DiskBucketBuffer.h"
+#include
#include "cub/device/device_radix_sort.cuh"
// #include
@@ -29,7 +34,51 @@ using namespace cooperative_groups;
#endif
+struct CudaK32ParkContext
+{
+ Span table7Memory; // Memory buffer reserved for finalizing table7 and writing C parks
+ BufferChain* parkBufferChain;
+ uint32 maxParkBuffers; // Maximum number of park buffers
+ uint64* hostRetainedLinePoints;
+};
+
+struct CudaK32HybridMode
+{
+ // For clarity, these are the file names for the disk buffers
+ // whose disk space will be shared for temp data in both phase 1 and phase 3.
+ // The name indicates their usage and in which phase.
+ static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME = "p1y-p3index.tmp";
+ static constexpr std::string_view META_DISK_BUFFER_FILE_NAME = "p1meta-p3rmap.tmp";
+ static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";
+
+ static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME = META_DISK_BUFFER_FILE_NAME;
+ static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME = Y_DISK_BUFFER_FILE_NAME;
+ static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;
+
+ DiskQueue* temp1Queue; // Tables Queue
+ DiskQueue* temp2Queue; // Metadata Queue (could be the same as temp1Queue)
+ DiskBucketBuffer* metaBuffer; // Enabled in < 128G mode
+ DiskBucketBuffer* yBuffer; // Enabled in < 128G mode
+ DiskBucketBuffer* unsortedL; // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
+ DiskBucketBuffer* unsortedR; // Unsorted R pairs in < 128G mode
+
+ DiskBuffer* tablesL[7];
+ DiskBuffer* tablesR[7];
+
+ GpuDownloadBuffer _tablesL[7];
+ GpuDownloadBuffer _tablesR[7];
+
+ struct
+ {
+ // #NOTE: These buffers share the same file-backed storage
+ // as other buffers in phase 1.
+ DiskBucketBuffer* rMapBuffer; // Step 1
+ DiskBucketBuffer* indexBuffer; // X-step/Step 2
+ DiskBucketBuffer* lpAndLMapBuffer; // X-step/Step 2 (LP) | Step 3 (LMap)
+
+ } phase3;
+};
struct CudaK32Phase2
{
@@ -64,11 +113,12 @@ struct CudaK32Phase3
};
uint64 pairsLoadOffset;
-
+
+ // Device buffers
uint32* devBucketCounts;
uint32* devPrunedEntryCount;
-
+ // Host buffers
union {
RMap* hostRMap;
uint32* hostIndices;
@@ -79,12 +129,6 @@ struct CudaK32Phase3
uint64* hostLinePoints;
};
- // #TODO: Remove this when we sort-out all of the buffer usage
- // uint64* hostMarkingTables[6]; // Set by Phase 2
-
-
- // uint32* hostBucketCounts;
-
uint32 prunedBucketCounts[7][BBCU_BUCKET_COUNT];
uint64 prunedTableEntryCounts[7];
@@ -111,9 +155,10 @@ struct CudaK32Phase3
// Step 2
struct {
GpuUploadBuffer rMapIn; // RMap from step 1
- GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or during L table 1, it is inlined x values
+ GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or, when the L table is the first stored table, it contains inlined x values
GpuDownloadBuffer lpOut; // Output line points (uint64)
GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
+ GpuDownloadBuffer parksOut; // Output P7 parks on the last table
uint32* devLTable[2]; // Unpacked L table bucket
uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
@@ -123,7 +168,7 @@ struct CudaK32Phase3
struct {
GpuUploadBuffer lpIn; // Line points from step 2
GpuUploadBuffer indexIn; // Indices from step 2
- GpuDownloadBuffer mapOut; // lTable for next step 1
+ GpuDownloadBuffer mapOut; // lTable for next step 2
GpuDownloadBuffer parksOut; // Downloads park buffers to host
uint32* hostParkOverrunCount;
@@ -137,7 +182,6 @@ struct CudaK32Phase3
FSE_CTable* devCTable;
uint32* devParkOverrunCount;
- Fence* parkFence;
std::atomic parkBucket;
uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
@@ -178,8 +222,9 @@ struct CudaK32PlotContext
int32 cudaDevice = -1;
cudaDeviceProp* cudaDevProps = nullptr;
bool downloadDirect = false;
+ TableId firstStoredTable = TableId::Table2; // First non-dropped table that has back pointers
ThreadPool* threadPool = nullptr;
-
+
TableId table = TableId::Table1; // Current table being generated
uint32 bucket = 0; // Current bucket being processed
@@ -192,6 +237,7 @@ struct CudaK32PlotContext
PlotRequest plotRequest;
PlotWriter* plotWriter = nullptr;
Fence* plotFence = nullptr;
+ Fence* parkFence = nullptr;
// Root allocations
size_t allocAlignment = 0;
@@ -263,8 +309,6 @@ struct CudaK32PlotContext
uint32* hostBucketSlices = nullptr;
uint32* hostTableL = nullptr;
uint16* hostTableR = nullptr;
- uint32* hostTableSortedL = nullptr;
- uint16* hostTableSortedR = nullptr;
union {
uint32* hostMatchCount = nullptr;
@@ -279,6 +323,14 @@ struct CudaK32PlotContext
CudaK32Phase2* phase2 = nullptr;
CudaK32Phase3* phase3 = nullptr;
+ CudaK32HybridMode* diskContext = nullptr;
+ CudaK32ParkContext* parkContext = nullptr;
+ bool useParkContext = false;
+
+ // Used when '--check' is enabled
+ struct GreenReaperContext* grCheckContext = nullptr;
+ class PlotChecker* plotChecker = nullptr;
+
struct
{
Duration uploadTime = Duration::zero(); // Host-to-device wait time
@@ -359,7 +411,7 @@ inline uint32 CudaK32PlotGetOutputIndex( CudaK32PlotContext& cx )
}
//-----------------------------------------------------------
-inline bool CudaK32PlotIsOutputInterleaved( CudaK32PlotContext& cx )
+inline bool CudaK32PlotIsOutputVertical( CudaK32PlotContext& cx )
{
return CudaK32PlotGetOutputIndex( cx ) == 0;
}
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
index 93099d86..8d2d5094 100644
--- a/cuda/CudaPlotPhase2.cu
+++ b/cuda/CudaPlotPhase2.cu
@@ -20,8 +20,7 @@
static void CudaK32PlotAllocateBuffersTest( CudaK32PlotContext& cx );
#define MARK_TABLE_BLOCK_THREADS 128
-#define P2_BUCKET_COUNT BBCU_BUCKET_COUNT
-#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull<
-__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, byte* marks, const uint64* rTableMarks, const uint32 rOffset )
+__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs,
+ byte* marks, const uint64* rTableMarks, const uint32 rOffset )
{
const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -39,11 +39,11 @@ __global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, c
return;
if constexpr ( useRMarks )
- {
+ {
if( !CuBitFieldGet( rTableMarks, rOffset + gid ) )
return;
}
-
+
const uint32 l = lPairs[gid];
const uint32 r = l + rPairs[gid];
@@ -117,12 +117,12 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,
ASSERT( (uint64)blockCount * blockThreadCount * 64 == tableEntryCount );
-#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
#define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
-#else
+ #else
#define G_PRUNED_COUNTS
-#endif
+ #endif
ASSERT_DOES_NOT_OVERLAP2( bitfield, bytefield, GetMarkingTableBitFieldSize(), GetMarkingTableByteSize() );
@@ -131,8 +131,11 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,
void LoadPairs( CudaK32PlotContext& cx, CudaK32Phase2& p2, const TableId rTable, const uint32 bucket )
{
+ if( bucket >= BBCU_BUCKET_COUNT )
+ return;
+
const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
- const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;//(uint32)std::min( (uint64)BBCU_BUCKET_ENTRY_COUNT, tableEntryCount - p2.pairsLoadOffset );// cx.bucketCounts[(int)rTable][bucket];
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
// uint32* hostPairsL = cx.hostTableSortedL + p2.pairsLoadOffset;
// uint16* hostPairsR = cx.hostTableSortedR + p2.pairsLoadOffset;
@@ -163,42 +166,48 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
byte* devLMarks = p2.devMarkingTable;
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)rTable]->Swap();
+ cx.diskContext->tablesR[(int)rTable]->Swap();
+
+ p2.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
+ p2.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] );
+ }
+
// Zero-out marks
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
// Load first bucket's worth of pairs
LoadPairs( cx, p2, rTable, 0 );
- uint32 rOffset = 0;
- for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
- {
- const bool isLastBucket = bucket + 1 == P2_BUCKET_COUNT;
+ // Mark the table, bucket by bucket
+ uint32 rTableGlobalIndexOffset = 0;
- // Load next set of pairs in the background
- if( !isLastBucket )
- LoadPairs( cx, p2, rTable, bucket + 1 );
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ // Load next set of pairs in the background (if there is another bucket)
+ LoadPairs( cx, p2, rTable, bucket + 1 );
const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
- const uint32 entryCount = isLastBucket ? tableEntryCount - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)): BBCU_BUCKET_ENTRY_COUNT;
- // const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
// Wait for pairs to be ready
const uint32* devLPairs = p2.pairsLIn.GetUploadedDeviceBufferT( cx.computeStream );
const uint16* devRPairs = p2.pairsRIn.GetUploadedDeviceBufferT( cx.computeStream );
-
// Mark
const uint32 blockCount = (uint32)CDiv( entryCount, MARK_TABLE_BLOCK_THREADS );
if( rTable == TableId::Table7 )
CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, nullptr, 0 );
else
- CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rOffset );
-
+ CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rTableGlobalIndexOffset );
+
p2.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
p2.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );
- rOffset += entryCount;
+ rTableGlobalIndexOffset += entryCount;
}
// Convert the bytefield marking table to a bitfield
@@ -209,14 +218,14 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
// Download bitfield marks
// uint64* hostBitField = p2.hostBitFieldAllocator->AllocT( GetMarkingTableBitFieldSize() );
uint64* hostBitField = cx.hostMarkingTables[(int)lTable];
-
+
// #TODO: Do download and copy again, for now just store all of them in this pinned buffer
// cx.phase3->hostMarkingTables[(int)lTable] = hostBitField;
p2.outMarks.Download( hostBitField, GetMarkingTableBitFieldSize(), cx.computeStream );
-
+
// p2.outMarks.DownloadAndCopy( hostBitField, cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize(), cx.computeStream );
// p2.outMarks.Download( cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize() );
-
+
#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
{
@@ -370,6 +379,9 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
MarkTable( cx, p2 );
p2.outMarks.WaitForCompletion();
p2.outMarks.Reset();
+ p2.pairsLIn.Reset();
+ p2.pairsRIn.Reset();
+
const auto elapsed = TimerEnd( timer );
Log::Line( "Marked Table %u in %.2lf seconds.", rTable, elapsed );
@@ -380,7 +392,7 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
}
// Wait for everything to complete
-
+
// p2.outMarks.WaitForCopyCompletion(); // #TODO: Re-activate this when re-enabling copy
p2.outMarks.WaitForCompletion();
p2.outMarks.Reset();
@@ -392,30 +404,39 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
///
void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
- const size_t alignment = cx.allocAlignment;
+ GpuStreamDescriptor desc{};
+
+ desc.entriesPerSlice = P2_ENTRIES_PER_BUCKET;
+ desc.sliceCount = 1;
+ desc.sliceAlignment = cx.allocAlignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr; // Start in direct mode (no intermediate pinned buffers)
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize();
+ }
- IAllocator& devAllocator = *acx.devAllocator;
- IAllocator& pinnedAllocator = *acx.pinnedAllocator;
+ if( !cx.downloadDirect )
+ desc.pinnedAllocator = acx.pinnedAllocator;
CudaK32Phase2& p2 = *cx.phase2;
const size_t markingTableByteSize = GetMarkingTableByteSize();
const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
- p2.devPrunedCount = devAllocator.CAlloc( 1, alignment );
- p2.devMarkingTable = devAllocator.AllocT( markingTableByteSize, alignment );
-
- p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint32 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+ // Device buffers
+ p2.devPrunedCount = acx.devAllocator->CAlloc( 1, acx.alignment );
+ p2.devMarkingTable = acx.devAllocator->AllocT( markingTableByteSize, acx.alignment );
- p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint16 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+ // Upload/Download streams
+ p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun );
+ p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun );
- p2.outMarks = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- markingTableBitFieldSize, devAllocator, alignment, acx.dryRun );
-
- // These buffers are safe to use at this point
- // p2.hostBitFieldAllocator = new StackAllocator( cx.hostTableR, sizeof( uint32 ) * BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ desc.entriesPerSlice = markingTableBitFieldSize;
+ p2.outMarks = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun );
}
@@ -550,7 +571,7 @@ void DbgValidateTable( CudaK32PlotContext& cx )
{
{
uint64 totalCount = 0;
- for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
totalCount += cx.bucketCounts[(int)rt][bucket];
ASSERT( totalCount == cx.tableEntryCounts[(int)rt] );
@@ -562,7 +583,7 @@ void DbgValidateTable( CudaK32PlotContext& cx )
Pairs hostRTablePairs = cx.hostBackPointers[(int)rt];
- for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const uint32 rTableBucketEntryCount = cx.bucketCounts[(int)rt][bucket];
@@ -638,9 +659,13 @@ void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table )
{
char path[512];
+ std::string baseUrl = DBG_BBCU_DBG_DIR;
+ if( cx.cfg.hybrid128Mode )
+ baseUrl += "disk/";
+
Log::Line( "[DEBUG] Writing marking table %u to disk...", table+1 );
{
- sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
+ sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 );
const uint64* marks = cx.hostMarkingTables[(int)table];
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index b19d42c3..8fcdfe2a 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -53,7 +53,7 @@ __global__ void CudaConvertInlinedXsToLinePoints(
{
const Pair p = inXs[gid];
CUDA_ASSERT( p.left || p.right );
-
+
lp = CudaSquareToLinePoint64( p.left, p.right );
bucket = (uint32)(lp >> bucketShift);
offset = atomicAdd( &sharedBuckets[bucket], 1 );
@@ -79,7 +79,6 @@ __global__ void CudaConvertInlinedXsToLinePoints(
outIndices[dst] = rIndex;
}
-
//-----------------------------------------------------------
__global__ void CudaTestPrune(
const uint64 entryCount, const uint32 rOffset, const uint64* rTableMarks, uint32* gPrunedEntryCount )
@@ -236,6 +235,14 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
}
#endif
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.rMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ }
+
+
const uint32 compressionLevel = cx.gCfg->compressionLevel;
// Special case with the starting table, since it has the values inlined already
@@ -259,11 +266,11 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
elapsed = TimerEnd( timer );
Log::Line( " Step 2 completed step in %.2lf seconds.", elapsed );
-
const uint64 baseEntryCount = cx.tableEntryCounts[(int)cx.table];
const uint64 prunedEntryCount = cx.phase3->prunedTableEntryCounts[(int)cx.table];
Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).",
cx.table, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 );
+
}
// else if( compressionLevel > 0 )
// {
@@ -286,7 +293,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 );
cx.table = rTable;
-
+
#if BBCU_DBG_SKIP_PHASE_2
if( rTable < TableId::Table7 )
DbgLoadTablePairs( cx, rTable+1, false );
@@ -340,26 +347,22 @@ void Step1( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s1 = p3.step1;
- const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;
+ if( bucket == 0 && cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)rTable]->Swap();
+ cx.diskContext->tablesR[(int)rTable]->Swap();
+
+ s1.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
+ s1.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] );
+ }
+
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; //BBCU_BUCKET_ENTRY_COUNT;
- // uint32* hostPairsL = cx.hostTableSortedL + p3.pairsLoadOffset;
- // uint16* hostPairsR = cx.hostTableSortedR + p3.pairsLoadOffset;
uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p3.pairsLoadOffset;
uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p3.pairsLoadOffset;
- // if( rTable < TableId::Table7 )
- // {
- // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable + 1].left + p3.pairsLoadOffset;
- // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable + 1].right + p3.pairsLoadOffset;
-
- // s1.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount );
- // s1.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount );
- // }
- // else
- {
- s1.pairsLIn.UploadT( hostPairsL, entryCount );
- s1.pairsRIn.UploadT( hostPairsR, entryCount );
- }
+ s1.pairsLIn.UploadT( hostPairsL, entryCount );
+ s1.pairsRIn.UploadT( hostPairsR, entryCount );
p3.pairsLoadOffset += entryCount;
};
@@ -384,7 +387,6 @@ void Step1( CudaK32PlotContext& cx )
p3.pairsLoadOffset = 0;
LoadBucket( cx, 0 );
-
///
/// Process buckets
///
@@ -403,9 +405,9 @@ void Step1( CudaK32PlotContext& cx )
const uint32* devLPairs = (uint32*)s1.pairsLIn.GetUploadedDeviceBuffer( cx.computeStream );
const uint16* devRPairs = (uint16*)s1.pairsRIn.GetUploadedDeviceBuffer( cx.computeStream );
- const uint32 entryCount = bucket == BBCU_BUCKET_COUNT-1 ?
- ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket
- BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth.
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];// bucket == BBCU_BUCKET_COUNT-1 ?
+ // ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket
+ // BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth.
auto* devRMap = (RMap*)s1.rMapOut.LockDeviceBuffer( cx.computeStream );
@@ -430,7 +432,7 @@ void Step1( CudaK32PlotContext& cx )
s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream );
}
-
+
// Download slice counts
cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();
@@ -464,6 +466,15 @@ void Step1( CudaK32PlotContext& cx )
for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
}
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.rMapBuffer->Swap();
+ }
+
+ // #if _DEBUG
+ // DbgValidateRMap( cx );
+ // #endif
}
//-----------------------------------------------------------
@@ -478,17 +489,25 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& tx = p3.xTable;
- if( bucket == 0 )
- p3.pairsLoadOffset = 0;
-
// Load inlined x's
const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+ if( bucket == 0 )
+ {
+ p3.pairsLoadOffset = 0;
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)rTable]->Swap();
+ tx.xIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
+ }
+ }
+
const Pair* inlinedXs = ((Pair*)cx.hostBackPointers[(int)rTable].left) + p3.pairsLoadOffset;
tx.xIn.UploadT( inlinedXs, entryCount, cx.computeStream );
-
+
p3.pairsLoadOffset += entryCount;
};
@@ -511,8 +530,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
const bool isCompressed = cx.gCfg->compressionLevel > 0;
const uint32 compressedLPBits = isCompressed ? GetCompressedLPBitCount( cx.gCfg->compressionLevel ) : 0;
- const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1;
- const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+ const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1;
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
uint64 tablePrunedEntryCount = 0;
uint32 rTableOffset = 0;
@@ -556,7 +575,7 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
rTableOffset += entryCount;
}
-
+
cudaStream_t downloadStream = tx.lpOut.GetQueue()->GetStream();
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
@@ -592,11 +611,17 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
}
-#if _DEBUG
- // DbgValidateIndices( cx );
- // DbgValidateStep2Output( cx );
- // DbgDumpSortedLinePoints( cx );
-#endif
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
+// #if _DEBUG
+// DbgValidateIndices( cx );
+// // DbgValidateStep2Output( cx );
+// // DbgDumpSortedLinePoints( cx );
+// #endif
}
@@ -606,22 +631,47 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
//-----------------------------------------------------------
void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ static_assert( sizeof( LMap ) == sizeof( uint64 ) );
+
auto& p3 = *cx.phase3;
// Shared allocations
- p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
- p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment );
+ p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
+ p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment );
// Host allocations
- p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index
- p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs
-
- if( !acx.dryRun )
+ if( !cx.cfg.hybrid16Mode )
+ {
+ p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index
+ p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs
+ }
+ else if( !cx.diskContext->phase3.rMapBuffer )
{
- ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
- ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL );
+ const size_t RMAP_SLICE_SIZE = sizeof( RMap ) * P3_PRUNED_SLICE_MAX;
+ const size_t INDEX_SLICE_SIZE = sizeof( uint32 ) * P3_PRUNED_SLICE_MAX;
+ const size_t LP_AND_LMAP_SLICE_SIZE = sizeof( uint64 ) * P3_PRUNED_SLICE_MAX;
+
+ const FileFlags TMP2_QUEUE_FILE_FLAGS = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
+
+ cx.diskContext->phase3.rMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_RMAP_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+ FatalIf( !cx.diskContext->phase3.rMapBuffer, "Failed to create R Map disk buffer." );
+
+ cx.diskContext->phase3.indexBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_INDEX_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, INDEX_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+ FatalIf( !cx.diskContext->phase3.indexBuffer, "Failed to create index disk buffer." );
+
+ cx.diskContext->phase3.lpAndLMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+ FatalIf( !cx.diskContext->phase3.lpAndLMapBuffer, "Failed to create LP/LMap disk buffer." );
}
- // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
+
+ #if _DEBUG
+ if( !acx.dryRun && !cx.cfg.hybrid128Mode )
+ {
+ ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
+ }
+ #endif
if( acx.dryRun )
{
@@ -687,74 +737,156 @@ void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
//-----------------------------------------------------------
void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ GpuStreamDescriptor uploadDesc = desc;
+ if( cx.cfg.hybrid128Mode )
+ {
+ uploadDesc.pinnedAllocator = acx.pinnedAllocator;
+
+ if( cx.cfg.hybrid16Mode )
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& tx = cx.phase3->xTable;
tx.devRMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment );
- tx.xIn = cx.gpuUploadStream[0]->CreateUploadBuffer(sizeof(Pair) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, acx.alignment, acx.dryRun);
- tx.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
- tx.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
+
+    tx.xIn      = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( uploadDesc, acx.dryRun );
+    tx.lpOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
+    tx.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( desc, acx.dryRun );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ tx.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+ }
}
//-----------------------------------------------------------
void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ GpuStreamDescriptor uploadDesc = desc;
+ if( cx.cfg.hybrid128Mode )
+ {
+ uploadDesc.pinnedAllocator = acx.pinnedAllocator;
+
+ if( cx.cfg.hybrid16Mode )
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& s1 = cx.phase3->step1;
const size_t alignment = acx.alignment;
- s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
- s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
- s1.rMapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+    s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( uploadDesc, acx.dryRun );
+    s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint16>( uploadDesc, acx.dryRun );
+    s1.rMapOut  = cx.gpuDownloadStream[0]->CreateDownloadBufferT<RMap>( desc, acx.dryRun );
s1.rTableMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
+ }
}
//-----------------------------------------------------------
void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ GpuStreamDescriptor uploadDesc = desc;
+ if( cx.cfg.hybrid16Mode )
+ {
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& s2 = cx.phase3->step2;
const size_t alignment = acx.alignment;
- s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+    s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT<RMap>( desc, acx.dryRun );
+    s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT<LMap>( desc, acx.dryRun );
- s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+    s2.lpOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
+    s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( desc, acx.dryRun );
- s2.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
- s2.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
-
+ const size_t devParkAllocSize = P3_PARK_7_SIZE * P3_MAX_P7_PARKS_PER_BUCKET;
+
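+    // Parks are downloaded as one contiguous run per bucket, so use a single
+    // slice sized for the full per-bucket park allocation.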
+ GpuStreamDescriptor parksDesc = desc;
+ parksDesc.sliceCount = 1;
+ parksDesc.entriesPerSlice = devParkAllocSize;
+ parksDesc.sliceAlignment = RoundUpToNextBoundaryT( P3_PARK_7_SIZE, sizeof( uint64 ) );
+
+    s2.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( parksDesc, acx.dryRun );
+
 s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
 s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
+ s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+
+ s2.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+ }
}
//-----------------------------------------------------------
void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
+ GpuStreamDescriptor desc{};
+ desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = acx.alignment;
+ desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ desc.deviceAllocator = acx.devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ desc.pinnedAllocator = acx.pinnedAllocator;
+ }
+
auto& s3 = cx.phase3->step3;
const size_t alignment = acx.alignment;
s3.hostParkOverrunCount = acx.pinnedAllocator->CAlloc( 1 );
- const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET;
+    s3.lpIn    = cx.gpuUploadStream[0]->CreateUploadBufferT<uint64>( desc, acx.dryRun );
+    s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( desc, acx.dryRun );
- s3.lpIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+    s3.mapOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
- s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
- sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET;
- s3.mapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
- sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ GpuStreamDescriptor parksDesc = desc;
+ parksDesc.sliceCount = 1;
+ parksDesc.entriesPerSlice = devParkAllocSize;
+ parksDesc.sliceAlignment = RoundUpToNextBoundaryT( DEV_MAX_PARK_SIZE, sizeof( uint64 ) );
- s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBuffer(devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun);
+    s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( parksDesc, acx.dryRun );
if( acx.dryRun )
{
@@ -774,11 +906,16 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
s3.devDeltaLinePoints = acx.devAllocator->CAlloc( linePointAllocCount, alignment );
s3.devIndices = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
- // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment );
- // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment );
-
s3.devCTable = acx.devAllocator->AllocT( P3_MAX_CTABLE_SIZE, alignment );
s3.devParkOverrunCount = acx.devAllocator->CAlloc( 1 );
+
+ if( !acx.dryRun && cx.cfg.hybrid16Mode )
+ {
+ s3.lpIn .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+
+ s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
+ }
}
@@ -827,6 +964,9 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
 RMap* rMap = bbcvirtallocbounded<RMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ // blake3_hasher hasher;
+ // blake3_hasher_init( &hasher );
+
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const RMap* reader = p3.hostRMap + bucket * P3_PRUNED_BUCKET_MAX;
@@ -838,7 +978,7 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
{
const uint32 copyCount = s1.prunedBucketSlices[slice][bucket];
bbmemcpy_t( writer, reader, copyCount );
-
+
writer += copyCount;
entryCount += copyCount;
@@ -858,13 +998,18 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
const uint32 right = map.dstR - bucketOffset;
ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
- CUDA_ASSERT( left < right );
-
+ ASSERT( left < right );
}
+
+ // Hash bucket
+ // blake3_hasher_update( &hasher, rMap, sizeof( RMap ) * entryCount );
}
+ // Print hash
+ // DbgFinishAndPrintHash( hasher, "r_map", (uint)cx.table + 1 );
+
bbvirtfreebounded( rMap );
- Log::Line( "[DEBUG] CPU OK" );
+ Log::Line( " [DEBUG] CPU OK" );
}
// Validate in CUDA
@@ -899,10 +1044,12 @@ void DbgValidateRMap( CudaK32PlotContext& cx )
p3.step2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
}
- Log::Line( "[DEBUG] CUDA OK" );
+ Log::Line( " [DEBUG] CUDA OK" );
p3.step2.lMapIn.Reset();
}
+
+ Log::Line( "[DEBUG] RMap validation OK" );
}
//-----------------------------------------------------------
@@ -922,23 +1069,45 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
const uint32* reader = p3.hostIndices;
const size_t readerStride = P3_PRUNED_SLICE_MAX * 3;
-
uint64 entryCount = 0;
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
- for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ if( cx.cfg.hybrid16Mode )
+ {
+ const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket];
+
+ cx.diskContext->phase3.indexBuffer->OverrideReadSlices( bucket, sizeof( uint32 ), sizeSlices, BBCU_BUCKET_COUNT );
+ cx.diskContext->phase3.indexBuffer->ReadNextBucket();
+    const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs<uint32>();
+ ASSERT( readBucket.Length() == p3.prunedBucketCounts[(int)cx.table][bucket] );
+
+ bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() );
+
+ idxWriter += readBucket.Length();
+ entryCount += readBucket.Length();
+ }
+ else
{
- const uint32 copyCount = s2.prunedBucketSlices[bucket][slice];
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s2.prunedBucketSlices[slice][bucket];
- bbmemcpy_t( idxWriter, reader, copyCount );
+ bbmemcpy_t( idxWriter, reader, copyCount );
- idxWriter += copyCount;
- entryCount += copyCount;
- reader += readerStride;
+ idxWriter += copyCount;
+ entryCount += copyCount;
+ reader += readerStride;
+ }
}
}
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.indexBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
RadixSort256::Sort( pool, indices, idxTmp, entryCount );
@@ -949,10 +1118,36 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
ASSERT( indices[i] > indices[i-1] );
}
+ DbgHashDataT( indices, entryCount, "indices", (uint32)cx.table+1 );
+
bbvirtfreebounded( indices );
bbvirtfreebounded( idxTmp );
- Log::Line( "[DEBUG] OK" );
+ Log::Line( "[DEBUG] Index validation OK" );
+}
+
+//-----------------------------------------------------------
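+// Hashes a contiguous buffer with BLAKE3 and logs its digest (used by the debug validators above).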
+void DbgHashData( const void* data, size_t size, const char* name, uint32 index )
+{
+ blake3_hasher hasher;
+ blake3_hasher_init( &hasher );
+ blake3_hasher_update( &hasher, data, size );
+
+ DbgFinishAndPrintHash( hasher, name, index );
+}
+
+//-----------------------------------------------------------
+void DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index )
+{
+ constexpr size_t HASH_LEN = 256/8;
+ byte output[HASH_LEN];
+ blake3_hasher_finalize( &hasher, output, HASH_LEN );
+
+ Log::Write( "[DEBUG] %s_%u hash: 0x", name, index );
+ for( uint32 i = 0; i < HASH_LEN; i++ )
+ Log::Write( "%02x", output[i] );
+
+ Log::NewLine();
}
#endif
diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h
index 1a4bd7a8..34909123 100644
--- a/cuda/CudaPlotPhase3Internal.h
+++ b/cuda/CudaPlotPhase3Internal.h
@@ -10,8 +10,18 @@
#include "plotdisk/jobs/IOJob.h"
#include "algorithm/RadixSort.h"
#include "plotmem/ParkWriter.h"
+ #include "b3/blake3.h"
void DbgValidateStep2Output( CudaK32PlotContext& cx );
+
+ void DbgHashData( const void* data, size_t size, const char* name, uint32 index );
+
+ void DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index );
+    template<typename T>
+ inline void DbgHashDataT( const T* data, uint64 count, const char* name, uint32 index )
+ {
+ DbgHashData( data, (size_t)count * sizeof( T ), name, index );
+ }
#endif
using LMap = CudaK32Phase3::LMap;
@@ -27,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) );
#define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT)
#define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2)
-static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
-
-//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1;
-//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT;
-//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS;
-
-// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT ));
-//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
- //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
-// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT;
-
-//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT;
-// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
-// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
-// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT;
+static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
+static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+static constexpr size_t P3_PARK_7_SIZE = CalculatePark7Size( BBCU_K );
+static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE );
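+// The per-bucket line point buffer (one uint64 per entry) is reused as the Park 7
+// download target, so it must be able to hold a full bucket's worth of parks.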
static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 );
static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
index ac13e915..3a7a6449 100644
--- a/cuda/CudaPlotPhase3Step2.cu
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -248,7 +248,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
s2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
};
-
+
const TableId rTable = cx.table;
const TableId lTable = rTable-1;
@@ -309,7 +309,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
const auto* rMap = (RMap*)s2.rMapIn.GetUploadedDeviceBuffer( cx.computeStream );
const uint32 rEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];
-
+
uint64* devOutLPs = (uint64*)s2.lpOut .LockDeviceBuffer( cx.computeStream );
uint32* devOutIndices = (uint32*)s2.indexOut.LockDeviceBuffer( cx.computeStream );
@@ -317,7 +317,6 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
s2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
rTableOffset += rEntryCount;
-
// Horizontal download (write 1 row)
s2.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
s2.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX*3, P3_PRUNED_SLICE_MAX, cx.computeStream );
@@ -354,7 +353,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );
-
+
memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
@@ -370,8 +369,15 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
}
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.rMapBuffer->Swap();
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
// #if _DEBUG
- // if( cx.table > TableId::Table3 )
+ // // if( cx.table > TableId::Table3 )
// {
// DbgValidateStep2Output( cx );
// }
@@ -402,23 +408,26 @@ void WritePark7( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s2 = p3.step2;
-
+
// Load initial bucket
LoadBucket( cx, 0 );
// Begin park 7 table in plot
cx.plotWriter->BeginTable( PlotTable::Table7 );
- constexpr size_t parkSize = CalculatePark7Size( BBCU_K );
+ constexpr size_t parkSize = P3_PARK_7_SIZE;
constexpr size_t parkFieldCount = parkSize / sizeof( uint64 );
static_assert( parkFieldCount * sizeof( uint64 ) == parkSize );
+ GpuDownloadBuffer& parkDownloader = cx.useParkContext ? s2.parksOut : s2.lpOut;
- GpuDownloadBuffer& parkDownloader = s2.lpOut;
-
- constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+ constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET;
static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );
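+    // When a park context is used, parks cycle through a chain of pinned buffers shared
+    // with the plot writer, so reset the chain before processing the first bucket.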
+ if( cx.useParkContext )
+ {
+ cx.parkContext->parkBufferChain->Reset();
+ }
// Host stuff
constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
@@ -427,9 +436,10 @@ void WritePark7( CudaK32PlotContext& cx )
const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table];
const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark );
-    byte*   hostParks           = hostAllocator.AllocT<byte>( totalParkCount * parkSize );
-    byte*   hostParkWriter      = hostParks;
-    uint32* hostLastParkEntries = hostAllocator.CAlloc<uint32>( kEntriesPerPark );
+    byte*   hostParks           = cx.useParkContext ? nullptr : hostAllocator.AllocT<byte>( totalParkCount * parkSize );
+    byte*   hostParksWriter     = cx.useParkContext ? nullptr : hostParks;
+    uint32* hostLastParkEntries = cx.useParkContext ? (uint32*)cx.parkContext->hostRetainedLinePoints :
+                                                      hostAllocator.CAlloc<uint32>( kEntriesPerPark );
static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 );
uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark;
@@ -479,14 +489,38 @@ void WritePark7( CudaK32PlotContext& cx )
// Download parks & write to plot
const size_t downloadSize = parkCount * parkSize;
- parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize,
+ if( cx.useParkContext )
+ {
+ ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() );
+
+ // Override the park buffer to be used when using a park context
+ hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+ // Wait for the next park buffer to be available
+ parkDownloader.HostCallback([&cx]{
+ (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+ });
+ }
+
+ parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize,
[]( void* parksBuffer, size_t size, void* userData ) {
 auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
cx.plotWriter->WriteTableData( parksBuffer, size );
+
+ // Release the buffer after the plot writer is done with it.
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
+
}, &cx, cx.computeStream );
- hostParkWriter += downloadSize;
+ hostParksWriter += downloadSize;
+ if( cx.useParkContext )
+ hostParksWriter = nullptr;
}
// Wait for parks to complete downloading
@@ -499,9 +533,19 @@ void WritePark7( CudaK32PlotContext& cx )
// Was there a left-over park?
if( retainedEntryCount > 0 )
{
+ if( cx.useParkContext )
+ hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
// Submit last park to plot
- TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
- cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+ TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter );
+ cx.plotWriter->WriteTableData( hostParksWriter, parkSize );
+
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
}
cx.plotWriter->EndTable();
@@ -534,6 +578,7 @@ void _DbgValidateOutput( CudaK32PlotContext& cx )
auto& s2 = p3.step2;
// Validate line points...
+ Log::Debug( "[DEBUG] Validating line points..." );
 uint64* refLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
 uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
 uint32* indices = bbcvirtallocboundednuma<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
@@ -614,9 +659,13 @@ void _DbgValidateOutput( CudaK32PlotContext& cx )
}
}
+ DbgHashDataT( refLinePoints, prunedEntryCount, "line_points", (uint32)cx.table+1 );
+
bbvirtfreebounded( refLinePoints );
bbvirtfreebounded( tmpLinePoints );
bbvirtfreebounded( indices );
+
+ Log::Debug( "[DEBUG] Line point validation OK" );
}
#endif
@@ -659,6 +708,8 @@ void DbgDumpSortedLinePoints( CudaK32PlotContext& cx )
ThreadPool& pool = *cx.threadPool; //DbgGetThreadPool( cx );
RadixSort256::Sort( pool, sortedLinePoints, tmpLinePoints, prunedEntryCount );
+ // DbgHashDataT( sortedLinePoints, prunedEntryCount, "sorted_line_points", (uint32)cx.table+1 );
+
// Write to disk
{
char filePath[1024] = {};
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index 3949bd8c..c8f9337b 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -52,12 +52,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Load CTable
const bool isCompressed = cx.gCfg->compressionLevel > 0 && lTable <= (TableId)cx.gCfg->numDroppedTables;
- const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.subtSizeBits;
+ const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.stubSizeBits;
const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
-
+
+ const bool isFirstSerializedTable = firstTable == rTable;
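+    // Only tables after the first serialized one have a previously written park buffer to wait on.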
+
const size_t cTableSize = !isCompressed ? sizeof( CTable_0 ) : cx.gCfg->cTableSize; ASSERT( cTableSize <= P3_MAX_CTABLE_SIZE );
const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;
-
+
// (upload must be loaded before first bucket, on the same stream)
CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
s3.lpIn.GetQueue()->GetStream() ) );
@@ -75,13 +77,32 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
const size_t hostParkSize = isCompressed ? cx.gCfg->compressionInfo.tableParkSize : CalculateParkSize( lTable );
ASSERT( DEV_MAX_PARK_SIZE >= hostParkSize );
- // #TODO: Move this allocation to the beginning
- if( s3.parkFence == nullptr )
- s3.parkFence = new Fence();
-
byte* hostParksWriter = (byte*)cx.hostBackPointers[(int)rTable].left; //(byte*)cx.hostTableL;
uint64* hostRetainedEntries = nullptr;
+ if( cx.cfg.hybrid128Mode )
+ {
+ hostParksWriter = (byte*)cx.hostTableL;
+
+ if( !isFirstSerializedTable && !cx.useParkContext )
+ {
+        // Ensure this buffer is no longer in use (the last table has finished writing to disk).
+ const bool willWaitForParkFence = cx.parkFence->Value() < BBCU_BUCKET_COUNT;
+ if( willWaitForParkFence )
+ Log::Line( " Waiting for parks buffer to become available." );
+
+ Duration parkWaitTime;
+ cx.parkFence->Wait( BBCU_BUCKET_COUNT, parkWaitTime );
+
+ if( willWaitForParkFence )
+ Log::Line( " Waited %.3lf seconds for the park buffer to be released.", TicksToSeconds( parkWaitTime ) );
+ }
+ }
+ if( cx.useParkContext )
+ {
+ cx.parkContext->parkBufferChain->Reset();
+ }
+
// if( !isCompressed && lTable == TableId::Table1 )
// hostParksWriter = (byte*)cx.hostBackPointers[(int)TableId::Table2].left;
@@ -101,7 +122,7 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Set initial event LP stream event as set.
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
- s3.parkFence->Reset( 0 );
+ cx.parkFence->Reset( 0 );
s3.parkBucket = 0;
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
@@ -200,7 +221,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// No more buckets so we have to compress this last park on the CPU
CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
- hostRetainedEntries = (uint64*)( hostParksWriter + hostParkSize * parkCount );
+ hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints :
+ (uint64*)( hostParksWriter + hostParkSize * parkCount );
CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
}
}
@@ -209,6 +231,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Download parks
+ if( cx.useParkContext )
+ {
+ ASSERT( hostParkSize * parkCount <= cx.parkContext->parkBufferChain->BufferSize() );
+
+ // Override the park buffer to be used when using a park context
+ hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+ // Wait for the next park buffer to be available
+ s3.parksOut.HostCallback([&cx]{
+ (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+ });
+ }
+
s3.parksOut.Download2DWithCallback( hostParksWriter, hostParkSize, parkCount, hostParkSize, DEV_MAX_PARK_SIZE,
[]( void* parksBuffer, size_t size, void* userData ) {
@@ -216,11 +251,22 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
auto& s3 = cx.phase3->step3;
cx.plotWriter->WriteTableData( parksBuffer, size );
- cx.plotWriter->SignalFence( *s3.parkFence, ++s3.parkBucket );
+ cx.plotWriter->SignalFence( *cx.parkFence, ++s3.parkBucket );
+
+ // Release the buffer after the plot writer is done with it.
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
}, &cx, lpStream, cx.downloadDirect );
hostParksWriter += hostParkSize * parkCount;
+
+ if( cx.useParkContext )
+ hostParksWriter = nullptr;
}
// Copy park overrun count
@@ -242,18 +288,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Was there a left-over park?
if( retainedLPCount > 0 )
{
- ASSERT( hostRetainedEntries );
-
+ if( cx.useParkContext )
+ hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
uint64 lastParkEntries[kEntriesPerPark];
bbmemcpy_t( lastParkEntries, hostRetainedEntries, retainedLPCount );
WritePark( hostParkSize, retainedLPCount, lastParkEntries, hostParksWriter, stubBitSize, hostCTable );
cx.plotWriter->WriteTableData( hostParksWriter, hostParkSize );
+
+ if( cx.useParkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
}
cx.plotWriter->EndTable();
- // Update buckets counts for L table
- // #TODO: These should match Step 1 pruned entry count I believe, so just copy?
memset( p3.prunedBucketCounts[(int)rTable], 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT );
for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
@@ -266,12 +318,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
s3.lpIn .Reset();
s3.indexIn.Reset();
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+ cx.diskContext->phase3.indexBuffer->Swap();
+ }
+
// #if _DEBUG
// //if( cx.table >= TableId::Table6 )
// //{
- // DbgValidateLMap( cx );
- // DbgValidateLMapData( cx );
+ // // DbgValidateLMap( cx );
+ // // DbgValidateLMapData( cx );
+
// // DbgSaveLMap( cx );
// //}
// #endif
@@ -386,7 +445,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx )
char path[512];
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
-
+
const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
int err;
FatalIf( !IOJob::WriteToFile( path, p3.hostLMap, writeSize, err ),
@@ -399,7 +458,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx )
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
FatalIf( !IOJob::WriteToFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
"[DEBUG] Failed to write LMap buckets with error: %d", err );
-
+
Log::Line( " [DEBUG] OK" );
}
@@ -410,7 +469,7 @@ void DbgLoadLMap( CudaK32PlotContext& cx )
char path[512];
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
-
+
const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
int err;
FatalIf( !IOJob::ReadFromFile( path, p3.hostLMap, writeSize, err ),
@@ -438,10 +497,12 @@ void DbgValidateLMap( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s3 = p3.step3;
-    LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_TABLE_ENTRY_COUNT );
+    LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
-
{
+ // blake3_hasher hasher;
+ // blake3_hasher_init( &hasher );
+
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
@@ -471,14 +532,18 @@ void DbgValidateLMap( CudaK32PlotContext& cx )
ASSERT( map.sourceIndex || map.sortedIndex );
ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
}
+
+ // Hash bucket
+ // blake3_hasher_update( &hasher, lMap, sizeof( LMap ) * entryCount );
}
-
+ // Print hash
+ // DbgFinishAndPrintHash( hasher, "l_map", (uint)cx.table + 1 );
}
bbvirtfreebounded( lMap );
- Log::Line( "[DEBUG] OK" );
+ Log::Line( "[DEBUG] LMap OK" );
}
//-----------------------------------------------------------
@@ -566,7 +631,7 @@ void _DbgValidateLMapData( CudaK32PlotContext& cx )
bbvirtfreebounded( dstIndices );
bbvirtfreebounded( tmpIndices );
- Log::Line( "[DEBUG] OK" );
+ Log::Line( "[DEBUG] LMap uniqueness OK" );
}
#endif
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 8e0458dd..80ba8b0e 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -9,6 +9,10 @@
#include "plotting/CTables.h"
#include "plotting/TableWriter.h"
#include "plotting/PlotTools.h"
+#include "util/VirtualAllocator.h"
+#include "harvesting/GreenReaper.h"
+#include "tools/PlotChecker.h"
+
// TEST/DEBUG
#if _DEBUG
@@ -36,6 +40,7 @@ static void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStrea
static void AllocBuffers( CudaK32PlotContext& cx );
static void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinnedAllocator, bool dryRun );
template<typename T>
static void UploadBucketToGpu( CudaK32PlotContext& context, TableId table, const uint32* hostPtr, T* devPtr, uint64 bucket, uint64 stride );
@@ -53,11 +58,37 @@ GPU-based (CUDA) plotter
[OPTIONS]:
-h, --help : Shows this help message and exits.
-d, --device : Select the CUDA device index. (default=0)
+
+ --disk-128 : Enable hybrid disk plotting for 128G system RAM.
+                       Requires --temp1 and --temp2 to be set.
+
+ --disk-16 : (experimental) Enable hybrid disk plotting for 16G system RAM.
+                       Requires --temp1 and --temp2 to be set.
+
+ -t1, --temp1 : Temporary directory 1. Used for longer-lived, sequential writes.
+
+ -t2, --temp2           : Temporary directory 2. Used for temporary, shorter-lived reads and writes.
+ NOTE: If only one of -t1 or -t2 is specified, both will be
+ set to the same directory.
+
+ --check : Perform a plot check for proofs on the newly created plot.
+
+ --check-threshold : Proof threshold rate below which the plots that don't pass
+ the check will be deleted.
+                       That is, the number of proofs fetched / proof check count
+                       must be greater than or equal to this threshold to pass.
+ (default=0.6).
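+                       For example, --check 100 with --check-threshold 0.6 keeps the plot
+                       only if at least 60 of the 100 checked proofs are found.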
)";
///
/// CLI
///
+//-----------------------------------------------------------
+void CudaK32PlotterPrintHelp()
+{
+ Log::Line( USAGE );
+}
+
//-----------------------------------------------------------
void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
{
@@ -68,18 +99,70 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
{
if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
continue;
- if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
+ if( cli.ReadSwitch( cfg.hybrid128Mode, "--disk-128" ) )
+ continue;
+ if( cli.ReadSwitch( cfg.hybrid16Mode, "--disk-16" ) )
+ {
+ cfg.hybrid128Mode = true;
+ continue;
+ }
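+        // If only one temp directory is given, mirror it into the other so both are always set.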
+ if( cli.ReadStr( cfg.temp1Path, "-t1", "--temp1" ) )
+ {
+ if( !cfg.temp2Path )
+ cfg.temp2Path = cfg.temp1Path;
+ continue;
+ }
+ if( cli.ReadStr( cfg.temp2Path, "-t2", "--temp2" ) )
+ {
+ if( !cfg.temp1Path )
+ cfg.temp1Path = cfg.temp2Path;
+ continue;
+ }
+ if( cli.ReadUnswitch( cfg.temp1DirectIO, "--no-t1-direct" ) )
+ continue;
+ if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) )
+ continue;
+
+ if( cli.ReadU64( cfg.plotCheckCount, "--check" ) )
+ continue;
+ if( cli.ReadF64( cfg.plotCheckThreshhold, "--check-threshold" ) )
continue;
+ // if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
+ // continue;
if( cli.ArgMatch( "--help", "-h" ) )
{
- Log::Line( USAGE );
+ CudaK32PlotterPrintHelp();
exit( 0 );
}
else
break; // Let the caller handle it
}
-
 // The rest should be output directories, parsed by the global config parser.
+
+
+ if( cfg.hybrid128Mode && gCfg.compressionLevel <= 0 )
+ {
+ Log::Error( "Error: Cannot plot classic (uncompressed) plots in 128G or 64G mode." );
+ Exit( -1 );
+ }
+
+ if( cfg.hybrid16Mode )
+ {
+ #if PLATFORM_IS_WINDOWS
+ Log::Error( "16G mode is currently unsupported on Windows." );
+ Exit( -1 );
+ #else
+ Log::Line( "Warning: 16G mode is experimental and still under development." );
+ Log::Line( " Please use the --check parameter to validate plots when using this mode." );
+
+ if( cfg.temp1DirectIO || cfg.temp2DirectIO )
+ {
+ Log::Line( " Direct I/O not supported in 16G mode at the moment. Disabing it." );
+ cfg.temp1DirectIO = cfg.temp2DirectIO = false;
+ }
+
+ #endif
+ }
}
//-----------------------------------------------------------
@@ -97,10 +180,25 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
auto& cx = *new CudaK32PlotContext{};
outContext = &cx;
- cx.cfg = cfg;
- cx.gCfg = cfg.gCfg;
+ cx.cfg = cfg;
+ cx.gCfg = cfg.gCfg;
+
+ cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
Log::Line( "[Bladebit CUDA Plotter]" );
+ Log::Line( " Host RAM : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+
+ if( cx.cfg.plotCheckCount == 0 )
+ Log::Line( " Plot checks : disabled" );
+ else
+ {
+ Log::Line( " Plot checks : enabled ( %llu )", (llu)cx.cfg.plotCheckCount );
+ Log::Line( " Plot check threshold: %.3lf", cx.cfg.plotCheckThreshhold );
+ }
+
+ // Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" );
+ Log::NewLine();
+
CudaInit( cx );
CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
@@ -119,27 +217,89 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
}
cx.threadPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
+ cx.plotFence = new Fence();
+ cx.parkFence = new Fence();
- #if __linux__
- cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+ #if _WIN32
+    // #MAYBE: Add a configurable option to enable direct downloads on Windows?
+    // On Windows, always default to using intermediate pinned buffers.
+ cx.downloadDirect = false;
#else
- // #TODO: One windows, check if we have enough memory, if so, default to true.
- cx.downloadDirect = true ;//false;
+ cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
#endif
// cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
// if( cx.gCfg->benchmarkMode )
// cx.plotWriter->EnableDummyMode();
- cx.plotFence = new Fence();
+ // Need to do allocations for park serialization differently under the following conditions
+ if( cx.downloadDirect || cx.cfg.hybrid128Mode )
+ {
+ cx.parkContext = new CudaK32ParkContext{};
- cx.phase2 = new CudaK32Phase2{};
- cx.phase3 = new CudaK32Phase3{};
+ if( cx.cfg.hybrid16Mode )
+ cx.useParkContext = true;
+ }
+
+ // Check for hybrid mode
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext = new CudaK32HybridMode{};
+ cx.diskContext->temp1Queue = new DiskQueue( cx.cfg.temp1Path );
+
+ // Re-use the same queue for temp2 if temp1 and temp2 are pointing to the same path
+ auto t1Path = std::filesystem::canonical( cx.cfg.temp1Path );
+ auto t2Path = std::filesystem::canonical( cx.cfg.temp2Path );
+ if( t1Path.compare( t2Path ) == 0 )
+ cx.diskContext->temp2Queue = cx.diskContext->temp1Queue;
+ else
+ cx.diskContext->temp2Queue = new DiskQueue( cx.cfg.temp2Path );
+ }
+
+ cx.phase2 = new CudaK32Phase2{};
+ cx.phase3 = new CudaK32Phase3{};
// #TODO: Support non-warm starting
Log::Line( "Allocating buffers (this may take a few seconds)..." );
AllocBuffers( cx );
InitFSEBitMask( cx );
+ Log::Line( "Done." );
+
+
+ // Allocate GR Context if --check was specified
+ if( cfg.plotCheckCount > 0 )
+ {
+ if( cfg.gCfg->compressionLevel > 0 )
+ {
+ GreenReaperConfig grCfg{};
+ grCfg.apiVersion = GR_API_VERSION;
+ grCfg.threadCount = 1;
+ grCfg.gpuRequest = GRGpuRequestKind_ExactDevice;
+ grCfg.gpuDeviceIndex = cfg.deviceIndex;
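+            // Bind the decompression (GreenReaper) context to the same CUDA device used for plotting.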
+
+ auto grResult = grCreateContext( &cx.grCheckContext, &grCfg, sizeof( grCfg ) );
+ FatalIf( grResult != GRResult_OK, "Failed to create decompression context for plot check with error '%s' (%d).",
+ grResultToString( grResult ), (int)grResult );
+
+ grResult = grPreallocateForCompressionLevel( cx.grCheckContext, BBCU_K, cfg.gCfg->compressionLevel );
+ FatalIf( grResult != GRResult_OK, "Failed to preallocate memory for decompression context with error '%s' (%d).",
+ grResultToString( grResult ), (int)grResult );
+ }
+
+ PlotCheckerConfig checkerCfg{};
+ checkerCfg.proofCount = cfg.plotCheckCount;
+ checkerCfg.noGpu = false;
+ checkerCfg.gpuIndex = cfg.deviceIndex;
+ checkerCfg.threadCount = 1;
+ checkerCfg.disableCpuAffinity = false;
+ checkerCfg.silent = false;
+ checkerCfg.hasSeed = false;
+ checkerCfg.deletePlots = true;
+ checkerCfg.deleteThreshold = cfg.plotCheckThreshhold;
+ checkerCfg.grContext = cx.grCheckContext;
+
+ cx.plotChecker = PlotChecker::Create( checkerCfg );
+ }
}
//-----------------------------------------------------------
@@ -210,6 +370,8 @@ void CudaK32Plotter::Run( const PlotRequest& req )
cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
if( cx.gCfg->benchmarkMode )
cx.plotWriter->EnableDummyMode();
+ if( cx.plotChecker )
+ cx.plotWriter->EnablePlotChecking( *cx.plotChecker );
FatalIf( !cx.plotWriter->BeginPlot( cfg.gCfg->compressionLevel > 0 ? PlotVersion::v2_0 : PlotVersion::v1_0,
req.outDir, req.plotFileName, req.plotId, req.memo, req.memoSize, cfg.gCfg->compressionLevel ),
@@ -220,19 +382,43 @@ void CudaK32Plotter::Run( const PlotRequest& req )
cx.plotWriter->EndPlot( true );
- // #TODO: Ensure the last plot ended here for now
+ // Ensure the last plot has ended
+ // #TODO: Move it elsewhere, using different buffers for parks
+ // so that we can continue writing to disk until we get to
+ // actually writing the next plot in table 7 finalization.
{
const auto pltoCompleteTimer = TimerBegin();
cx.plotWriter->WaitForPlotToComplete();
const double plotIOTime = TimerEnd( pltoCompleteTimer );
Log::Line( "Completed writing plot in %.2lf seconds", plotIOTime );
- cx.plotWriter->DumpTables();
+ if( !cx.plotChecker || !cx.plotChecker->LastPlotDeleted() )
+ {
+ cx.plotWriter->DumpTables();
+ Log::NewLine();
+ }
}
- Log::Line( "" );
-
+
delete cx.plotWriter;
cx.plotWriter = nullptr;
+
+
+ // Delete any temporary files
+ #if !(DBG_BBCU_KEEP_TEMP_FILES)
+ if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode )
+ {
+ if( cx.diskContext->yBuffer ) delete cx.diskContext->yBuffer;
+ if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer;
+ if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL;
+ if( cx.diskContext->unsortedR ) delete cx.diskContext->unsortedR;
+
+ for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
+ {
+ if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t];
+ if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t];
+ }
+ }
+ #endif
}
//-----------------------------------------------------------
@@ -243,26 +429,51 @@ void MakePlot( CudaK32PlotContext& cx )
memset( cx.tableEntryCounts, 0, sizeof( cx.tableEntryCounts ) );
cx.table = TableId::Table1;
+
const auto plotTimer = TimerBegin();
const auto p1Timer = plotTimer;
#if BBCU_DBG_SKIP_PHASE_1
DbgLoadContextAndPairs( cx );
#else
- // F1
- Log::Line( "Generating F1" );
- const auto timer = TimerBegin();
- GenF1Cuda( cx );
- const auto elapsed = TimerEnd( timer );
- Log::Line( "Finished F1 in %.2lf seconds.", elapsed );
- // Time for FP
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.sortedXPairsOut.AssignDiskBuffer( nullptr );
+ cx.sortedPairsLOut.AssignDiskBuffer( nullptr );
+ cx.sortedPairsROut.AssignDiskBuffer( nullptr );
+
+ if( !cx.plotRequest.isFirstPlot )
+ {
+ for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
+ {
+ if( cx.diskContext->tablesL[(int)t] ) cx.diskContext->tablesL[(int)t]->Swap();
+ if( cx.diskContext->tablesR[(int)t] ) cx.diskContext->tablesR[(int)t]->Swap();
+ }
+
+ }
+ }
+
+ /// Generate F1 entries
+ {
+ Log::Line( "Generating F1" );
+ const auto timer = TimerBegin();
+
+ GenF1Cuda( cx );
+
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Finished F1 in %.2lf seconds.", elapsed );
+ }
+
+ /// Forward-propagate the rest of the tables
for( TableId table = TableId::Table2; table <= TableId::Table7; table++ )
{
cx.table = table;
cx.bucket = 0;
+
FpTable( cx );
}
+
const auto p1Elapsed = TimerEnd( p1Timer );
Log::Line( "Completed Phase 1 in %.2lf seconds", p1Elapsed );
#endif
@@ -294,6 +505,22 @@ void FpTable( CudaK32PlotContext& cx )
cx.prevTablePairOffset = 0;
+ if( cx.cfg.hybrid128Mode )
+ {
+ auto* diskBufferL = cx.diskContext->tablesL[(int)inTable];
+ auto* diskBufferR = cx.diskContext->tablesR[(int)inTable];
+
+ if( inTable == cx.firstStoredTable )
+ {
+ cx.sortedXPairsOut.AssignDiskBuffer( diskBufferL );
+ }
+ else if( inTable > cx.firstStoredTable )
+ {
+ cx.sortedPairsLOut.AssignDiskBuffer( diskBufferL );
+ cx.sortedPairsROut.AssignDiskBuffer( diskBufferR );
+ }
+ }
+
// Clear slice counts
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
@@ -358,10 +585,28 @@ void FpTable( CudaK32PlotContext& cx )
cx.sortedPairsROut.WaitForCompletion();//cx.sortedPairsROut.WaitForCopyCompletion();
cx.sortedPairsROut.Reset();
-
- if( cx.table < TableId::Table7 )
+ if( cx.cfg.hybrid128Mode && inTable >= cx.firstStoredTable )
{
+ if( cx.diskContext->tablesL[(int)inTable] ) cx.diskContext->tablesL[(int)inTable]->Swap();
+ if( cx.diskContext->tablesR[(int)inTable] ) cx.diskContext->tablesR[(int)inTable]->Swap();
+ }
+
+ if( cx.table < TableId::Table7 )
cx.metaOut.WaitForCompletion(); cx.metaOut.Reset();
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ if( cx.cfg.hybrid16Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 )
+ {
+ cx.diskContext->unsortedL->Swap();
+ }
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->yBuffer->Swap();
+ cx.diskContext->metaBuffer->Swap();
+ cx.diskContext->unsortedR->Swap();
+ }
}
cx.yIn .Reset();
@@ -373,23 +618,24 @@ void FpTable( CudaK32PlotContext& cx )
Log::Line( "Table %u completed in %.2lf seconds with %llu entries.",
(uint32)cx.table+1, elapsed, cx.tableEntryCounts[(int)cx.table] );
+ /// DEBUG
#if DBG_BBCU_P1_WRITE_PAIRS
// Write them sorted, so have to wait until table 3 completes
if( cx.table > TableId::Table2 )
DbgWritePairs( cx, cx.table - 1 );
#endif
-
+
if( cx.table == TableId::Table7 )
{
FinalizeTable7( cx );
- #if DBG_BBCU_P1_WRITE_PAIRS
+ // DEBUG
+ #if DBG_BBCU_P1_WRITE_PAIRS
DbgWritePairs( cx, TableId::Table7 );
- #endif
-
+ #endif
#if DBG_BBCU_P1_WRITE_CONTEXT
DbgWriteContext( cx );
- #endif
+ #endif
}
}
@@ -410,8 +656,8 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
cudaStream_t metaStream = cx.computeStream;//B;
cudaStream_t pairsStream = cx.computeStream;//C;
- uint32* sortKeyIn = (uint32*)cx.devMatches;
- uint32* sortKeyOut = cx.devSortKey;
+ uint32* sortKeyIn = (uint32*)cx.devMatches;
+ uint32* sortKeyOut = cx.devSortKey;
if( cx.table > TableId::Table2 )
{
// Generate a sorting key
@@ -447,7 +693,7 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
// Sort and download prev table's pairs
const bool isLTableInlineable = cx.table == TableId::Table2 || (uint32)cx.table <= cx.gCfg->numDroppedTables+1;
-
+
if( !isLTableInlineable )
{
CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventC ) ); // Ensure sort key is ready
@@ -463,35 +709,36 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsIn, sortedPairs, pairsStream );
cx.xPairsIn.ReleaseDeviceBuffer( pairsStream );
- Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)cx.table-1].left) + cx.prevTablePairOffset;
+ Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)inTable].left) + cx.prevTablePairOffset;
// Write sorted pairs back to host
cx.sortedXPairsOut.DownloadT( hostPairs, entryCount, pairsStream, cx.downloadDirect );
}
else
{
- uint32* hostPairsL, *hostPairsLFinal;
- uint16* hostPairsR, *hostPairsRFinal;
+ // uint32* hostPairsL;
+ // uint16* hostPairsR;
// Wait for pairs to complete loading and sort on Y (or do this before match? Giving us time to write to disk while matching?)
uint32* pairsLIn = (uint32*)cx.pairsLIn .GetUploadedDeviceBuffer( pairsStream );
uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream );
CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream );
cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
- hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
- hostPairsLFinal = cx.hostBackPointers[(int)cx.table-1].left + cx.prevTablePairOffset;
+ // hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
+ uint32* hostPairsLFinal = cx.hostBackPointers[(int)inTable].left + cx.prevTablePairOffset;
cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, cx.downloadDirect );
// cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream );
-
+
// if( !isOutputCompressed )
{
uint16* pairsRIn = (uint16*)cx.pairsRIn .GetUploadedDeviceBuffer( pairsStream );
uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream );
CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream );
cx.pairsRIn.ReleaseDeviceBuffer( pairsStream );
- hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
- hostPairsRFinal = cx.hostBackPointers[(int)cx.table-1].right + cx.prevTablePairOffset;
+ // hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
+
+ uint16* hostPairsRFinal = cx.hostBackPointers[(int)inTable].right + cx.prevTablePairOffset;
cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, cx.downloadDirect );
// cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream );
@@ -557,7 +804,7 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
void FinalizeTable7( CudaK32PlotContext& cx )
{
Log::Line( "Finalizing Table 7" );
-
+
const auto timer = TimerBegin();
cx.table = TableId::Table7+1; // Set a false table
@@ -578,19 +825,41 @@ void FinalizeTable7( CudaK32PlotContext& cx )
const size_t c1TableSizeBytes = c1TotalEntries * sizeof( uint32 );
const size_t c2TableSizeBytes = c2TotalEntries * sizeof( uint32 );
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.sortedPairsLOut.AssignDiskBuffer( cx.diskContext->tablesL[(int)TableId::Table7] );
+ cx.sortedPairsROut.AssignDiskBuffer( cx.diskContext->tablesR[(int)TableId::Table7] );
+ }
+
+
+ // Re-use meta GPU downloader to download parks
+ GpuDownloadBuffer& parkDownloader = cx.metaOut;
+
+ // Store disk buffer temporarily, if there is one, since we don't want to write to meta now
+ DiskBufferBase* metaDiskBuffer = parkDownloader.GetDiskBuffer();
+
+ // Reset park buffer chain, if we're using it
+ if( cx.parkContext )
+ {
+ cx.parkContext->parkBufferChain->Reset();
+ parkDownloader.AssignDiskBuffer( nullptr ); // We want direct downloads to the park buffers, which are pinned already
+ }
// Prepare host allocations
constexpr size_t c3ParkSize = CalculateC3Size();
const uint64 totalParkSize = CDivT( tableLength, (uint64)kCheckpoint1Interval ) * c3ParkSize;
- StackAllocator hostAlloc( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 );
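+    // With a park context, table 7 serialization uses its dedicated pinned block
+    // instead of borrowing the meta buffer.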
+ StackAllocator hostAlloc = cx.parkContext
+ ? StackAllocator( cx.parkContext->table7Memory.Ptr(), cx.parkContext->table7Memory.Length() )
+ : StackAllocator( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 );
+
 uint32* hostC1Buffer        = hostAlloc.CAlloc<uint32>( c1TotalEntries );
 uint32* hostC2Buffer        = hostAlloc.CAlloc<uint32>( c2TotalEntries );
 uint32* hostLastParkEntries = hostAlloc.CAlloc<uint32>( kCheckpoint1Interval );
byte* hostLastParkBuffer = (byte*)hostAlloc.CAlloc( kCheckpoint1Interval );
-    byte* hostCompressedParks = hostAlloc.AllocT<byte>( totalParkSize );
-
+    byte* hostCompressedParks = cx.parkContext ? nullptr : hostAlloc.AllocT<byte>( totalParkSize );
+
byte* hostParkWriter = hostCompressedParks;
uint32* hostC1Writer = hostC1Buffer;
@@ -606,8 +875,6 @@ void FinalizeTable7( CudaK32PlotContext& cx )
const size_t parkBufferSize = kCheckpoint1Interval * sizeof( uint32 );
- GpuDownloadBuffer& parkDownloader = cx.metaOut;
-
cudaStream_t mainStream = cx.computeStream;
cudaStream_t metaStream = cx.computeStream;//B;
cudaStream_t pairsStream = cx.computeStream;//C;
@@ -616,7 +883,7 @@ void FinalizeTable7( CudaK32PlotContext& cx )
// Load CTable
 FSE_CTable* devCTable = devAlloc.AllocT<FSE_CTable>( sizeof( CTable_C3 ), sizeof( uint64 ) );
CudaErrCheck( cudaMemcpyAsync( devCTable, CTable_C3, sizeof( CTable_C3 ), cudaMemcpyHostToDevice, cx.computeStream ) );
-
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
// Prepare plot tables
cx.plotWriter->ReserveTableSize( PlotTable::C1, c1TableSizeBytes );
@@ -627,7 +894,6 @@ void FinalizeTable7( CudaK32PlotContext& cx )
uint32 retainedC3EntryCount = 0;
uint32* devYSorted = cx.devYWork + kCheckpoint1Interval;
-
uint32* sortKeyIn = (uint32*)cx.devMatches;
uint32* sortKeyOut = cx.devSortKey;
@@ -732,13 +998,42 @@ void FinalizeTable7( CudaK32PlotContext& cx )
// Download compressed parks to host
const size_t parkDownloadSize = c3ParkSize * parkCount;
+
+ if( cx.parkContext )
+ {
+ ASSERT( parkDownloadSize <= cx.parkContext->parkBufferChain->BufferSize() );
+
+ // Override the park buffer to be used when using a park context
+ hostParkWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+ // Wait for the next park buffer to be available to be used for download
+ parkDownloader.HostCallback([&cx]{
+ (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+ });
+ }
+
+ const bool directOverride = cx.parkContext != nullptr;
+
parkDownloader.DownloadWithCallback( hostParkWriter, parkDownloadSize,
[]( void* parksBuffer, size_t size, void* userData ) {
 auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+
cx.plotWriter->WriteTableData( parksBuffer, size );
- }, &cx, mainStream );
+
+ // Release the buffer after the plot writer is done with it.
+ if( cx.parkContext )
+ {
+ cx.plotWriter->CallBack([&cx](){
+ cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+ });
+ }
+
+ }, &cx, mainStream, directOverride );
hostParkWriter += parkDownloadSize;
+
+ if( cx.parkContext )
+ hostParkWriter = nullptr;
}
// Download c1 entries
@@ -788,8 +1083,6 @@ void FinalizeTable7( CudaK32PlotContext& cx )
// Cleanup
- // cx.sortedPairsLOut.WaitForCopyCompletion();
- // cx.sortedPairsROut.WaitForCopyCompletion();
cx.sortedPairsLOut.WaitForCompletion();
cx.sortedPairsROut.WaitForCompletion();
cx.sortedPairsLOut.Reset();
@@ -797,6 +1090,18 @@ void FinalizeTable7( CudaK32PlotContext& cx )
cx.prevTablePairOffset = 0;
+ // Restore disk buffer on repurposed meta download stream
+ parkDownloader.AssignDiskBuffer( metaDiskBuffer );
+
+ if( cx.cfg.hybrid128Mode )
+ {
+ cx.diskContext->tablesL[(int)TableId::Table7]->Swap();
+ cx.diskContext->tablesR[(int)TableId::Table7]->Swap();
+
+ if( cx.cfg.hybrid16Mode )
+ cx.diskContext->yBuffer->Swap();
+ }
+
auto elapsed = TimerEnd( timer );
Log::Line( "Finalized Table 7 in %.2lf seconds.", elapsed );
}
@@ -834,7 +1139,7 @@ __global__ void CudaCompressTable( const uint32* entryCount, const uint32* inLEn
const uint32 x0 = inLEntries[pair.left ];
const uint32 x1 = inLEntries[pair.right];
- // Convert to linepoint
+ // Convert to linepoint
if constexpr ( UseLP )
outREntries[gid] = (uint32)CudaSquareToLinePoint64( x1 >> bitShift, x0 >> bitShift );
else
@@ -850,7 +1155,7 @@ void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t str
const uint32 kthreads = 256;
const uint32 kblocks = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)kthreads );
-
+
if( isCompressedInput )
{
const bool isFinalTable = cx.table == TableId::Table1 + (TableId)cx.gCfg->numDroppedTables;
@@ -870,7 +1175,7 @@ void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t str
//-----------------------------------------------------------
void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
{
- const bool writeVertical = CudaK32PlotIsOutputInterleaved( cx );
+ const bool writeVertical = CudaK32PlotIsOutputVertical( cx );
const size_t metaMultiplier = GetTableMetaMultiplier( cx.table );
const bool downloadCompressed = cx.table > TableId::Table1 && (uint32)cx.table <= cx.gCfg->numDroppedTables;
@@ -879,8 +1184,8 @@ void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
uint32* hostY = cx.hostY;
uint32* hostMeta = cx.hostMeta;
- uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
- uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+ uint32* hostPairsL = cx.hostTableL;
+ uint16* hostPairsR = cx.hostTableR;
Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left;
const size_t startOffset = cx.bucket * ( writeVertical ? BBCU_MAX_SLICE_ENTRY_COUNT : BBCU_BUCKET_ALLOC_ENTRY_COUNT ); // vertical: offset to starting col. horizontal: to starting row
@@ -896,7 +1201,7 @@ void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
{
const size_t metaSizeMultiplier = metaMultiplier == 3 ? 4 : metaMultiplier;
const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
-
+
const size_t metaSrcStride = srcStride * metaSize;
const size_t metaDstStride = dstStride * sizeof( K32Meta4 );
const size_t metaWidth = width * metaSize;
@@ -927,20 +1232,23 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
const TableId rTable = cx.table;
const TableId inTable = rTable - 1;
- uint32 metaMultiplier = GetTableMetaMultiplier( inTable );
+ const uint32 metaMultiplier = GetTableMetaMultiplier( inTable );
const uint32 inIdx = CudaK32PlotGetInputIndex( cx );
- const bool readVertical = CudaK32PlotIsOutputInterleaved( cx );
+ const bool readVertical = CudaK32PlotIsOutputVertical( cx );
const uint32* hostY = cx.hostY;
const uint32* hostMeta = cx.hostMeta;
- const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
- const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+ const uint32* hostPairsL = cx.hostTableL;
+ const uint16* hostPairsR = cx.hostTableR;
const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2;
const Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left; // Table 2 will use table 5, and overflow onto 6
+ if( cx.cfg.hybrid128Mode )
+ t2HostPairs = (Pair*)hostPairsL;
+
uint32 stride = BBCU_BUCKET_ALLOC_ENTRY_COUNT; // Start as vertical
size_t offset = (size_t)bucket * BBCU_MAX_SLICE_ENTRY_COUNT;
@@ -974,7 +1282,7 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
cx.pairsRIn.UploadArrayT( hostPairsR + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
}
}
-
+
// Meta
if( metaMultiplier > 0 )
{
@@ -982,11 +1290,13 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
auto actualMetaStream = inTable == TableId::Table1 ? cx.computeStream : metaStream;
+
cx.metaIn.UploadArray( hostMeta + offset * 4, BBCU_BUCKET_COUNT, metaSize, stride * sizeof( K32Meta4 ), BBCU_BUCKET_COUNT, counts, actualMetaStream );
}
}
+
///
/// Allocations
///
@@ -1002,13 +1312,19 @@ void AllocBuffers( CudaK32PlotContext& cx )
cx.hostTempAllocSize = 0;
cx.devAllocSize = 0;
+    // If in <= 128G mode or not using direct downloads,
+    // we need to use a separate buffer for downloading parks, instead of re-using existing ones.
+ const bool allocateParkBuffers = cx.downloadDirect || cx.cfg.hybrid128Mode;
+ size_t parksPinnedSize = 0;
+
// Gather the size needed first
{
CudaK32AllocContext acx = {};
acx.alignment = alignment;
acx.dryRun = true;
-
+
DummyAllocator pinnedAllocator;
DummyAllocator hostTableAllocator;
DummyAllocator hostTempAllocator;
@@ -1020,7 +1336,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
acx.devAllocator = &devAllocator;
AllocateP1Buffers( cx, acx );
-
cx.pinnedAllocSize = pinnedAllocator .Size();
cx.hostTableAllocSize = hostTableAllocator.Size();
cx.hostTempAllocSize = hostTempAllocator .Size();
@@ -1033,7 +1348,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
devAllocator = {};
CudaK32PlotPhase2AllocateBuffers( cx, acx );
-
cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1046,15 +1360,23 @@ void AllocBuffers( CudaK32PlotContext& cx )
devAllocator = {};
CudaK32PlotPhase3AllocateBuffers( cx, acx );
-
cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() );
+
+        // May need to allocate extra pinned memory for the park serialization buffers
+ if( allocateParkBuffers )
+ {
+ pinnedAllocator = {};
+ AllocateParkSerializationBuffers( cx, *acx.pinnedAllocator, acx.dryRun );
+ parksPinnedSize = pinnedAllocator.Size();
+ }
}
- size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize;
- size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize;
+
+ const size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
+ const size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize;
Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
(double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
@@ -1070,43 +1392,46 @@ void AllocBuffers( CudaK32PlotContext& cx )
Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
(double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );
- Log::Line( "Allocating buffers" );
// Now actually allocate the buffers
+ Log::Line( "Allocating buffers..." );
CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );
#if _DEBUG
cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize );
#else
- #if !_WIN32
- // if( cx.downloadDirect )
+
+ bool allocateHostTablesPinned = cx.downloadDirect;
+ #if _WIN32
+        // On Windows we always force the use of intermediate pinned buffers, so the host tables are allocated as regular (non-pinned) host memory
+ allocateHostTablesPinned = false;
+ #endif
+
+ // Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
+ if( allocateHostTablesPinned )
CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
- // else
- // {
- // // #TODO: On windows, first check if we have enough shared memory (512G)?
- // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers.
- #else
+ else
cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize );
- #endif
- // }
#endif
- //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-
cx.hostBufferTemp = nullptr;
-#if _DEBUG
- cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize );
-#endif
- if( cx.hostBufferTemp == nullptr )
+ #if _DEBUG || _WIN32
+ if( cx.hostTempAllocSize )
+ cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize );
+ #endif
+
+ if( cx.hostBufferTemp == nullptr && cx.hostTempAllocSize )
CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) );
CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) );
// Warm start
- if( true )
+ if( true )// cx.gCfg->warmStart )
{
- FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer, cx.pinnedAllocSize );
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer , cx.pinnedAllocSize );
FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize );
- FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize );
+
+ if( cx.hostTempAllocSize )
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize );
}
{
@@ -1114,7 +1439,7 @@ void AllocBuffers( CudaK32PlotContext& cx )
acx.alignment = alignment;
acx.dryRun = false;
-
+
StackAllocator pinnedAllocator ( cx.pinnedBuffer , cx.pinnedAllocSize );
StackAllocator hostTableAllocator( cx.hostBufferTables, cx.hostTableAllocSize );
StackAllocator hostTempAllocator ( cx.hostBufferTemp , cx.hostTempAllocSize );
@@ -1137,106 +1462,254 @@ void AllocBuffers( CudaK32PlotContext& cx )
hostTempAllocator .PopToMarker( 0 );
devAllocator .PopToMarker( 0 );
CudaK32PlotPhase3AllocateBuffers( cx, acx );
+
+ if( allocateParkBuffers )
+ {
+ // Fine to leak. App-lifetime buffer
+ void* parksBuffer = nullptr;
+ CudaErrCheck( cudaMallocHost( &parksBuffer, parksPinnedSize, cudaHostAllocDefault ) );
+ StackAllocator parkAllocator( parksBuffer, parksPinnedSize );
+ AllocateParkSerializationBuffers( cx, parkAllocator, acx.dryRun );
+ }
}
}
//-----------------------------------------------------------
void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
- const size_t alignment = acx.alignment;
+ const size_t alignment = acx.alignment;
+ const bool isCompressed = cx.gCfg->compressionLevel > 0;
+ const TableId firstTable = cx.firstStoredTable;
- const bool isCompressed = cx.gCfg->compressionLevel > 0;
+ const FileFlags tmp1FileFlags = cx.cfg.temp1DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
+ const FileFlags tmp2FileFlags = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
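+    // temp1 backs the per-table L/R pair files, while temp2 backs the transient
+    // bucketed buffers (y, metadata and unsorted pairs) used during plotting.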
- // #TODO: Re-optimize usage here again for windows running 256G
/// Host allocations
{
// Temp allocations are pinned host buffers that can be re-used for other means in different phases.
// This is roughly equivalent to temp2 dir during disk plotting.
- cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
- cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
- const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
- cx.hostMarkingTables[0] = nullptr;
- cx.hostMarkingTables[1] = isCompressed ? nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
- cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ if( !cx.cfg.hybrid16Mode )
+ {
+ cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
+ }
+ else if( !cx.diskContext->metaBuffer )
+ {
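+        // With hybrid16Mode enabled the y and metadata buffers are not kept in host RAM;
+        // back them with bucketed disk buffers on the temp2 directory instead.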
+ const size_t ySliceSize = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT;
+ const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT;
-
- // NOTE: The first table has their values inlines into the backpointers of the next table
- cx.hostBackPointers[0] = {};
+ cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::Y_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->yBuffer, "Failed to create y disk buffer." );
- const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
-
- Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
- cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr };
+ cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::META_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." );
+ }
- for( TableId table = firstTable + 1; table <= TableId::Table7; table++ )
- cx.hostBackPointers[(int)table] = { acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ), acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ) };
+ // Marking tables used to prune back pointers
+ {
+ const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
+
+ cx.hostMarkingTables[0] = nullptr;
+ cx.hostMarkingTables[1] = isCompressed ? nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ }
+
+ if( !cx.cfg.hybrid128Mode )
+ {
+        // NOTE: The first table has its values inlined into the backpointers of the next table
+ cx.hostBackPointers[0] = {};
- cx.hostTableL = cx.hostBackPointers[6].left; // Also used for Table 7
- cx.hostTableR = cx.hostBackPointers[6].right;
- cx.hostTableSortedL = cx.hostBackPointers[5].left;
- cx.hostTableSortedR = cx.hostBackPointers[5].right;
+ Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+
+ cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr };
+
+ for( TableId table = firstTable + 1; table <= TableId::Table7; table++ )
+ {
+ cx.hostBackPointers[(int)table] = {
+ acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ),
+ acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment )
+ };
+ }
+
+        // These buffers, belonging to table 7, are re-used to store
+        // the unsorted back-pointers. For this to work, table 7 must be
+        // read horizontally (see CudaK32PlotIsOutputVertical()), so that
+        // when we store the sorted pairs we don't overwrite the
+        // unsorted data from other buckets.
+ cx.hostTableL = cx.hostBackPointers[6].left;
+ cx.hostTableR = cx.hostBackPointers[6].right;
+ }
+ else
+ {
+ char tableName[] = "table_l_000.tmp";
+
+ size_t multiplier = 2; // First table entries are Pair, not uint32s...
+
+ #if BBCU_DBG_SKIP_PHASE_1
+ const FileMode fileMode = FileMode::Open;
+ #else
+ const FileMode fileMode = FileMode::Create;
+ #endif
+
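+        // In hybrid 128G mode the per-table back-pointers are stored in numbered temp1
+        // files rather than host RAM. The first stored table holds inlined Pair entries,
+        // so its L file is twice as large; later tables store split L (uint32) / R (uint16) entries.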
+ for( TableId table = firstTable; table <= TableId::Table7; table++ )
+ {
+ if( cx.diskContext->tablesL[(int)table] == nullptr )
+ {
+ sprintf( tableName, "table_l_%d.tmp", (int32)table+1 );
+ cx.diskContext->tablesL[(int)table] = DiskBuffer::Create(
+ *cx.diskContext->temp1Queue, tableName, BBCU_BUCKET_COUNT, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT * multiplier,
+ fileMode, FileAccess::ReadWrite, tmp1FileFlags );
+
+ FatalIf( !cx.diskContext->tablesL[(int)table], "Failed to create table %d L disk buffer.", (int)table+1 );
+ }
+
+ if( table > firstTable && cx.diskContext->tablesR[(int)table] == nullptr )
+ {
+ sprintf( tableName, "table_r_%d.tmp", (int32)table+1 );
+ cx.diskContext->tablesR[(int)table] = DiskBuffer::Create(
+ *cx.diskContext->temp1Queue, tableName, BBCU_BUCKET_COUNT, sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ fileMode, FileAccess::ReadWrite, tmp1FileFlags );
+
+ FatalIf( !cx.diskContext->tablesR[(int)table], "Failed to create table %d R disk buffer.", (int)table+1 );
+ }
+
+ multiplier = 1;
+ }
+
+        // When storing the unsorted inlined x's we don't have enough space in RAM, so we store them on disk instead.
+ const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair );
+ cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::LPAIRS_DISK_BUFFER_FILE_NAME.data(),
+ BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." );
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp",
+ BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+ FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted R disk buffer." );
+ }
+ else
+ {
+            // In 128G mode we can store the intermediate pairs in host RAM
+ cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ }
+ }
}
/// Device & Pinned allocations
{
- // #NOTE: The R pair is allocated as uint32 because for table 2 we want to download them as inlined x's, so we need 2 uint32 buffers
- /// Device/Pinned allocations
- // cx.yOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- // cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.yOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer ( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
- cx.metaOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
-
- // These download buffers share the same backing buffers
+ GpuStreamDescriptor yDesc{};
+ yDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+ yDesc.sliceCount = BBCU_BUCKET_COUNT;
+ yDesc.sliceAlignment = alignment;
+ yDesc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+ yDesc.deviceAllocator = acx.devAllocator;
+        yDesc.pinnedAllocator  = nullptr;                        // Start in direct mode (no intermediate pinned buffers)
+
+ // In disk-backed mode, we always have pinned buffers,
+ // which are the same buffers used to write and read from disk.
+ GpuStreamDescriptor descTablePairs = yDesc;
+ GpuStreamDescriptor descTableSortedPairs = yDesc;
+ GpuStreamDescriptor descXPairs = yDesc;
+ GpuStreamDescriptor descMeta = yDesc;
+
+ if( cx.cfg.hybrid128Mode )
{
+ // Temp 1 Queue
+ descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+ descTableSortedPairs.sliceAlignment = cx.diskContext->temp1Queue->BlockSize();
+
+ // Temp 2 Queue
+ descXPairs.pinnedAllocator = acx.pinnedAllocator;
+ descXPairs.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ yDesc.pinnedAllocator = acx.pinnedAllocator;
+ yDesc.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+
+ descMeta.pinnedAllocator = acx.pinnedAllocator;
+ descMeta.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+
+ descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+ descTablePairs.sliceAlignment = cx.diskContext->temp2Queue->BlockSize();
+ }
+ }
+
+ if( !cx.downloadDirect )
+ {
+ // Use intermediate pinned buffer for transfers to non-pinned destinations
+ yDesc.pinnedAllocator = acx.pinnedAllocator;
+ descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+ descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+ descXPairs.pinnedAllocator = acx.pinnedAllocator;
+ descMeta.pinnedAllocator = acx.pinnedAllocator;
+ }
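+        // At this point each descriptor is in one of three modes: direct (no pinned
+        // staging buffer), pinned staging (when direct downloads are disabled), or
+        // disk-backed (pinned buffers shared with the temp1/temp2 disk queues).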
+
+
+ ///
+ /// Downloads
+ ///
+ cx.yOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( yDesc, acx.dryRun );
+ cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descMeta, acx.dryRun );
+
+ {
+ // These download buffers share the same backing buffers
const size_t devMarker = acx.devAllocator->Size();
const size_t pinnedMarker = acx.pinnedAllocator->Size();
- cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
- cx.pairsROut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );
+ cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun );
acx.devAllocator->PopToMarker( devMarker );
acx.pinnedAllocator->PopToMarker( pinnedMarker );
// Allocate Pair at the end, to ensure we grab the highest value
- cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
}
- // These download buffers share the same backing buffers
{
+ // These download buffers share the same backing buffers
const size_t devMarker = acx.devAllocator->Size();
const size_t pinnedMarker = acx.pinnedAllocator->Size();
- cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );
+ cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun );
acx.devAllocator->PopToMarker( devMarker );
acx.pinnedAllocator->PopToMarker( pinnedMarker );
// Allocate Pair at the end, to ensure we grab the highest value
- cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun );
}
- cx.yIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ ///
+ /// Uploads
+ ///
+ cx.yIn = cx.gpuUploadStream[0]->CreateUploadBufferT( yDesc, acx.dryRun );
+ cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descMeta, acx.dryRun );
// These uploaded buffers share the same backing buffers
{
const size_t devMarker = acx.devAllocator->Size();
const size_t pinnedMarker = acx.pinnedAllocator->Size();
- cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
- cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );
+ cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun );
acx.devAllocator->PopToMarker( devMarker );
acx.pinnedAllocator->PopToMarker( pinnedMarker );
// Allocate Pair at the end, to ensure we grab the highest value
- cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descXPairs, acx.dryRun );
}
/// Device-only allocations
@@ -1268,9 +1741,56 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
cx.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, alignment );
cx.hostBucketSlices = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, alignment );
}
+
+ /// In disk-backed mode, assign disk buffers to gpu buffers
+ if( cx.cfg.hybrid128Mode && !acx.dryRun )
+ {
+ cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+ cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+ if( cx.cfg.hybrid16Mode )
+ {
+ cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+ cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+ cx.pairsROut.AssignDiskBuffer( cx.diskContext->unsortedR );
+ cx.pairsRIn .AssignDiskBuffer( cx.diskContext->unsortedR );
+
+ cx.yOut.AssignDiskBuffer( cx.diskContext->yBuffer );
+ cx.yIn .AssignDiskBuffer( cx.diskContext->yBuffer );
+
+ cx.metaOut.AssignDiskBuffer( cx.diskContext->metaBuffer );
+ cx.metaIn .AssignDiskBuffer( cx.diskContext->metaBuffer );
+ }
+ }
+}
+
+//-----------------------------------------------------------
+void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinnedAllocator, bool dryRun )
+{
+ ASSERT( cx.parkContext );
+
+ auto& pc = *cx.parkContext;
+ pc.maxParkBuffers = 3;
+
+ // Get the largest park size
+ const size_t maxParkSize = cx.cfg.gCfg->compressionLevel == 0 ?
+ CalculateParkSize( TableId::Table1 ) :
+ GetLargestCompressedParkSize();
+
+ const size_t parksPerBuffer = CDivT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+ // CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kCheckpoint1Interval ) + 1; // Need an extra park for left-over entries
+ const size_t bucketParkBufferSize = parksPerBuffer * maxParkSize;
+ const size_t alignment = 4096;
+
+ // Allocate some extra space for C tables (see FinalizeTable7)
+ pc.hostRetainedLinePoints = pinnedAllocator.CAlloc( kEntriesPerPark );
+ pc.table7Memory = pinnedAllocator.CAllocSpan( 8 MiB, alignment );
+ pc.parkBufferChain = BufferChain::Create( pinnedAllocator, pc.maxParkBuffers, bucketParkBufferSize, alignment, dryRun );
}
+
///
/// Debug
///
@@ -1278,6 +1798,9 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
void DbgWritePairs( CudaK32PlotContext& cx, const TableId table )
{
+ if( cx.cfg.hybrid128Mode )
+ return;
+
const TableId earliestTable = TableId::Table1 + (TableId)cx.gCfg->numDroppedTables+1;
if( table < earliestTable )
return;
@@ -1332,7 +1855,7 @@ void DbgWriteContext( CudaK32PlotContext& cx )
Log::Line( "[DEBUG] Writing context file." );
FileStream contxetFile;
sprintf( path, "%scontext.tmp", DBG_BBCU_DBG_DIR );
- FatalIf( !contxetFile.Open( path, FileMode::Create, FileAccess::Write ), "Failed to open context file." );
+ FatalIf( !contxetFile.Open( path, FileMode::Create, FileAccess::Write ), "Failed to open context file at '%s'.", path );
FatalIf( contxetFile.Write( &cx, sizeof( CudaK32PlotContext ) ) != (ssize_t)sizeof( CudaK32PlotContext ), "Failed to write context data." );
contxetFile.Close();
@@ -1360,7 +1883,7 @@ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables )
memcpy( cx.bucketSlices, tmpCx.bucketSlices, sizeof( tmpCx.bucketSlices ) );
memcpy( cx.tableEntryCounts, tmpCx.tableEntryCounts, sizeof( tmpCx.tableEntryCounts ) );
}
-
+
if( !loadTables )
return;
@@ -1384,8 +1907,11 @@ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables )
}
}
-void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyToPinnedBuffer )
+void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool useDiskHybridData )
{
+ if( cx.cfg.hybrid128Mode )
+ return;
+
char lPath[512];
char rPath[512];
@@ -1393,57 +1919,227 @@ void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyTo
if( table < earliestTable )
return;
- // for( TableId table = TableId::Table2; table <= TableId::Table7; table++ )
+ const uint64 entryCount = cx.tableEntryCounts[(int)table];
+ Pairs& pairs = cx.hostBackPointers[(int)table];
+
{
Log::Line( "[DEBUG] Loading table %d", (int)table + 1 );
sprintf( lPath, "%st%d.l.tmp", DBG_BBCU_DBG_DIR, (int)table + 1 );
sprintf( rPath, "%st%d.r.tmp", DBG_BBCU_DBG_DIR, (int)table + 1 );
- const uint64 entryCount = cx.tableEntryCounts[(int)table];
// cx.hostBackPointers[(int)table].left = bbcvirtallocbounded( entryCount );
// cx.hostBackPointers[(int)table].right = bbcvirtallocbounded( entryCount );
- Pairs& pairs = cx.hostBackPointers[(int)table];
int err;
- if( table == earliestTable )
+ static DiskQueue* diskQueue = nullptr;
+
+ // Load disk-hybrid tables
+        // #NOTE: Enable this (and disable the block below it) to load tables from
+ // the disk-hybrid output. Also adjust path in the DiskQueue below.
+
+ // useDiskHybridData = true;
+ if( useDiskHybridData )
{
- FatalIf( !IOJob::ReadFromFile( lPath, pairs.left, entryCount * sizeof( Pair ), err ), "Failed to read table X pairs: %d", err );
+ if( diskQueue == nullptr )
+ diskQueue = new DiskQueue( "/home/harold/plotdisk" );
+
+ char lname[64] = {};
+ sprintf( lname, "table_l_%d.tmp", (int)table + 1 );
+
+ if( table == earliestTable )
+ {
+ DiskBuffer* buf = DiskBuffer::Create( *diskQueue, lname, BBCU_BUCKET_COUNT, sizeof( Pair ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering );
+ PanicIf( !buf, "No table file" );
+
+ VirtualAllocator valloc;
+ buf->ReserveBuffers( valloc );
+
+ Span pairsWriter( (Pair*)pairs.left, BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ buf->ReadNextBucket();
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const size_t bucketLength = cx.bucketCounts[(int)table][bucket];
+
+ buf->TryReadNextBucket();
+ auto entries = buf->GetNextReadBufferAs().SliceSize( bucketLength );
+
+ entries.CopyTo( pairsWriter );
+ pairsWriter = pairsWriter.Slice( entries.Length() );
+ }
+
+ delete buf;
+ }
+ else
+ {
+ char rname[64] = {};
+ sprintf( rname, "table_r_%d.tmp", (int)table + 1 );
+
+ DiskBuffer* lBuf = DiskBuffer::Create( *diskQueue, lname, BBCU_BUCKET_COUNT, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering );
+ DiskBuffer* rBuf = DiskBuffer::Create( *diskQueue, rname, BBCU_BUCKET_COUNT, sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT,
+ FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering );
+ PanicIf( !lBuf, "No table L file" );
+ PanicIf( !rBuf, "No table R file" );
+
+ VirtualAllocator valloc;
+ lBuf->ReserveBuffers( valloc );
+ rBuf->ReserveBuffers( valloc );
+
+ Span lWriter( pairs.left , BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ Span rWriter( pairs.right, BBCU_TABLE_ALLOC_ENTRY_COUNT );
+
+ lBuf->ReadNextBucket();
+ rBuf->ReadNextBucket();
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const size_t bucketLength = cx.bucketCounts[(int)table][bucket];
+
+ lBuf->TryReadNextBucket();
+ rBuf->TryReadNextBucket();
+
+ auto lEntries = lBuf->GetNextReadBufferAs().SliceSize( bucketLength );
+ lEntries.CopyTo( lWriter );
+
+ auto rEntries = rBuf->GetNextReadBufferAs().SliceSize( bucketLength );
+ rEntries.CopyTo( rWriter );
+
+ lWriter = lWriter.Slice( lEntries.Length() );
+ rWriter = rWriter.Slice( rEntries.Length() );
+ }
+
+ delete lBuf;
+ delete rBuf;
+ }
}
else
{
- FatalIf( !IOJob::ReadFromFile( lPath, pairs.left , entryCount * sizeof( uint32 ), err ), "Failed to read table L pairs: %d", err );
-
- // if( (uint32)table > cx.gCfg->numDroppedTables )
- FatalIf( !IOJob::ReadFromFile( rPath, pairs.right, entryCount * sizeof( uint16 ), err ), "Failed to read table R pairs: %d", err );
- }
-
- // We expect table 7 to also be found in these buffers, so copy it
- // if( table == TableId::Table7 )
- if( copyToPinnedBuffer )
- {
- bbmemcpy_t( cx.hostTableSortedL, pairs.left , entryCount );
- bbmemcpy_t( cx.hostTableSortedR, pairs.right, entryCount );
+ if( table == earliestTable )
+ {
+ FatalIf( !IOJob::ReadFromFile( lPath, pairs.left, entryCount * sizeof( Pair ), err ), "Failed to read table X pairs: %d", err );
+ }
+ else
+ {
+ FatalIf( !IOJob::ReadFromFile( lPath, pairs.left , entryCount * sizeof( uint32 ), err ), "Failed to read table L pairs: %d", err );
+
+ // if( (uint32)table > cx.gCfg->numDroppedTables )
+ FatalIf( !IOJob::ReadFromFile( rPath, pairs.right, entryCount * sizeof( uint16 ), err ), "Failed to read table R pairs: %d", err );
+ }
}
}
+
+ // if( table == earliestTable && !useDiskHybridData )
+ // {
+ // uint64* tmpBucket = bbcvirtallocboundednuma( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+
+ // std::vector hashesRam{};
+ // std::vector hashesDisk{};
+
+ // byte hash[32];
+ // char hashstr[sizeof(hash)*2+1] = {};
+
+ // for( uint32 run = 0; run < 2; run++ )
+ // {
+ // auto& hashes = run == 0 ? hashesRam : hashesDisk;
+
+ // uint64* xs = (uint64*)pairs.left;
+
+ // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ )
+ // {
+ // const uint64 bucketEntryCount = cx.bucketCounts[(int)table][b];
+
+ // RadixSort256::Sort( DbgGetThreadPool( cx ), xs, tmpBucket, bucketEntryCount );
+
+ // // Hash
+ // {
+ // blake3_hasher hasher;
+ // blake3_hasher_init( &hasher );
+ // blake3_hasher_update( &hasher, xs, bucketEntryCount * sizeof( uint64 ) );
+ // blake3_hasher_finalize( &hasher, hash, sizeof( hash ) );
+
+ // size_t _;
+ // BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ );
+ // Log::Line( "[%3u] : 0x%s", b, hashstr );
+
+ // hashes.push_back( hashstr );
+
+ // // DbgPrintHash( " :", xs, sizeof( uint64 ) * bucketEntryCount );
+ // }
+
+ // xs += bucketEntryCount;
+ // }
+
+ // if( run == 0 )
+ // {
+ // DbgLoadTablePairs( cx, table, true );
+ // }
+ // }
+
+ // // Compare hashes
+ // {
+ // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ )
+ // {
+ // if( hashesRam[b] != hashesDisk[b] )
+ // {
+ // Panic( "Hash mismatch at bucket %u. %s != %s", b, hashesRam[b].c_str(), hashesDisk[b].c_str() );
+ // }
+ // }
+ // Log::Line( "All hashes match!" );
+ // }
+
+
+ // // DbgPrintHash( "Inlined X Table", cx.hostBackPointers[(int)table].left, sizeof( Pair ) * cx.tableEntryCounts[(int)table] );
+ // Log::Line( "" );
+ // bbvirtfreebounded( tmpBucket );
+ // Exit( 0 );
+ // }
+ // else
+ // {
+ // // DbgPrintHash( "L Table", cx.hostBackPointers[(int)table].left, sizeof( uint32 ) * cx.tableEntryCounts[(int)table] );
+ // // DbgPrintHash( "R Table", cx.hostBackPointers[(int)table].right, sizeof( uint16 ) * cx.tableEntryCounts[(int)table] );
+ // // Log::Line( "" );
+ // }
+
+ // Sort inlined xs
+ // if( table == earliestTable )
+ // {
+ // uint64* tmpBucket = bbcvirtallocboundednuma( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ // uint64* xs = (uint64*)pairs.left;
+
+ // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ )
+ // {
+ // const uint64 bucketEntryCount = cx.bucketCounts[(int)table][b];
+ // RadixSort256::Sort( DbgGetThreadPool( cx ), xs, tmpBucket, bucketEntryCount );
+ // xs += bucketEntryCount;
+ // }
+
+ // DbgPrintHash( "pre_sorted_xs", pairs.left, sizeof( uint64 ) * entryCount );
+ // }
+
Log::Line( "[DEBUG] Done." );
}
-
void DbgLoadMarks( CudaK32PlotContext& cx )
{
char path[512];
+ std::string baseUrl = DBG_BBCU_DBG_DIR;
+ if( cx.cfg.hybrid128Mode )
+ baseUrl += "disk/";
+
// const size_t tableSize = ((1ull << BBCU_K) / 64) * sizeof(uint64);
Log::Line( "[DEBUG] Loadinging marking tables" );
- const TableId startTable = TableId::Table2 + cx.gCfg->numDroppedTables;
+ const TableId startTable = cx.firstStoredTable;
for( TableId table = startTable; table < TableId::Table7; table++ )
{
- sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
+ sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 );
int err = 0;
cx.hostMarkingTables[(int)table] = (uint64*)IOJob::ReadAllBytesDirect( path, err );
diff --git a/cuda/CudaPlotter.h b/cuda/CudaPlotter.h
index ebe30f67..ddcbfed2 100644
--- a/cuda/CudaPlotter.h
+++ b/cuda/CudaPlotter.h
@@ -9,10 +9,22 @@ struct CudaK32PlotConfig
{
const GlobalPlotConfig* gCfg = nullptr;
- uint32 deviceIndex = 0; // Which CUDA device to use when plotting//
- bool disableDirectDownloads = false; // Don't allocate host tables using pinned buffers, instead
- // download to intermediate pinned buffers then copy to the final host buffer.
- // May be necessarry on Windows because of shared memory limitations (usual 50% of system memory)
+    uint32  deviceIndex            = 0;        // Which CUDA device to use when plotting
+    bool    disableDirectDownloads = false;    // Don't allocate host tables using pinned buffers; instead,
+                                               // download to intermediate pinned buffers, then copy to the final host buffer.
+                                               // May be necessary on Windows because of shared memory limitations (usually 50% of system memory).
+
+ bool hybrid128Mode = false; // Enable hybrid disk-offload w/ 128G of RAM.
+ bool hybrid16Mode = false; // Enable hybrid disk-offload w/ 64G of RAM.
+
+ const char* temp1Path = nullptr; // For 128G RAM mode
+ const char* temp2Path = nullptr; // For 64G RAM mode
+
+ bool temp1DirectIO = true; // Use direct I/O for temp1 files
+ bool temp2DirectIO = true; // Use direct I/O for temp2 files
+
+    uint64  plotCheckCount      = 0;           // Number of proofs to check with the plot check command after plotting
+    double  plotCheckThreshhold = 0.6;         // Proof/check threshold below which plots will be deleted
};
class CudaK32Plotter : public IPlotter
@@ -28,4 +40,6 @@ class CudaK32Plotter : public IPlotter
private:
CudaK32PlotConfig _cfg = {};
struct CudaK32PlotContext* _cx = nullptr;;
-};
\ No newline at end of file
+};
+
+void CudaK32PlotterPrintHelp();
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
new file mode 100644
index 00000000..3d06973c
--- /dev/null
+++ b/cuda/GpuDownloadStream.cu
@@ -0,0 +1,385 @@
+#include "GpuStreams.h"
+#include "GpuQueue.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
+
+
+///
+/// DownloadBuffer
+///
+void* GpuDownloadBuffer::GetDeviceBuffer()
+{
+ const uint32 index = self->outgoingSequence % self->bufferCount;
+
+ CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
+
+ return self->deviceBuffer[index];
+}
+
+void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream )
+{
+ ASSERT( self->lockSequence >= self->outgoingSequence );
+ ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount );
+
+ const uint32 index = self->lockSequence % self->bufferCount;
+ self->lockSequence++;
+
+ // Wait for the device buffer to be free to be used by kernels
+ CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
+ return self->deviceBuffer[index];
+}
+
+void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size )
+{
+ Download2D( hostBuffer, size, 1, size, size );
+}
+
+void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride )
+{
+ Download2D( hostBuffer, size, 1, size, size, workStream, directOverride );
+}
+
+void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
+{
+ Panic( "Unavailable" );
+ // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
+ // ASSERT( hostBuffer );
+ // ASSERT( workStream );
+ // ASSERT( self->lockSequence > 0 );
+ // ASSERT( self->outgoingSequence < self->lockSequence );
+ // ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
+
+ // auto& cpy = self->copies[self->outgoingSequence];
+ // cpy.self = self;
+ // cpy.sequence = self->outgoingSequence;
+ // cpy.copy.hostBuffer = finalBuffer;
+ // cpy.copy.srcBuffer = hostBuffer;
+ // cpy.copy.size = size;
+
+
+ // const uint32 index = self->outgoingSequence % self->bufferCount;
+ // self->outgoingSequence++;
+
+ // void* pinnedBuffer = self->pinnedBuffer[index];
+ // const void* devBuffer = self->deviceBuffer[index];
+
+ // // Signal from the work stream when it has finished doing kernel work with the device buffer
+ // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
+
+
+ // // Ensure the work stream has completed writing data to the device buffer
+ // cudaStream_t stream = self->queue->_stream;
+
+ // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
+
+ // // Copy
+ // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) );
+
+ // // Signal that the device buffer is free to be re-used
+ // CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+
+ // // Launch copy command
+ // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+
+ // const CopyInfo& c = *reinterpret_cast( userData );
+ // IGpuBuffer* self = c.self;
+
+ // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
+ // cmd.copy.info = &c;
+
+ // self->queue->SubmitCommands();
+
+ // // Signal the download completed
+ // self->fence.Signal( ++self->completedSequence );
+ // }, &cpy ) );
+}
+
+void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
+{
+ Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride );
+}
+
+void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, cudaStream_t workStream, bool directOverride )
+{
+ Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride );
+}
+
+void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
+ GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
+{
+ PerformDownload2D( hostBuffer, width, height, dstStride, srcStride,
+ callback, userData,
+ workStream, directOverride );
+}
+
+void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
+ GpuDownloadCallback postCallback, void* postUserData,
+ cudaStream_t workStream, bool directOverride )
+{
+ PanicIf( !(hostBuffer || self->pinnedBuffer[0] ), "" );
+ ASSERT( workStream );
+ ASSERT( self->lockSequence > 0 );
+ ASSERT( self->outgoingSequence < self->lockSequence );
+ ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
+
+ const uint32 index = self->outgoingSequence++ % self->bufferCount;
+
+ void* pinnedBuffer = self->pinnedBuffer[index];
+ void* finalHostBuffer = hostBuffer;
+ const void* devBuffer = self->deviceBuffer[index];
+
+    const bool   isDirect         = (directOverride || self->pinnedBuffer[0] == nullptr) && !self->diskBuffer;
+    ASSERT( isDirect || self->pinnedBuffer[0] );
+ const bool isSequentialCopy = dstStride == srcStride;
+ const size_t totalSize = height * width;
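+
+    // Direct mode copies straight from the device into the caller's host buffer;
+    // otherwise the transfer is staged through our pinned buffer (and, when a disk
+    // buffer is assigned, written out to disk from there).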
+
+
+ // Signal from the work stream when it has finished doing kernel work with the device buffer
+ CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) );
+
+ // From the download stream, wait for the work stream to finish
+ cudaStream_t downloadStream = self->queue->_stream;
+ CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) );
+
+
+ if( self->diskBuffer )
+ {
+ // Wait until the next disk buffer is ready for use.
+ // This also signals that the pinned buffer is ready for re-use
+ CallHostFunctionOnStream( downloadStream, [this](){
+ self->diskBuffer->GetNextWriteBuffer();
+ });
+
+ pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 );
+ }
+
+ if( !isDirect )
+ {
+ // Ensure that the pinned buffer is ready for use
+ // (we signal pinned buffers are ready when using disks without events)
+ if( !self->diskBuffer )
+ CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->pinnedEvent[index] ) );
+
+ // Set host buffer as the pinned buffer
+ hostBuffer = pinnedBuffer;
+ }
+
+
+ // Copy from device to host buffer
+ // #NOTE: Since the pinned buffer is simply the same size (a full bucket) as the device buffer
+ // we also always copy as 1D if we're copying to our pinned buffer.
+ ASSERT( hostBuffer );
+ if( isSequentialCopy || hostBuffer == pinnedBuffer )
+ CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, downloadStream ) );
+ else
+ CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, downloadStream ) );
+
+ // Dispatch a host callback if one was set
+ if( postCallback )
+ {
+ CallHostFunctionOnStream( downloadStream, [=](){
+ (*postCallback)( finalHostBuffer, totalSize, postUserData );
+ });
+ }
+
+
+ // Signal that the device buffer is free to be re-used
+ CudaErrCheck( cudaEventRecord( self->deviceEvents[index], downloadStream ) );
+
+ if( self->diskBuffer )
+ {
+ // If it's a disk-based copy, then write the pinned buffer to disk
+ CallHostFunctionOnStream( downloadStream, [=]() {
+
+ auto* diskBucketBuffer = dynamic_cast( self->diskBuffer );
+ if( diskBucketBuffer != nullptr )
+ diskBucketBuffer->Submit( srcStride );
+ else
+ static_cast( self->diskBuffer )->Submit( totalSize );
+ });
+
+ // #NOTE: We don't need to signal that the pinned buffer is ready for re-use here as
+ // we do that implicitly with DiskBuffer::GetNextWriteBuffer (see above).
+ }
+ else if( !isDirect )
+ {
+ // #TODO: Do this in a different host copy stream, and signal from there.
+ // #MAYBE: Perhaps use multiple host threads/streams to do host-to-host copies.
+        //         For now we do it on the same download stream, which blocks it
+        //         unless other buffers use their own download streams.
+
+
+ ASSERT( hostBuffer == pinnedBuffer );
+ if( isSequentialCopy )
+ CudaErrCheck( cudaMemcpyAsync( finalHostBuffer, hostBuffer, totalSize, cudaMemcpyHostToHost, downloadStream ) );
+ else
+ CudaErrCheck( cudaMemcpy2DAsync( finalHostBuffer, dstStride, hostBuffer, srcStride, width, height, cudaMemcpyHostToHost, downloadStream ) );
+
+ // Signal the pinned buffer is free to be re-used
+ CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], downloadStream ) );
+ }
+}
+
+void GpuDownloadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function func )
+{
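+    // Copy the callable to the heap; the CUDA host-function callback frees it after running.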
+ auto* fnCpy = new std::function( std::move( func ) );
+ CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {
+
+ auto& fn = *reinterpret_cast*>( userData );
+ fn();
+ delete& fn;
+
+ }, fnCpy ) );
+}
+
+void GpuDownloadBuffer::HostCallback( std::function func )
+{
+ CallHostFunctionOnStream( self->queue->GetStream(), func );
+}
+
+void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
+ uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData )
+{
+ ASSERT( width );
+ ASSERT( height );
+ ASSERT( hostBuffer );
+
+ const uint32 index = self->outgoingSequence % self->bufferCount;
+
+ // We need to block until the pinned buffer is available.
+ if( self->outgoingSequence > self->bufferCount-1 )
+ self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
+
+ void* pinnedBuffer = self->pinnedBuffer[index];
+ const void* devBuffer = self->deviceBuffer[index];
+
+ //auto& cmd = self->commands[index];
+ //cmd.type = GpuQueue::CommandType::Copy2D;
+ //cmd.sequenceId = self->outgoingSequence++;
+ //cmd.finishedSignal = &self->fence;
+ //cmd.dstBuffer = hostBuffer;
+ //cmd.srcBuffer = pinnedBuffer;
+ //cmd.copy2d.width = width;
+ //cmd.copy2d.height = height;
+ //cmd.copy2d.dstStride = dstStride;
+ //cmd.copy2d.srcStride = srcStride;
+ //cmd.copy2d.callback = callback;
+ //cmd.copy2d.userData = userData;
+
+ outIndex = index;
+ outPinnedBuffer = pinnedBuffer;
+ outDevBuffer = devBuffer;
+}
+
+
+void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize )
+{
+ ASSERT( length );
+ ASSERT( elementSize );
+ ASSERT( counts );
+
+ uint32 totalElements = 0;
+ for( uint32 i = 0; i < length; i++ )
+ totalElements += counts[i];
+
+ const size_t totalSize = (size_t)totalElements * elementSize;
+
+ uint32 index;
+ void* pinnedBuffer;
+ const void* devBuffer;
+ GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer );
+
+
+ srcStride *= elementSize;
+
+ byte* dst = (byte*)pinnedBuffer;
+ const byte* src = (byte*)devBuffer;
+
+ cudaStream_t stream = self->queue->_stream;
+
+ // Copy all buffers from device to pinned buffer
+ for( uint32 i = 0; i < length; i++ )
+ {
+ const size_t copySize = counts[i] * (size_t)elementSize;
+
+ // #TODO: Determine if there's a cuda (jagged) array copy
+ CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) );
+
+ src += srcStride;
+ dst += copySize;
+ }
+
+ // Signal that the device buffer is free
+ CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+
+ // Submit command to do the final copy from pinned to host
+ CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) );
+}
+
+void GpuDownloadBuffer::WaitForCompletion()
+{
+ if( self->outgoingSequence > 0 )
+ {
+ //const uint32 index = (self->outgoingSequence - 1) % self->bufferCount;
+
+ // cudaEvent_t event = self->completedEvents[index];
+ //const cudaError_t r = cudaEventQuery( event );
+
+ //if( r == cudaSuccess )
+ // return;
+
+ //if( r != cudaErrorNotReady )
+ // CudaErrCheck( r );
+
+ //CudaErrCheck( cudaEventSynchronize( event ) );
+
+
+ cudaStream_t downloadStream = self->queue->_stream;
+ // this->self->fence.Reset( 0 );
+ CallHostFunctionOnStream( downloadStream, [this](){
+ this->self->fence.Signal( this->self->outgoingSequence );
+ });
+ self->fence.Wait( self->outgoingSequence );
+
+ }
+}
+
+void GpuDownloadBuffer::WaitForCopyCompletion()
+{
+ if( self->outgoingSequence > 0 )
+ {
+ self->copyFence.Wait( self->outgoingSequence );
+ }
+}
+
+void GpuDownloadBuffer::Reset()
+{
+ self->lockSequence = 0;
+ self->outgoingSequence = 0;
+ self->completedSequence = 0;
+ self->copySequence = 0;
+ self->fence.Reset( 0 );
+ self->copyFence.Reset( 0 );
+}
+
+GpuQueue* GpuDownloadBuffer::GetQueue() const
+{
+ return self->queue;
+}
+
+void GpuDownloadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer )
+{
+ // ASSERT( self->pinnedBuffer[0] );
+
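+    // The pinned staging buffers double as the disk buffer's write buffers, so detach
+    // them from any previously assigned disk buffer before handing them to the new one.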
+ void* nullBuffers[2] = { nullptr, nullptr };
+ if( self->diskBuffer )
+ self->diskBuffer->AssignWriteBuffers( nullBuffers );
+
+ self->diskBuffer = diskBuffer;
+ if( self->diskBuffer )
+ self->diskBuffer->AssignWriteBuffers( self->pinnedBuffer );
+}
+
+DiskBufferBase* GpuDownloadBuffer::GetDiskBuffer() const
+{
+ return self->diskBuffer;
+}
diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu
new file mode 100644
index 00000000..399a0fbf
--- /dev/null
+++ b/cuda/GpuQueue.cu
@@ -0,0 +1,432 @@
+#include "GpuQueue.h"
+#include "util/IAllocator.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
+
+///
+/// Shared GpuStream Interface
+///
+GpuQueue::GpuQueue( Kind kind ) : _kind( kind )
+ , _bufferReadySignal( BBCU_BUCKET_COUNT )
+{
+ CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) );
+
+ _queueThread.Run( QueueThreadEntryPoint, this );
+}
+
+GpuQueue::~GpuQueue()
+{
+ _exitQueueThread.store( true, std::memory_order_release );
+ _bufferReadySignal.Release();
+ _waitForExitSignal.Wait();
+
+
+ if( _stream ) cudaStreamDestroy( _stream );
+ if( _preloadStream ) cudaStreamDestroy( _preloadStream );
+ if( _callbackStream ) cudaStreamDestroy( _callbackStream );
+
+ _stream = nullptr;
+ _preloadStream = nullptr;
+ _callbackStream = nullptr;
+}
+
+GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue." );
+ GpuDownloadBuffer r = { CreateGpuBuffer( desc, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuDownloadBuffer GpuQueue::CreateDirectDownloadBuffer( const size_t size, IAllocator& devAllocator, const size_t alignment, const bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
+
+ ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one.
+ GpuStreamDescriptor desc{};
+ desc.entrySize = 1;
+ desc.entriesPerSlice = 1;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = alignment;
+ desc.bufferCount = 2;
+ desc.deviceAllocator = &devAllocator;
+ desc.pinnedAllocator = nullptr;
+
+ return CreateDownloadBuffer( desc, dryRun );
+}
+
+GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
+ GpuDownloadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, const uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
+
+ ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one.
+ GpuStreamDescriptor desc{};
+ desc.entrySize = 1;
+ desc.entriesPerSlice = 1;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = alignment;
+ desc.bufferCount = bufferCount;
+ desc.deviceAllocator = &devAllocator;
+ desc.pinnedAllocator = &pinnedAllocator;
+
+ GpuDownloadBuffer r = { CreateGpuBuffer( desc, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuUploadBuffer GpuQueue::CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ Panic( "Deprecated" );
+    FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on a DownloadQueue" );
+
+ GpuUploadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+GpuUploadBuffer GpuQueue::CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun )
+{
+    FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on a DownloadQueue." );
+
+ GpuUploadBuffer r = { CreateGpuBuffer( desc, dryRun ) };
+
+ if( !dryRun )
+ r.Reset();
+
+ return r;
+}
+
+
+
+struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+{
+ Panic( "Deprecated" );
+ // ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one.
+ GpuStreamDescriptor desc{};
+ desc.entrySize = 1;
+ desc.entriesPerSlice = size;
+ desc.sliceCount = BBCU_BUCKET_COUNT;
+ desc.sliceAlignment = alignment;
+ desc.bufferCount = 2;
+ desc.deviceAllocator = &devAllocator;
+ desc.pinnedAllocator = &pinnedAllocator;
+
+ return CreateGpuBuffer( desc, dryRun );
+}
+
+struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun )
+{
+ PanicIf( desc.bufferCount > BBCU_GPU_BUFFER_MAX_COUNT || !desc.bufferCount, "Invalid GPUBuffer buffer count." );
+ PanicIf( !desc.deviceAllocator, "Null device allocator." );
+ PanicIf( !desc.entrySize, "Invalid entry size." );
+ PanicIf( !desc.entriesPerSlice, "Invalid entries per slice." );
+ PanicIf( !desc.sliceCount || desc.sliceCount > BBCU_BUCKET_COUNT, "Invalid slice count." );
+ PanicIf( !desc.sliceAlignment, "Invalid slice alignment." );
+ PanicIf( desc.diskQueue && (!desc.diskFileName || !*desc.diskFileName), "Invalid disk offload config." );
+ PanicIf( desc.diskQueue && !desc.pinnedAllocator, "A pinned allocator must be set in disk offload mode." );
+
+ const size_t allocSize = CalculateBufferSizeFromDescriptor( desc );
+
+ void* devBuffers [BBCU_GPU_BUFFER_MAX_COUNT] = {};
+ void* pinnedBuffers[BBCU_GPU_BUFFER_MAX_COUNT] = {};
+
+ for( int32 i = 0; i < desc.bufferCount; i++ )
+ {
+ devBuffers[i] = desc.deviceAllocator->Alloc( allocSize, desc.sliceAlignment );
+
+ if( desc.pinnedAllocator )
+ pinnedBuffers[i] = desc.pinnedAllocator->Alloc( allocSize, desc.sliceAlignment );
+ }
+
+ struct IGpuBuffer* buf = nullptr;
+
+ if( !dryRun )
+ {
+ buf = new IGpuBuffer{};
+
+ for( int32 i = 0; i < desc.bufferCount; i++ )
+ {
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) );
+ // CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) );
+
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) );
+ CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) );
+
+ buf->deviceBuffer[i] = devBuffers[i];
+ buf->pinnedBuffer[i] = pinnedBuffers[i];
+ }
+
+ buf->size = allocSize;
+ buf->bufferCount = desc.bufferCount;
+ buf->queue = this;
+ }
+
+ // Disk offload mode?
+ if( desc.diskQueue )
+ {
+ const size_t sliceSize = CalculateSliceSizeFromDescriptor( desc );
+
+ if( !dryRun )
+ {
+ if( desc.bucketedDiskBuffer )
+ {
+ buf->diskBuffer = DiskBucketBuffer::Create(
+ *desc.diskQueue, desc.diskFileName,
+ desc.sliceCount, sliceSize,
+ FileMode::Create, FileAccess::ReadWrite,
+ desc.directIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::None );
+ }
+ else
+ {
+ buf->diskBuffer = DiskBuffer::Create(
+ *desc.diskQueue, desc.diskFileName,
+ desc.sliceCount, allocSize,
+ FileMode::Create, FileAccess::ReadWrite,
+ desc.directIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::None );
+ }
+
+ PanicIf( !buf->diskBuffer, "Failed to create DiskBuffer for GpuBuffer." );
+
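+            // The pinned buffers act as the disk buffer's write buffers;
+            // no read buffers are assigned at creation time.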
+ void* readBuffers [2] = { nullptr, nullptr };
+ void* writeBuffers[2] = { pinnedBuffers[0], pinnedBuffers[1] };
+
+ buf->diskBuffer->AssignBuffers( readBuffers, writeBuffers );
+ }
+ else
+ {
+ size_t diskAllocSize = 0;
+ if( desc.bucketedDiskBuffer )
+ {
+ diskAllocSize = DiskBucketBuffer::GetReserveAllocSize( *desc.diskQueue, desc.sliceCount, sliceSize );
+ }
+ else
+ {
+ diskAllocSize = DiskBuffer::GetReserveAllocSize( *desc.diskQueue, allocSize );
+ }
+
+ ASSERT( diskAllocSize == allocSize * 4 );
+ }
+ }
+
+ return buf;
+}
+
+void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent )
+{
+ // #MAYBE: Perhaps support having multiple callback streams, and multiple copy streams.
+
+ // Signal from the work stream into the callback stream that we are ready for callback
+ CudaErrCheck( cudaEventRecord( lockEvent, stream ) );
+
+    // Wait on the callback stream until it's ready to dispatch
+ CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) );
+
+ // #MAYBE: Use a bump allocator perhaps later to avoid locking here by new/delete if needed for performance.
+ auto* fnCpy = new std::function( std::move( func ) );
+ CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){
+
+ auto& fn = *reinterpret_cast*>( userData );
+ fn();
+ delete &fn;
+
+ }, fnCpy ) );
+
+ // Signal from the callback stream that the callback finished
+ CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) );
+
+ // Wait on work stream for the callback to complete
+ CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) );
+}
+
+size_t GpuQueue::CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc )
+{
+ const size_t alignment = desc.diskQueue ? desc.diskQueue->BlockSize() : desc.sliceAlignment;
+ return RoundUpToNextBoundaryT( desc.entrySize * desc.entriesPerSlice, alignment );
+}
+
+size_t GpuQueue::CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc )
+{
+ return CalculateSliceSizeFromDescriptor( desc ) * desc.sliceCount;
+}
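+
+// Worked example (hypothetical values, for illustration only): with entrySize = 4,
+// entriesPerSlice = 10000 and sliceAlignment = 4096 (no disk queue), the slice size
+// is RoundUpToNextBoundaryT( 40000, 4096 ) = 40960 bytes; with sliceCount = 64 the
+// full buffer size is 40960 * 64 = 2621440 bytes.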
+
+void GpuQueue::CopyPendingDownloadStream( void* userData )
+{
+ auto* buf = reinterpret_cast<IGpuBuffer*>( userData );
+
+ GpuQueue* queue = buf->queue;
+
+ //const uint32 index = buf->completedSequence % buf->bufferCount;
+ buf->completedSequence++;
+
+ //queue->GetCommand( CommandType::Download2D ) = buf->commands[index];
+ queue->SubmitCommands();
+}
+
+void GpuQueue::SubmitCommands()
+{
+ const uint64 ticket = _commitTicketOut++;
+
+ // Wait for our ticket to come up
+ while( _commitTicketIn.load( std::memory_order_relaxed ) != ticket );
+
+ _queue.Commit();
+ _bufferReadySignal.Release();
+ //_bufferReadySignal.Signal();
+
+ // Use our ticket
+ _commitTicketIn.store( ticket+1, std::memory_order_release );
+}
+
+GpuQueue::Command& GpuQueue::GetCommand( CommandType type )
+{
+ const uint64 ticket = _cmdTicketOut++;
+
+ // Wait for our ticket to come up
+ while( _cmdTicketIn.load( std::memory_order_relaxed ) != ticket );
+
+ Command* cmd;
+ while( !_queue.Write( cmd ) )
+ {
+ Log::Line( "[GpuQueue] Queue is depleted. Waiting for copies to complete." );
+ auto waitTimer = TimerBegin();
+
+ // Block and wait until we have commands free in the buffer
+ _bufferCopiedSignal.Wait();
+
+ Log::Line( "[GpuQueue] Waited %.6lf seconds for availability.", TimerEnd( waitTimer ) );
+ }
+
+ // Use our ticket
+ _cmdTicketIn.store( ticket+1, std::memory_order_release );
+
+ ZeroMem( cmd );
+ cmd->type = type;
+
+ return *cmd;
+}
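+
+// Illustrative producer-side usage (hypothetical call site; in this codebase the
+// callers are friend types such as GpuDownloadBuffer): acquire a command slot,
+// fill it in, then commit it for the queue thread to execute.
+//
+//   auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Callback );
+//   cmd.callback.callback  = userCallback;   // hypothetical GpuDownloadCallback
+//   cmd.callback.dstbuffer = hostBuffer;
+//   cmd.callback.copySize  = copySize;
+//   cmd.callback.userData  = userData;
+//   self->queue->SubmitCommands();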
+
+///
+/// Command thread
+///
+void GpuQueue::QueueThreadEntryPoint( GpuQueue* self )
+{
+ ASSERT( self );
+ self->QueueThreadMain();
+ self->_waitForExitSignal.Signal();
+}
+
+void GpuQueue::QueueThreadMain()
+{
+ const int32 CMD_BUF_SIZE = 256;
+ Command buffers[CMD_BUF_SIZE];
+
+ for( ;; )
+ {
+ _bufferReadySignal.Wait();
+
+ if( ShouldExitQueueThread() )
+ return;
+
+ // 1 command per semaphore release
+ int32 bufCount;
+ while( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
+ // if( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
+ {
+ ASSERT( bufCount <= CMD_BUF_SIZE );
+ _bufferCopiedSignal.Signal();
+
+ for( int i = 0; i < bufCount; i++ )
+ ExecuteCommand( buffers[i] );
+ }
+ }
+}
+
+void GpuQueue::ExecuteCommand( const Command& cmd )
+{
+
+ // const uint32 index = cmd.sequenceId % BBCU_GPU_BUFFER_MAX_COUNT;
+
+ if( cmd.type == CommandType::Copy )
+ {
+ auto& cpy = *cmd.copy;
+
+ const bool isSequentialCopy = cpy.dstStride == cpy.srcStride;
+ const size_t totalSize = cpy.height * cpy.width;
+
+ byte* dst = (byte*)cpy.dstBuffer;
+ const byte* src = (byte*)cpy.srcBuffer;
+
+ if( isSequentialCopy )
+ memcpy( dst, src, totalSize );
+ else
+ {
+ for( size_t i = 0; i < cpy.height; i++ )
+ {
+ memcpy( dst, src, cpy.width );
+
+ dst += cpy.dstStride;
+ src += cpy.srcStride;
+ }
+ }
+
+ cpy.self->fence.Signal( cpy.sequence+1 );
+ cpy.self->copyFence.Signal( cpy.sequence+1 );
+
+ if( cpy.callback )
+ cpy.callback( cpy.dstBuffer, totalSize, cpy.userData );
+ }
+ else if( cmd.type == CommandType::CopyArray )
+ {
+
+ }
+ else if( cmd.type == CommandType::Callback )
+ {
+ cmd.callback.callback( cmd.callback.dstbuffer, cmd.callback.copySize, cmd.callback.userData );
+ }
+ // else if( cmd.type == CommandType::Sync )
+ // {
+ // _syncFence.Signal();
+ // return;
+ // }
+ else
+ {
+ ASSERT( 0 );
+ }
+
+ // Signal that the pinned buffer is free
+ //cpy.finishedSignal->Signal( cpy.sequenceId + 1 );
+}
+
+inline bool GpuQueue::ShouldExitQueueThread()
+{
+ return _exitQueueThread.load( std::memory_order_acquire );
+}
+
diff --git a/cuda/GpuQueue.h b/cuda/GpuQueue.h
new file mode 100644
index 00000000..8adf41e5
--- /dev/null
+++ b/cuda/GpuQueue.h
@@ -0,0 +1,188 @@
+#pragma once
+
+#include "GpuStreams.h"
+#include <functional>
+
+class DiskQueue;
+
+struct GpuStreamDescriptor
+{
+ size_t entrySize;
+ size_t entriesPerSlice;
+ uint32 sliceCount;
+ uint32 sliceAlignment;
+ uint32 bufferCount;
+ IAllocator* deviceAllocator;
+ IAllocator* pinnedAllocator;
+ DiskQueue* diskQueue; // DiskQueue to use when disk offload mode is enabled.
+ const char* diskFileName; // File name to use when disk offload mode is enabled. The diskQueue must be set.
+ bool bucketedDiskBuffer; // If true, a DiskBucketBuffer will be used instead of a DiskBuffer.
+ bool directIO; // If true, direct I/O will be used when using disk offload mode.
+};
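+
+// Illustrative setup (hypothetical values; a dry-run pass can be used first so the
+// allocators can account for the required sizes before the real allocation pass):
+//
+//   GpuStreamDescriptor desc{};
+//   desc.entrySize       = sizeof( uint32 );
+//   desc.entriesPerSlice = entriesPerSlice;            // hypothetical count
+//   desc.sliceCount      = BBCU_BUCKET_COUNT;
+//   desc.sliceAlignment  = sizeof( uint32 );           // hypothetical alignment
+//   desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+//   desc.deviceAllocator = &devAllocator;
+//   desc.pinnedAllocator = &pinnedAllocator;           // may be null for direct mode
+//
+//   GpuDownloadBuffer downloadBuf = queue->CreateDownloadBufferT<uint32>( desc );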
+
+typedef std::function<void()> GpuCallbackDispath;
+
+class GpuQueue
+{
+ friend struct IGpuBuffer;
+ friend struct GpuDownloadBuffer;
+ friend struct GpuUploadBuffer;
+
+ enum class CommandType
+ {
+ None = 0,
+ Copy,
+ CopyArray,
+ Callback,
+ };
+
+ struct Command
+ {
+ CommandType type;
+
+ union
+ {
+ struct CopyInfo* copy;
+
+ struct {
+ GpuDownloadCallback callback;
+ size_t copySize;
+ void* dstbuffer;
+ void* userData;
+ } callback;
+ };
+ };
+
+public:
+
+ enum Kind
+ {
+ Downloader,
+ Uploader
+ };
+
+ GpuQueue( Kind kind );
+ virtual ~GpuQueue();
+
+ static size_t CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc );
+ static size_t CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc );
+
+ //GpuDownloadBuffer CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
+ //GpuDownloadBuffer CreateDownloadBuffer( const size_t size, bool dryRun = false );
+ GpuDownloadBuffer CreateDirectDownloadBuffer( size_t size, IAllocator& devAllocator, size_t alignment, bool dryRun = false );
+ GpuDownloadBuffer CreateDownloadBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
+ GpuDownloadBuffer CreateDownloadBuffer( size_t size, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
+
+ GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false );
+
+ /// Create with descriptor and override entry size
+ inline GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, size_t entrySize, bool dryRun = false )
+ {
+ GpuStreamDescriptor copy = desc;
+ copy.entrySize = entrySize;
+
+ return CreateDownloadBuffer( copy, dryRun );
+ }
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDownloadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false )
+ {
+ return CreateDownloadBuffer( desc, sizeof( T ), dryRun );
+ }
+
+ /// Create upload buffer with descriptor
+ GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false );
+
+ // inline GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool size_t entrySize, bool dryRun = false )
+ // {
+ // GpuStreamDescriptor copy = desc;
+ // copy.entrySize = entrySize;
+
+ // return CreateUploadBuffer( copy, dryRun );
+ // }
+
+ template<typename T>
+ inline GpuUploadBuffer CreateUploadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false )
+ {
+ GpuStreamDescriptor copy = desc;
+ copy.entrySize = sizeof(T);
+
+ return CreateUploadBuffer( copy, dryRun );
+ // return CreateUploadBuffer( desc, sizeof( T ), dryRun );
+ }
+
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDirectDownloadBuffer( const size_t count, IAllocator& devAllocator, size_t alignment = alignof( T ), bool dryRun = false )
+ {
+ return CreateDirectDownloadBuffer( count * sizeof( T ), devAllocator, alignment, dryRun );
+ }
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
+ {
+ return CreateDownloadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
+ }
+
+ template<typename T>
+ inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
+ {
+ return CreateDownloadBuffer( count * sizeof( T ), bufferCount, devAllocator, pinnedAllocator, alignment, dryRun );
+ }
+
+ //GpuUploadBuffer CreateUploadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
+ //GpuUploadBuffer CreateUploadBuffer( const size_t size, bool dryRun = false );
+ GpuUploadBuffer CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
+
+ template<typename T>
+ inline GpuUploadBuffer CreateUploadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false )
+ {
+ return CreateUploadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
+ }
+
+ inline cudaStream_t GetStream() const { return _stream; }
+
+protected:
+
+ struct IGpuBuffer* CreateGpuBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun );
+ struct IGpuBuffer* CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun );
+
+ void DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent );
+
+ static void CopyPendingDownloadStream( void* userData );
+
+ [[nodiscard]]
+ Command& GetCommand( CommandType type );
+ void SubmitCommands();
+
+ // Queue thread
+ static void QueueThreadEntryPoint( GpuQueue* self );
+ void QueueThreadMain();
+
+ void ExecuteCommand( const Command& cpy );
+
+ bool ShouldExitQueueThread();
+
+protected:
+ cudaStream_t _stream = nullptr;
+ cudaStream_t _preloadStream = nullptr;
+ cudaStream_t _callbackStream = nullptr;
+
+
+ Thread _queueThread;
+ //Fence _bufferReadySignal;
+ Semaphore _bufferReadySignal;
+ Fence _bufferCopiedSignal;
+ Fence _syncFence;
+ SPCQueue _queue;
+ Kind _kind;
+
+ AutoResetSignal _waitForExitSignal;
+ std::atomic<bool> _exitQueueThread = false;
+
+ // Support multiple threads to grab commands
+ std::atomic<uint64> _cmdTicketOut = 0;
+ std::atomic<uint64> _cmdTicketIn = 0;
+ std::atomic<uint64> _commitTicketOut = 0;
+ std::atomic<uint64> _commitTicketIn = 0;
+};
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index e5dcfd66..63700c9c 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -1,137 +1,105 @@
#include "GpuStreams.h"
-#include "util/StackAllocator.h"
+#include "GpuQueue.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
-struct PackedCopy
-{
- struct IGpuBuffer* self;
- const byte* src;
- uint32 sequence;
- uint32 length;
- uint32 stride;
- uint32 elementSize;
- uint32 counts[BBCU_BUCKET_COUNT];
-};
-
-struct CopyInfo
-{
- struct IGpuBuffer* self;
- uint32 sequence;
-
- const void* srcBuffer;
- void* dstBuffer;
- size_t width;
- size_t height;
- size_t dstStride;
- size_t srcStride;
-
- // Callback data
- GpuDownloadCallback callback;
- void* userData;
-};
-
-struct IGpuBuffer
-{
- size_t size;
- uint32 bufferCount; // Number of pinned/device buffers this instance contains
- void* deviceBuffer [BBCU_GPU_BUFFER_MAX_COUNT];
- void* pinnedBuffer [BBCU_GPU_BUFFER_MAX_COUNT]; // Pinned host buffer
- cudaEvent_t events [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use
- cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the buffer is ready for consumption by the device or buffer
- cudaEvent_t readyEvents [BBCU_GPU_BUFFER_MAX_COUNT]; // User must signal this event when the device buffer is ready for download
- // GpuQueue::Command commands [BBCU_GPU_BUFFER_MAX_COUNT]; // Pending copy command for downloads
- Fence fence; // Signals the pinned buffer is ready for use
- Fence copyFence;
-
- cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT];
-
- CopyInfo copies[BBCU_BUCKET_COUNT];
- PackedCopy packedCopeis[BBCU_BUCKET_COUNT]; // For uplad buffers
- // #TODO: Remove atomic again
- uint32 lockSequence; // Index of next buffer to lock
- uint32 outgoingSequence; // Index of locked buffer that will be downoaded/uploaded
- std::atomic<uint32> completedSequence; // Index of buffer that finished downloading/uploading
- std::atomic<uint32> copySequence;
-
- GpuQueue* queue;
-};
///
-/// DownloadBuffer
+/// UploadBuffer
///
-void* GpuDownloadBuffer::GetDeviceBuffer()
+void* GpuUploadBuffer::GetNextPinnedBuffer()
{
+ // Wait for the pinned host buffer to be available
+ //if( self->outgoingSequence > self->bufferCount-1 )
+ // self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
+ //
const uint32 index = self->outgoingSequence % self->bufferCount;
- CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
+ void* pinnedBuffer = self->pinnedBuffer[index];
- return self->deviceBuffer[index];
+ return pinnedBuffer;
}
-void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream )
+void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t workStream, bool directOverride )
{
- ASSERT( self->lockSequence >= self->outgoingSequence );
- ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount );
+ ASSERT( size );
- const uint32 index = self->lockSequence % self->bufferCount;
- self->lockSequence++;
+ const bool isDirect = (!self->pinnedBuffer[0] || directOverride) && !self->diskBuffer;
+ PanicIf( isDirect && !hostBuffer, "No host buffer provided for direct upload." );
- // Wait for the device buffer to be free to be used by kernels
- CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
- return self->deviceBuffer[index];
-}
+ const uint32 index = SynchronizeOutgoingSequence();
-void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size )
-{
- Download2D( hostBuffer, size, 1, size, size );
-}
+ auto uploadStream = self->queue->GetStream();
-void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride )
-{
- Download2D( hostBuffer, size, 1, size, size, workStream, directOverride );
-}
-
-void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
-{
- ASSERT( 0 );
- // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
- // ASSERT( hostBuffer );
- // ASSERT( workStream );
- // ASSERT( self->lockSequence > 0 );
- // ASSERT( self->outgoingSequence < self->lockSequence );
- // ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
-
- // auto& cpy = self->copies[self->outgoingSequence];
- // cpy.self = self;
- // cpy.sequence = self->outgoingSequence;
- // cpy.copy.hostBuffer = finalBuffer;
- // cpy.copy.srcBuffer = hostBuffer;
- // cpy.copy.size = size;
+ DiskBuffer* diskBuffer = nullptr;
+ if( self->diskBuffer )
+ {
+ // Preload data from disk into pinned buffer
+
+ diskBuffer = dynamic_cast<DiskBuffer*>( self->diskBuffer );
+ PanicIf( !diskBuffer, "Not a DiskBuffer." );
+ ASSERT( diskBuffer->GetAlignedBufferSize() >= size );
+
+ hostBuffer = self->pinnedBuffer[index];
+ ASSERT( hostBuffer == diskBuffer->PeekReadBufferForBucket( self->outgoingSequence - 1 ) );
+ ASSERT( self->outgoingSequence <= BBCU_BUCKET_COUNT );
+
+ CallHostFunctionOnStream( uploadStream, [=](){
+ // Read on disk queue's thread
+ diskBuffer->ReadNextBucket();
+
+ // Block until the buffer is fully read from disk
+ // #TODO: Also should not do this here, but in a host-to-host background stream,
+ // so that the next I/O read can happen in the background while
+ // the previous upload to disk is happening, if needed.
+ (void)diskBuffer->GetNextReadBuffer();
+ });
+ }
+ else if( !isDirect )
+ {
+ // Copy from unpinned to pinned first
+ // #TODO: This should be done in a different background host-to-host copy stream
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
+ CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
+ hostBuffer = self->pinnedBuffer[index];
+ }
- // const uint32 index = self->outgoingSequence % self->bufferCount;
- // self->outgoingSequence++;
+ // Ensure the device buffer is ready for use
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
- // void* pinnedBuffer = self->pinnedBuffer[index];
- // const void* devBuffer = self->deviceBuffer[index];
+ // Upload to the device buffer
+ CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );
- // // Signal from the work stream when it has finished doing kernel work with the device buffer
- // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
+ if( !isDirect )
+ {
+ // Signal that the pinned buffer is ready for re-use
+ CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+ }
+ // Signal work stream that the device buffer is ready to be used
+ CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
+}
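+
+// Summary of the three paths above (descriptive note): disk offload preloads the
+// bucket from disk into the pinned buffer on the upload stream; non-direct mode
+// first stages the caller's buffer into pinned memory with a host-to-host copy;
+// direct mode uploads the caller's buffer to the device as-is.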
- // // Ensure the work stream has completed writing data to the device buffer
- // cudaStream_t stream = self->queue->_stream;
+void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize )
+{
+ ASSERT(0);
+ // ASSERT( size >= copySize );
- // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
+ // Upload( hostBuffer, size, nullptr );
- // // Copy
- // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) );
-
- // // Signal that the device buffer is free to be re-used
- // CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+ // // Add callback for copy
+ // const uint32 sequence = self->outgoingSequence - 1;
+ // auto& cpy = self->copies[sequence];
+ // cpy.self = self;
+ // cpy.sequence = sequence;
+ // cpy.copy.hostBuffer = hostBuffer;
+ // cpy.copy.srcBuffer = copyBufferSrc;
+ // cpy.copy.size = copySize;
// // Launch copy command
- // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+ // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){
// const CopyInfo& c = *reinterpret_cast( userData );
// IGpuBuffer* self = c.self;
@@ -140,438 +108,113 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co
// cmd.copy.info = &c;
// self->queue->SubmitCommands();
-
- // // Signal the download completed
- // self->fence.Signal( ++self->completedSequence );
// }, &cpy ) );
}
-void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
-{
- Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride );
-}
-
-void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, cudaStream_t workStream, bool directOverride )
-{
- Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride );
-}
-
-void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
- GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
+void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStride,
+ uint32 countStride, const uint32* counts, cudaStream_t workStream )
{
- ASSERT( hostBuffer );
- ASSERT( workStream );
- ASSERT( self->lockSequence > 0 );
- ASSERT( self->outgoingSequence < self->lockSequence );
- ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
+ const uint32 index = SynchronizeOutgoingSequence();
+ const bool isDirect = self->pinnedBuffer[0] == nullptr && !self->diskBuffer;
- void* pinnedBuffer = self->pinnedBuffer[index];
- const void* devBuffer = self->deviceBuffer[index];
+ auto uploadStream = self->queue->GetStream();
- const bool isDirect = directOverride || self->pinnedBuffer[0] == nullptr; ASSERT( isDirect || self->pinnedBuffer[0] );
+ DiskBucketBuffer* diskBuffer = nullptr;
+ size_t totalBufferSize = 0;
- // Signal from the work stream when it has finished doing kernel work with the device buffer
- CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );
-
- // Ensure the work stream has completed writing data to the device buffer
- cudaStream_t stream = self->queue->_stream;
-
- CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );
-
- // Ensure the pinned buffer is ready for use
- if( !isDirect )
+ if( self->diskBuffer )
{
- // CudaErrCheck( cudaStreamWaitEvent( stream, self->completedEvents[index] ) );
- CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
-
- IGpuBuffer* self = reinterpret_cast<IGpuBuffer*>( userData );
- if( self->copySequence++ > 1 )
- {
- self->copyFence.Wait( self->copySequence-1 );
- }
- }, self ) );
- }
+ diskBuffer = dynamic_cast<DiskBucketBuffer*>( self->diskBuffer );
+ PanicIf( !diskBuffer, "Not a DiskBucketBuffer" );
- // Copy from device to pinned host buffer
- const bool isSequentialCopy = dstStride == srcStride;
- const size_t totalSize = height * width;
-
- if( isDirect )
- {
- if( isSequentialCopy )
- CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, stream ) );
- else
- CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, stream ) );
+ hostBuffer = diskBuffer->PeekReadBufferForBucket( self->outgoingSequence-1 );
+ ASSERT( self->outgoingSequence <= BBCU_BUCKET_COUNT );
- // Signal direct download completed
- auto& cpy = self->copies[self->outgoingSequence];
- cpy.self = self;
- cpy.sequence = self->outgoingSequence;
- cpy.dstBuffer = hostBuffer;
- cpy.callback = callback;
- cpy.userData = userData;
- cpy.height = height;
- cpy.width = width;
+ // if( nextReadBucket < BBCU_BUCKET_COUNT )
+ {
+ // Override the input slice sizes with the correct ones (as we wrote them with fixed size)
+
+ // Preload the bucket buffer from disk
+ CallHostFunctionOnStream( uploadStream, [=](){
- CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+ const uint32 nextReadBucket = diskBuffer->GetNextReadBucketId();
+ diskBuffer->OverrideReadSlices( nextReadBucket, elementSize, counts, countStride );
- CopyInfo& cpy = *reinterpret_cast<CopyInfo*>( userData );
- IGpuBuffer* self = cpy.self; //reinterpret_cast<IGpuBuffer*>( userData );
+ // Preloads in the background
+ diskBuffer->ReadNextBucket();
- self->fence.Signal( ++self->completedSequence );
+ // Upload the next one too, if needed
+ // #NOTE: This is a hacky way to do it for now.
+ // We ought to have a synchronized, separate, disk stream later
+ // if( nextReadBucket < BBCU_BUCKET_COUNT )
+ // diskBuffer->ReadNextBucket();
+ });
+ }
- // Dispatch callback, if one was set
- if( cpy.callback )
- cpy.callback( cpy.dstBuffer, cpy.height * cpy.width, cpy.userData );
+ // Wait for disk buffer to be ready
+ CallHostFunctionOnStream( uploadStream, [diskBuffer](){
- }, &cpy ) );
+ // Wait until next buffer is ready
+ (void)diskBuffer->GetNextReadBuffer();
+ });
}
else
{
- CudaErrCheck( cudaMemcpyAsync( pinnedBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, stream ) );
- }
-
- // Signal that the device buffer is free to be re-used
- CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+ // Perform fragmented uploads
+ const auto waitEvent = isDirect ? self->deviceEvents[index] : self->pinnedEvent[index];
+ const auto copyMode = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
- // If not a direct copy, we need to do another copy from the pinned buffer to the unpinned host buffer
- if( !isDirect )
- {
- // Signal the copy stream that the pinned buffer is ready to be copied to the unpinned host buffer
- CudaErrCheck( cudaEventRecord( self->preloadEvents[index], stream ) );
+ // Wait on device or pinned buffer to be ready (depending if a direct copy or not)
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );
- // Ensure the pinned buffer is ready for use
- cudaStream_t copyStream = self->queue->_preloadStream;
-
- CudaErrCheck( cudaStreamWaitEvent( copyStream, self->preloadEvents[index] ) );
+ const byte* src = (byte*)hostBuffer;
+ byte* dst = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] );
+ const uint32* sizes = counts;
+ for( uint32 i = 0; i < length; i++ )
{
- auto& cpy = self->copies[self->outgoingSequence];
- cpy.self = self;
- cpy.sequence = self->outgoingSequence;
-
- cpy.dstBuffer = hostBuffer;
- cpy.srcBuffer = pinnedBuffer;
- cpy.width = width;
- cpy.height = height;
- cpy.srcStride = srcStride;
- cpy.dstStride = dstStride;
- cpy.callback = callback;
- cpy.userData = userData;
-
- CudaErrCheck( cudaLaunchHostFunc( copyStream, []( void* userData ){
+ const size_t size = *sizes * (size_t)elementSize;
- CopyInfo& cpy = *reinterpret_cast<CopyInfo*>( userData );
- IGpuBuffer* self = cpy.self; //reinterpret_cast<IGpuBuffer*>( userData );
+ CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );
- auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
- cmd.copy = &cpy;
- self->queue->SubmitCommands();
-
- }, &cpy ) );
+ dst += size;
+ src += srcStride;
+ sizes += countStride;
}
- // Signal the pinned buffer is free to be re-used
- // CudaErrCheck( cudaEventRecord( self->completedEvents[index], copyStream ) );
- }
-
-
- // Signal the download completed
- // {
- // auto& cpy = self->copies[self->outgoingSequence];
- // cpy.self = self;
- // cpy.sequence = self->outgoingSequence;
-
- // cpy.copy2d.dstBuffer = hostBuffer;
- // cpy.copy2d.srcBuffer = pinnedBuffer;
- // cpy.copy2d.width = width;
- // cpy.copy2d.height = height;
- // cpy.copy2d.srcStride = srcStride;
- // cpy.copy2d.dstStride = dstStride;
-
- // CudaErrCheck( cudaLaunchHostFunc( copyStream, []( void* userData ){
-
- // CopyInfo& cpy = *reinterpret_cast( userData );
- // IGpuBuffer* self = cpy.self; //reinterpret_cast( userData );
-
- // const uint32 idx = cpy.sequence & self->bufferCount;
-
- // const byte* src = (byte*)cpy.copy2d.srcBuffer;
- // byte* dst = (byte*)cpy.copy2d.dstBuffer;
-
- // const size_t width = cpy.copy2d.width;
- // const size_t height = cpy.copy2d.height;
- // const size_t dstStride = cpy.copy2d.dstStride;
- // const size_t srcStride = cpy.copy2d.srcStride;
-
- // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Download2D );
- // cmd.sequenceId = cpy.sequence;
- // cmd.srcBuffer = src;
- // cmd.dstBuffer = dst;
- // cmd.download2d.buf = self;
- // cmd.download2d.width = width;
- // cmd.download2d.height = height;
- // cmd.download2d.srcStride = srcStride;
- // cmd.download2d.dstStride = dstStride;
- // self->queue->SubmitCommands();
-
- // // for( size_t i = 0; i < height; i++ )
- // // {
- // // memcpy( dst, src, width );
-
- // // dst += dstStride;
- // // src += srcStride;
- // // }
-
- // // self->fence.Signal( ++self->completedSequence );
- // }, &cpy ) );
- // }
- // CudaErrCheck( cudaEventRecord( self->completedEvents[index], copyStream ) );
-
- // if( callback )
- // {
- // ASSERT( width <= srcStride );
- // ASSERT( width <= dstStride );
-
- // auto& cpy = self->copies[self->outgoingSequence];
- // cpy.self = self;
- // cpy.sequence = self->outgoingSequence;
- // cpy.callback.hostBuffer = hostBuffer;
- // cpy.callback.size = width * height;
- // cpy.callback.callback = callback;
- // cpy.callback.userData = userData;
-
- // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
-
- // auto& cpy = *reinterpret_cast( userData );
- // auto* self = cpy.self;
-
- // // Fire callback command
- // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Callback );
- // cmd.dstBuffer = cpy.callback.hostBuffer;
- // cmd.callback.copySize = cpy.callback.size;
- // cmd.callback.callback = cpy.callback.callback;
- // cmd.callback.userData = cpy.callback.userData;
- // self->queue->SubmitCommands();
-
- // // Signal the download completed
- // self->fence.Signal( ++self->completedSequence );
- // }, &cpy ) );
- // }
- // else
- // {
- // // Signal the download completed
- // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
-
- // IGpuBuffer* self = reinterpret_cast( userData );
- // self->fence.Signal( ++self->completedSequence );
- // }, self ) );
- // }
-
- self->outgoingSequence++;
-}
-
-void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
- uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData )
-{
- ASSERT( width );
- ASSERT( height );
- ASSERT( hostBuffer );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
-
- // We need to block until the pinned buffer is available.
- if( self->outgoingSequence > self->bufferCount-1 )
- self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
-
- void* pinnedBuffer = self->pinnedBuffer[index];
- const void* devBuffer = self->deviceBuffer[index];
-
- //auto& cmd = self->commands[index];
- //cmd.type = GpuQueue::CommandType::Copy2D;
- //cmd.sequenceId = self->outgoingSequence++;
- //cmd.finishedSignal = &self->fence;
- //cmd.dstBuffer = hostBuffer;
- //cmd.srcBuffer = pinnedBuffer;
- //cmd.copy2d.width = width;
- //cmd.copy2d.height = height;
- //cmd.copy2d.dstStride = dstStride;
- //cmd.copy2d.srcStride = srcStride;
- //cmd.copy2d.callback = callback;
- //cmd.copy2d.userData = userData;
-
- outIndex = index;
- outPinnedBuffer = pinnedBuffer;
- outDevBuffer = devBuffer;
-}
-
-
-void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize )
-{
- ASSERT( length );
- ASSERT( elementSize );
- ASSERT( counts );
-
- uint32 totalElements = 0;
- for( uint32 i = 0; i < length; i++ )
- totalElements += counts[i];
-
- const size_t totalSize = (size_t)totalElements * elementSize;
-
- uint32 index;
- void* pinnedBuffer;
- const void* devBuffer;
- GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer );
-
-
- srcStride *= elementSize;
-
- byte* dst = (byte*)pinnedBuffer;
- const byte* src = (byte*)devBuffer;
-
- cudaStream_t stream = self->queue->_stream;
-
- // Copy all buffers from device to pinned buffer
- for( uint32 i = 0; i < length; i++ )
- {
- const size_t copySize = counts[i] * (size_t)elementSize;
-
- // #TODO: Determine if there's a cuda (jagged) array copy
- CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) );
-
- src += srcStride;
- dst += copySize;
+ if( !isDirect )
+ {
+ // Set the pinned buffer as the host buffer so that we can do a sequential copy to the device now
+ hostBuffer = self->pinnedBuffer[index];
+ }
}
- // Signal that the device buffer is free
- CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
-
- // Submit command to do the final copy from pinned to host
- CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) );
-}
-
-void GpuDownloadBuffer::WaitForCompletion()
-{
- if( self->outgoingSequence > 0 )
+ // Upload to device buffer if in non-direct mode
+ if( !isDirect )
{
- //const uint32 index = (self->outgoingSequence - 1) % self->bufferCount;
-
- // cudaEvent_t event = self->completedEvents[index];
- //const cudaError_t r = cudaEventQuery( event );
-
- //if( r == cudaSuccess )
- // return;
-
- //if( r != cudaErrorNotReady )
- // CudaErrCheck( r );
+ for( uint32 i = 0; i < length; i++ )
+ {
+ ASSERT( *counts );
+ totalBufferSize += *counts * (size_t)elementSize;
+ counts += countStride;
+ }
- //CudaErrCheck( cudaEventSynchronize( event ) );
-
- self->fence.Wait( self->outgoingSequence );
- }
-}
+ // #TODO: This should be done in a copy stream to perform the copies in the background
+ CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+ CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-void GpuDownloadBuffer::WaitForCopyCompletion()
-{
- if( self->outgoingSequence > 0 )
- {
- self->copyFence.Wait( self->outgoingSequence );
+ if( !self->diskBuffer )
+ CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
}
-}
-
-void GpuDownloadBuffer::Reset()
-{
- self->lockSequence = 0;
- self->outgoingSequence = 0;
- self->completedSequence = 0;
- self->copySequence = 0;
- self->fence.Reset( 0 );
- self->copyFence.Reset( 0 );
-}
-
-GpuQueue* GpuDownloadBuffer::GetQueue() const
-{
- return self->queue;
-}
-
-
-///
-/// UploadBuffer
-///
-void* GpuUploadBuffer::GetNextPinnedBuffer()
-{
- // Wait for the pinned host buffer to be available
- //if( self->outgoingSequence > self->bufferCount-1 )
- // self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );
- //
- const uint32 index = self->outgoingSequence % self->bufferCount;
-
- void* pinnedBuffer = self->pinnedBuffer[index];
-
- return pinnedBuffer;
-}
-
-void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t workStream )
-{
- ASSERT( hostBuffer );
- ASSERT( size );
- ASSERT( self->outgoingSequence - self->lockSequence < 2 );
- // ASSERT( workStream );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
- self->outgoingSequence++;
-
- auto stream = self->queue->GetStream();
-
- // Ensure the device buffer is ready for use
- CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
-
- // Upload to device buffer
- CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, stream ) );
// Signal work stream that the device buffer is ready to be used
- CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
+ CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
}
-void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize )
-{
- ASSERT(0);
- // ASSERT( size >= copySize );
-
- // Upload( hostBuffer, size, nullptr );
-
- // // Add callback for copy
- // const uint32 sequence = self->outgoingSequence - 1;
- // auto& cpy = self->copies[sequence];
- // cpy.self = self;
- // cpy.sequence = sequence;
- // cpy.copy.hostBuffer = hostBuffer;
- // cpy.copy.srcBuffer = copyBufferSrc;
- // cpy.copy.size = copySize;
-
- // // Launch copy command
- // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){
-
- // const CopyInfo& c = *reinterpret_cast( userData );
- // IGpuBuffer* self = c.self;
-
- // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
- // cmd.copy.info = &c;
-
- // self->queue->SubmitCommands();
- // }, &cpy ) );
-}
-
-void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStride,
- uint32 countStride, const uint32* counts, cudaStream_t workStream )
+void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length,
+ uint32 elementSize, uint32 srcStride, uint32 countStride, const uint32* counts )
{
ASSERT( hostBuffer );
- ASSERT( self->outgoingSequence - self->lockSequence < 2 );
-
- const uint32 index = self->outgoingSequence % self->bufferCount;
- self->outgoingSequence++;
auto stream = self->queue->GetStream();
@@ -632,28 +275,6 @@ void* GpuUploadBuffer::GetUploadedDeviceBuffer( cudaStream_t workStream )
return self->deviceBuffer[index];
}
-void* GpuUploadBuffer::GetUploadedDeviceBuffer()
-{ASSERT(0); // Not allowed for now
- if( self->outgoingSequence < 1 )
- {
- ASSERT( 0 );
- return nullptr;
- }
- ASSERT( 0 );
- const uint32 index = self->completedSequence % self->bufferCount;
-
- // #TODO: Make this spin way.
- // #TODO: Find a better way to do this instead of having to wait on both primitives.
- // Can't check the cuda event until we're sure it's been
- // added to the stream
- self->fence.Wait( self->completedSequence + 1 );
- CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
-
- self->completedSequence++;
-
- return self->deviceBuffer[index];
-}
-
void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream )
{
ASSERT( self->outgoingSequence > self->lockSequence );
@@ -663,7 +284,7 @@ void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream )
const uint32 index = self->lockSequence % self->bufferCount;
self->lockSequence++;
- CudaErrCheck( cudaEventRecord( self->events[index], workStream ) );
+ CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) );
}
void GpuUploadBuffer::WaitForPreloadsToComplete()
@@ -674,6 +295,17 @@ void GpuUploadBuffer::WaitForPreloadsToComplete()
}
}
+uint32 GpuUploadBuffer::SynchronizeOutgoingSequence()
+{
+ PanicIf( self->outgoingSequence < self->lockSequence || self->outgoingSequence - self->lockSequence >= 2,
+ "Invalid outgoing synchronization sequence state." );
+
+ const uint32 index = self->outgoingSequence % self->bufferCount;
+ self->outgoingSequence++;
+
+ return index;
+}
+
void GpuUploadBuffer::Reset()
{
self->lockSequence = 0;
@@ -689,362 +321,32 @@ GpuQueue* GpuUploadBuffer::GetQueue() const
return self->queue;
}
-
-///
-/// Shared GpuStream Inteface
-///
-GpuQueue::GpuQueue( Kind kind ) : _kind( kind )
- , _bufferReadySignal( BBCU_BUCKET_COUNT )
-{
- CudaErrCheck( cudaStreamCreateWithFlags( &_stream, cudaStreamNonBlocking ) );
- CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream, cudaStreamNonBlocking ) );
-
- _copyThread.Run( CopyThreadEntryPoint, this );
-}
-
-GpuQueue::~GpuQueue()
-{
- _exitCopyThread.store( true, std::memory_order_release );
- _bufferReadySignal.Release();
- _waitForExitSignal.Wait();
-}
-
-//void GpuQueue::Synchronize()
-//{
-// (void)GetCommand( CommandType::Sync );
-// SubmitCommands();
-//
-// _syncFence.Wait();
-//}
-
-
-//GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size, bool dryRun )
-//{
-// FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
-// if( dryRun ) return { nullptr };
-//
-// // #TODO: Set size?
-// return { CreateGpuBuffer( dev0, dev1, pinned0, pinned1, size ) };
-//}
-
-//GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, bool dryRun )
-//{
-// FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
-// if( dryRun ) return { nullptr };
-// return { CreateGpuBuffer( size ) };
-//}
-
-GpuDownloadBuffer GpuQueue::CreateDirectDownloadBuffer( const size_t size, IAllocator& devAllocator, const size_t alignment, const bool dryRun )
-{
- FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
- GpuDownloadBuffer r = { CreateGpuBuffer( size, BBCU_DEFAULT_GPU_BUFFER_COUNT, &devAllocator, nullptr, alignment, dryRun ) };
-
- if( !dryRun )
- r.Reset();
-
- return r;
-}
-
-GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+void GpuUploadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer )
{
- FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
- GpuDownloadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
+ ASSERT( self->pinnedBuffer[0] );
- if( !dryRun )
- r.Reset();
+ void* nullBuffers[2] = { nullptr, nullptr };
+ if( self->diskBuffer )
+ self->diskBuffer->AssignReadBuffers( nullBuffers );
- return r;
+ self->diskBuffer = diskBuffer;
+ if( self->diskBuffer )
+ self->diskBuffer->AssignReadBuffers( self->pinnedBuffer );
}
-GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, const uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+DiskBufferBase* GpuUploadBuffer::GetDiskBuffer() const
{
- FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" );
- GpuDownloadBuffer r = { CreateGpuBuffer( size, bufferCount, &devAllocator, &pinnedAllocator, alignment, dryRun ) };
-
- if( !dryRun )
- r.Reset();
-
- return r;
+ return self->diskBuffer;
}
-GpuUploadBuffer GpuQueue::CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
+void GpuUploadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function<void()> func )
{
- FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on an DownloadQueue" );
- GpuUploadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) };
-
- if( !dryRun )
- r.Reset();
+ auto* fnCpy = new std::function<void()>( std::move( func ) );
+ CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {
- return r;
-}
+ auto& fn = *reinterpret_cast<std::function<void()>*>( userData );
+ fn();
+ delete &fn;
-
-struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun )
-{
- return CreateGpuBuffer( size, BBCU_DEFAULT_GPU_BUFFER_COUNT, &devAllocator, &pinnedAllocator, alignment, dryRun );
-}
-
-struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, const uint32 bufferCount, IAllocator* devAllocator, IAllocator* pinnedAllocator, size_t alignment, bool dryRun )
-{
- FatalIf( bufferCount > BBCU_GPU_BUFFER_MAX_COUNT, "GPU Buffer count overflow." );
-
- const size_t allocSize = RoundUpToNextBoundaryT( size, alignment );
-
- void* devBuffers [BBCU_GPU_BUFFER_MAX_COUNT] = {};
- void* pinnedBuffers[BBCU_GPU_BUFFER_MAX_COUNT] = {};
-
- for( int32 i = 0; i < bufferCount; i++ )
- {
- devBuffers[i] = devAllocator->Alloc( allocSize, alignment );
-
- if( pinnedAllocator )
- pinnedBuffers[i] = pinnedAllocator->Alloc( allocSize, alignment );
- }
-
- if( dryRun ) return nullptr;
-
- struct IGpuBuffer* buf = new IGpuBuffer{};
-
- for( int32 i = 0; i < bufferCount; i++ )
- {
- CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) );
- CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
- CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) );
- CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) );
-
- buf->deviceBuffer[i] = devBuffers[i];
- buf->pinnedBuffer[i] = pinnedBuffers[i];
- // buf->commands[i] = {};
-
- // Events have to be disabled initially for uploads
- //if( _kind == Uploader )
- //{
- // CudaErrCheck( cudaEventSynchronize( buf->events[i] ) );
- // CudaErrCheck( cudaEventSynchronize( buf->completedEvents[i] ) );
- // CudaErrCheck( cudaEventSynchronize( buf->readyEvents[i] ) );
- //}
- }
-
- buf->size = size;
- buf->bufferCount = bufferCount;
- buf->queue = this;
-
- return buf;
-}
-
-//struct IGpuBuffer* GpuQueue::CreateGpuBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, const size_t size )
-//{
-// ASSERT( dev0 );
-// ASSERT( dev1 );
-// ASSERT( pinned0 );
-// ASSERT( pinned1 );
-//
-// ASSERT( dev0 != dev1 );
-// ASSERT( pinned0 != pinned1 );
-//
-//#if _DEBUG
-// if( size )
-// {
-// ASSERT_DOES_NOT_OVERLAP( dev0 , dev1 , size );
-// ASSERT_DOES_NOT_OVERLAP( dev0 , pinned0, size );
-// ASSERT_DOES_NOT_OVERLAP( dev0 , pinned1, size );
-// ASSERT_DOES_NOT_OVERLAP( dev1 , pinned0, size );
-// ASSERT_DOES_NOT_OVERLAP( dev1 , pinned1, size );
-// ASSERT_DOES_NOT_OVERLAP( pinned0, pinned1, size );
-// }
-//#endif
-//
-// struct IGpuBuffer* buf = new IGpuBuffer();
-//
-// CudaErrCheck( cudaEventCreateWithFlags( &buf->events[0], cudaEventDisableTiming ) );
-// CudaErrCheck( cudaEventCreateWithFlags( &buf->events[1], cudaEventDisableTiming ) );
-//
-// buf->deviceBuffer[0] = dev0;
-// buf->deviceBuffer[1] = dev1;
-//
-// buf->pinnedBuffer[0] = pinned0;
-// buf->pinnedBuffer[1] = pinned1;
-//
-// buf->size = size;
-// buf->fence.Reset( 0 );
-//
-// buf->commands[0] = {};
-// buf->commands[1] = {};
-//
-// buf->outgoingSequence = 0;
-// buf->completedSequence = 0;
-//
-// buf->queue = this;
-//
-// return buf;
-//}
-
-//struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size )
-//{
-// ASSERT( size );
-//
-// void* dev0;
-// void* dev1;
-// void* pinned0;
-// void* pinned1;
-//
-// CudaErrCheck( cudaMalloc( &dev0, size ) );
-// CudaErrCheck( cudaMalloc( &dev1, size ) );
-// CudaErrCheck( cudaMallocHost( &pinned0, size ) );
-// CudaErrCheck( cudaMallocHost( &pinned1, size ) );
-//
-// return CreateGpuBuffer( dev0, dev1, pinned0, pinned1, size );
-//}
-
-void GpuQueue::CopyPendingDownloadStream( void* userData )
-{
- auto* buf = reinterpret_cast<IGpuBuffer*>( userData );
-
- GpuQueue* queue = buf->queue;
-
- //const uint32 index = buf->completedSequence % buf->bufferCount;
- buf->completedSequence++;
-
- //queue->GetCommand( CommandType::Download2D ) = buf->commands[index];
- queue->SubmitCommands();
-}
-
-void GpuQueue::SubmitCommands()
-{
- const uint64 ticket = _commitTicketOut++;
-
- // Wait for our ticket to come up
- while( _commitTicketIn.load( std::memory_order_relaxed ) != ticket );
-
- _queue.Commit();
- _bufferReadySignal.Release();
- //_bufferReadySignal.Signal();
-
- // Use our ticket
- _commitTicketIn.store( ticket+1, std::memory_order_release );
-}
-
-GpuQueue::Command& GpuQueue::GetCommand( CommandType type )
-{
- const uint64 ticket = _cmdTicketOut++;
-
- // Wait for our ticket to come up
- while( _cmdTicketIn.load( std::memory_order_relaxed ) != ticket );
-
- Command* cmd;
- while( !_queue.Write( cmd ) )
- {
- Log::Line( "[GpuQueue] Queue is depleted. Waiting for copies to complete." );
- auto waitTimer = TimerBegin();
-
- // Block and wait until we have commands free in the buffer
- _bufferCopiedSignal.Wait();
-
- Log::Line( "[GpuQueue] Waited %.6lf seconds for availability.", TimerEnd( waitTimer ) );
- }
-
- // Use our ticket
- _cmdTicketIn.store( ticket+1, std::memory_order_release );
-
- ZeroMem( cmd );
- cmd->type = type;
-
- return *cmd;
-}
-
-
-///
-/// Command thread
-///
-void GpuQueue::CopyThreadEntryPoint( GpuQueue* self )
-{
- ASSERT( self );
- self->CopyThreadMain();
- self->_waitForExitSignal.Signal();
-}
-
-void GpuQueue::CopyThreadMain()
-{
- const int32 CMD_BUF_SIZE = 256;
- Command buffers[CMD_BUF_SIZE];
-
- for( ;; )
- {
- _bufferReadySignal.Wait();
-
- if( ShouldExitCopyThread() )
- return;
-
- // 1 command per semaphore release
- int32 bufCount;
- while( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
- // if( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) )
- {
- ASSERT( bufCount <= CMD_BUF_SIZE );
- _bufferCopiedSignal.Signal();
-
- for( int i = 0; i < bufCount; i++ )
- ExecuteCommand( buffers[i] );
- }
- }
-}
-
-void GpuQueue::ExecuteCommand( const Command& cmd )
-{
-
- // const uint32 index = cmd.sequenceId % BBCU_GPU_BUFFER_MAX_COUNT;
-
- if( cmd.type == CommandType::Copy )
- {
- auto& cpy = *cmd.copy;
-
- const bool isSequentialCopy = cpy.dstStride == cpy.srcStride;
- const size_t totalSize = cpy.height * cpy.width;
-
- byte* dst = (byte*)cpy.dstBuffer;
- const byte* src = (byte*)cpy.srcBuffer;
-
- if( isSequentialCopy )
- memcpy( cpy.dstBuffer, cpy.srcBuffer, totalSize );
- else
- {
- const byte* src = (byte*)cpy.srcBuffer;
- byte* dst = (byte*)cpy.dstBuffer;
-
- for( size_t i = 0; i < cpy.height; i++ )
- {
- memcpy( dst, src, cpy.width );
-
- dst += cpy.dstStride;
- src += cpy.srcStride;
- }
- }
-
- cpy.self->fence.Signal( cpy.sequence+1 );
- cpy.self->copyFence.Signal( cpy.sequence+1 );
-
- if( cpy.callback )
- cpy.callback( cpy.dstBuffer, totalSize, cpy.userData );
- }
- else if( cmd.type == CommandType::Callback )
- {
- cmd.callback.callback( cmd.callback.dstbuffer, cmd.callback.copySize, cmd.callback.userData );
- }
- // else if( cmd.type == CommandType::Sync )
- // {
- // _syncFence.Signal();
- // return;
- // }
- else
- {
- ASSERT( 0 );
- }
-
- // Signal that the pinned buffer is free
- //cpy.finishedSignal->Signal( cpy.sequenceId + 1 );
-}
-
-inline bool GpuQueue::ShouldExitCopyThread()
-{
- return _exitCopyThread.load( std::memory_order_acquire );
+ }, fnCpy ) );
}
diff --git a/cuda/GpuStreams.h b/cuda/GpuStreams.h
index ae1a5b63..2a310059 100644
--- a/cuda/GpuStreams.h
+++ b/cuda/GpuStreams.h
@@ -5,22 +5,127 @@
#include "threading/Fence.h"
#include "threading/Semaphore.h"
#include "util/SPCQueue.h"
+#include "util/StackAllocator.h"
+#include <functional>
-//#define GPU_BUFFER_COUNT
+class DiskBufferBase;
+class DiskBuffer;
+class DiskBucketBuffer;
+struct GpuDownloadBuffer;
+struct GpuUploadBuffer;
+struct GpuQueue;
+typedef std::function<void()> GpuStreamCallback;
+typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
+
+struct PackedCopy
+{
+ struct IGpuBuffer* self;
+ const byte* src;
+ uint32 sequence;
+ uint32 length;
+ uint32 stride;
+ uint32 elementSize;
+ uint32 counts[BBCU_BUCKET_COUNT];
+};
+
+struct DiskDataInfo
+{
+ DiskBufferBase* diskBuffer;
+
+ union {
+ struct {
+ GpuUploadBuffer* self;
+ uint32 sequence;
+ } uploadInfo;
+
+ struct {
+ size_t srcStride;
+ } download2DInfo;
+
+ struct {
+ size_t size;
+ } downloadSequentialInfo;
+ };
+};
+
+struct CopyInfo
+{
+ struct IGpuBuffer* self;
+ uint32 sequence;
+
+ const void* srcBuffer;
+ void* dstBuffer;
+ size_t width;
+ size_t height;
+ size_t dstStride;
+ size_t srcStride;
+
+ // Callback data
+ GpuDownloadCallback callback;
+ void* userData;
+};
// Represents a double-buffered device buffer, which can be used with a GpuStreamQueue to
// make fast transfers (via intermediate pinned memory)
-class IAllocator;
-
enum class GpuStreamKind : uint32
{
Download = 0,
Upload
};
-typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
+struct IGpuBuffer
+{
+ size_t size;
+ uint32 bufferCount; // Number of pinned/device buffers this instance contains
+ void* deviceBuffer[BBCU_GPU_BUFFER_MAX_COUNT];
+ void* pinnedBuffer[BBCU_GPU_BUFFER_MAX_COUNT]; // Pinned host buffer
+
+
+ cudaEvent_t pinnedEvent[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the pinned buffer is ready for use
+
+ union {
+ cudaEvent_t deviceEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the device buffer is ready for use
+ cudaEvent_t events [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use
+ };
+
+
+ union {
+ cudaEvent_t workEvent [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the work stream is done w/ the device buffer, and it's ready for use
+ cudaEvent_t readyEvents [BBCU_GPU_BUFFER_MAX_COUNT]; // User must signal this event when the device buffer is ready for download
+ };
+ cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the buffer is ready for consumption by the device or buffer
+
+ // For dispatching host callbacks.
+ // Each buffer uses its own function?
+ cudaEvent_t callbackLockEvent;
+ cudaEvent_t callbackCompletedEvent;
+
+ Fence fence; // Signals the pinned buffer is ready for use
+ Fence copyFence;
+
+ cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT];
+
+
+ CopyInfo copies[BBCU_BUCKET_COUNT];
+ // union {
+ // PackedCopy packedCopies[BBCU_BUCKET_COUNT]; // For upload buffers
+ DiskDataInfo diskData[BBCU_BUCKET_COUNT];
+ // };
+ // DiskBucketBuffer* diskBucketBuffer = nullptr;
+
+ // #TODO: Remove atomic again
+ uint32 lockSequence; // Index of next buffer to lock
+ uint32 outgoingSequence; // Index of locked buffer that will be downloaded/uploaded
+ std::atomic<uint32> completedSequence; // Index of buffer that finished downloading/uploading
+ std::atomic