diff --git a/.idea/vcs.xml b/.idea/vcs.xml index 54a1aefd..94a25f7f 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -2,13 +2,5 @@ - - - - - - - - \ No newline at end of file diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json deleted file mode 100644 index 8ae95aee..00000000 --- a/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "configurations": [ - { - "name": "Win32", - "includePath": [ - "${workspaceFolder}/**" - ], - "defines": [ - "_DEBUG", - "UNICODE", - "_UNICODE" - ], - "windowsSdkVersion": "10.0.19041.0", - "compilerPath": "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\VC\\Tools\\MSVC\\14.29.30133\\bin\\Hostx64\\x64\\cl.exe", - "cStandard": "c17", - "cppStandard": "c++17", - "intelliSenseMode": "windows-msvc-x64", - "configurationProvider": "ms-vscode.cmake-tools", - "forcedInclude": [ - "src/pch.h" - ] - }, - { - "name": "macOS", - "includePath": [ - "${workspaceFolder}/**" - ], - "defines": [], - "macFrameworkPath": [ - "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks" - ], - "compilerPath": "/usr/bin/clang", - "cStandard": "c17", - "cppStandard": "c++17", - "intelliSenseMode": "macos-clang-arm64", - "configurationProvider": "ms-vscode.cmake-tools", - "forcedInclude": [ - "src/pch.h" - ] - } - ], - "version": 4 -} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 6957af27..bb356736 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -131,19 +131,25 @@ "preLaunchTask" : "build_cuda_debug", "program": "${workspaceFolder}/build/bladebit_cuda", - + // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6", // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb", + + "args": + "-w -n 1 -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --check 100 --check-threshold 2 /home/harold/plot", + + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot", + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot", + // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot", + "-w -z 1 -f 
ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot", - "args": - // "-w --compress 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp", - "-w --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot", "windows": { "type": "cppvsdbg", "program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe", - "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/" + // "args": "--benchmark -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/" + "args": "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot -t2 D:/chia_test_plots D:/chia_test_plots", } }, @@ -236,7 +242,7 @@ { "name" : "Tests", - + "type" : "cppdbg", "osx": { "MIMode": "lldb", @@ -245,7 +251,7 @@ "stopAtEntry" : false, "cwd" : "${workspaceFolder}", "preLaunchTask" : "build_tests_debug", - "console" : "internalConsole", + // "console" : "internalConsole", "program": "${workspaceRoot}/build/tests", @@ -260,6 +266,8 @@ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c06-2023-02-14-21-43-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" }, { "name": "bb_clevel" , "value": "1" }, { "name": "bb_end_clevel" , "value": "1" }, + + { "name": "bb_queue_path" , "value": "/home/ubuntu/plot" }, ], "args": [ @@ -273,7 +281,10 @@ // "line-point-deltas" // "compressed-plot-proof" // "compressed-plot-qualities" - "macos-threads" + // "macos-threads" + // "disk-slices" + // "disk-buckets" + "[disk-queue]" ] } @@ -285,10 +296,16 @@ "stopAtEntry" : false, "cwd" : "${workspaceFolder}", "preLaunchTask" : "build_debug", - "console" : "internalConsole", "program": "${workspaceFolder}/build/bladebit", - + // "program": "${workspaceFolder}/build/bladebit_cuda", + + "linux": { + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb", + "program": "${workspaceFolder}/build/bladebit" + }, + "windows": { "type" : "cppvsdbg", "program": "${workspaceFolder}/build/debug/bladebit.exe" @@ -301,6 +318,11 @@ // "-t", "48", // "-t", "1", + // "validate", "--f7", "2", + // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + // 
"/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", + // "/home/harold/plot/plot-k32-c01-2023-08-03-04-57-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + // "-t", "1", "validate", "--f7", "324", "~/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" // "validate", "--f7", "7", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", // "validate", "--cuda", "--f7", "4", "~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot", @@ -322,8 +344,8 @@ // // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-23-15-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot" // Simulation - "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff", - "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + // "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff", + // "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" // "-t", "30", "simulate", "-p", "2", "-n", "600", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot" // "-m", @@ -335,11 +357,18 @@ // "--f7", "3983284117", "/home/harito/plot/tmp/gpu_1.plot", /// Compare - // "plotcmp", - // "/home/harito/plot/tmp/gpu_1.plot.old", - // "/home/harold/plot-tmpfs/gpu_1.plot", - // "/home/harito/plot/tmp/gpu_1.plot", - // "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + "plotcmp", + "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", + "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot", + + // "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + + // Check + // "check", + // "-n", "100", "--seed", "dc471c4d905ba3a65c6cecb46d97b132c0c98f51d416db5ec5cbdbe95ef2832f", + // "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" + // "/home/harold/plot/jm.plot" ] }, diff --git a/.vscode/settings.json b/.vscode/settings.json index c6c5274d..6c2da21b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,16 +4,16 @@ "nominmax" ], "files.associations": { + "*.sd": "yaml", + "*.userprefs": "xml", + "*.make": "makefile", "Fastfile": "ruby", "*.plist": "xml", - "*.sd": "yaml", "*.json": "jsonc", "*.ir": "llvm", "*.qs": "javascript", "*.ac": "shellscript", "player": "json", - "*.userprefs": "xml", - "*.make": "makefile", "memory": "cpp", "cstddef": "cpp", "string": "cpp", @@ -113,7 +113,18 @@ "filesystem": "cpp", "__bits": "cpp", "csignal": "cpp", - "cfenv": "cpp" + "cfenv": "cpp", + "ranges": "cpp", + "xhash": "cpp", + "xmemory": "cpp", + "xstddef": "cpp", + "xstring": "cpp", + "xtr1common": "cpp", + "xtree": "cpp", + "xutility": "cpp", + "__assert": "cpp", + 
"*.inc": "cpp", + "xiosbase": "cpp" }, "cSpell.words": [ "Ryzen" @@ -124,7 +135,13 @@ "cmake.preferredGenerators": [ "Unix Makefiles", "Visual Studio 17 2022" - ] + ], + // "cmake.buildArgs": [], + "cmake.configureSettings": { + "BB_ENABLE_TESTS": "ON", + "BB_CUDA_USE_NATIVE": "ON" + }, + "C_Cpp.dimInactiveRegions": false, // "cmake.generator": "Unix Makefiles" // "cmake.generator": "Visual Studio 17 2022" diff --git a/Bladebit.cmake b/Bladebit.cmake index 6ce0ad97..ffd03d67 100644 --- a/Bladebit.cmake +++ b/Bladebit.cmake @@ -227,6 +227,8 @@ set(src_bladebit src/plotting/PlotWriter.cpp src/plotting/PlotWriter.h src/plotting/Tables.h + src/plotting/BufferChain.h + src/plotting/BufferChain.cpp src/plotting/f1/F1Gen.h src/plotting/f1/F1Gen.cpp @@ -258,6 +260,7 @@ set(src_bladebit src/tools/PlotReader.cpp src/tools/PlotReader.h src/tools/PlotValidator.cpp + src/tools/PlotChecker.cpp src/util/Array.h src/util/Array.inl @@ -289,6 +292,18 @@ set(src_bladebit src/harvesting/GreenReaper.h src/harvesting/GreenReaperInternal.h src/harvesting/Thresher.h + + src/plotting/DiskQueue.h + src/plotting/DiskQueue.cpp + src/plotting/DiskBuffer.h + src/plotting/DiskBuffer.cpp + src/plotting/DiskBucketBuffer.h + src/plotting/DiskBucketBuffer.cpp + src/plotting/DiskBufferBase.h + src/plotting/DiskBufferBase.cpp + + src/util/MPMCQueue.h + src/util/CommandQueue.h ) target_sources(bladebit_core PUBLIC ${src_bladebit}) diff --git a/BladebitCUDA.cmake b/BladebitCUDA.cmake index 1fc668fa..8b140c2f 100644 --- a/BladebitCUDA.cmake +++ b/BladebitCUDA.cmake @@ -22,6 +22,9 @@ add_executable(bladebit_cuda cuda/CudaPlotUtil.cu cuda/GpuStreams.h cuda/GpuStreams.cu + cuda/GpuDownloadStream.cu + cuda/GpuQueue.h + cuda/GpuQueue.cu # Harvester cuda/harvesting/CudaThresher.cu @@ -42,7 +45,7 @@ target_compile_options(bladebit_cuda PRIVATE > $<${is_cuda_debug}: - -G + # -G > ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56595d7c..8f72155c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CONFIGURATION_TYPES Release Debug) @@ -9,7 +10,7 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Possible values are: Release, Debug" FORCE - ) + ) endif() # Allows for CMAKE_MSVC_RUNTIME_LIBRARY @@ -17,7 +18,7 @@ if(POLICY CMP0091) cmake_policy(SET CMP0091 NEW) endif() -set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.") +set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.") set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" CACHE STRING "MSVC Runtime Library") project(bladebit LANGUAGES C CXX ASM) @@ -83,10 +84,10 @@ endif() # NOTE: These are mostly sandbox test environment, not proper tests option(BB_ENABLE_TESTS "Enable tests." OFF) option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF) -option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON) +option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF) option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF) option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF) - +option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." 
OFF) # # Dependencies @@ -103,7 +104,7 @@ if(NOT ${BB_HARVESTER_ONLY}) GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git GIT_TAG 2.0.2 EXCLUDE_FROM_ALL ${BB_IS_DEPENDENCY} - ) +) set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0") set(BUILD_BLS_TESTS "0" CACHE STRING "") @@ -130,6 +131,7 @@ set(is_x86 $,$,$>) set(is_msvc_c_cpp $>) + if(CUDAToolkit_FOUND AND NOT ${NO_CUDA_HARVESTER}) set(have_cuda $) else() @@ -143,7 +145,7 @@ endif() include(Config.cmake) if(NOT ${BB_HARVESTER_ONLY}) - if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION)) + if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION)) include(cmake_modules/EmbedVersion.cmake) endif() diff --git a/Config.cmake b/Config.cmake index 4139b4a9..f3481d6b 100644 --- a/Config.cmake +++ b/Config.cmake @@ -1,6 +1,11 @@ # Base interface configuration project add_library(bladebit_config INTERFACE) +target_include_directories(bladebit_config INTERFACE + ${INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/src +) + target_compile_definitions(bladebit_config INTERFACE $<${is_release}: _NDEBUG=1 @@ -22,32 +27,34 @@ target_compile_definitions(bladebit_config INTERFACE target_compile_options(bladebit_config INTERFACE - # GCC or Clang - $<$: - -Wall - -Wno-comment - -Wno-unknown-pragmas - -g - - $<${is_release}: - -O3 + $<${is_c_cpp}: + # GCC or Clang + $<$: + -Wall + -Wno-comment + -Wno-unknown-pragmas + -g + + $<${is_release}: + -O3 + > + + $<${is_debug}: + -O0 + > > - $<${is_debug}: - -O0 + # GCC + $<$: + -fmax-errors=5 > - > - - # GCC - $<$: - -fmax-errors=5 - > - # Clang - $<$: - -ferror-limit=5 - -fdeclspec - -Wno-empty-body + # Clang + $<$: + -ferror-limit=5 + -fdeclspec + -Wno-empty-body + > > # MSVC @@ -129,43 +136,36 @@ cmake_policy(SET CMP0105 NEW) set(cuda_archs $<${is_cuda_release}: -## Maxwell - ## Tesla/Quadro M series - -gencode=arch=compute_50,code=sm_50 - ## Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X - -gencode=arch=compute_52,code=sm_52 - ## Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano - -gencode=arch=compute_53,code=sm_53 -## Pascal - ## GeForce 1000 series - -gencode=arch=compute_60,code=sm_60 - ## GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080 - -gencode=arch=compute_61,code=sm_61 - ## Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX - -gencode=arch=compute_62,code=sm_62 -## Volta - ## GV100, Tesla V100, Titan V - -gencode=arch=compute_70,code=sm_70 - ## Tesla V100 - -gencode=arch=compute_72,code=sm_72 - ## Turing - -gencode=arch=compute_75,code=sm_75 -## Ampere - ## NVIDIA A100, DGX-A100 - -gencode=arch=compute_80,code=sm_80 - ## GeForce RTX 3000 series, NVIDIA A100 - -gencode=arch=compute_86,code=sm_86 - ## Jetson Orin - -gencode=arch=compute_87,code=sm_87 -## Lovelace - ## NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40 - -gencode=arch=compute_89,code=sm_89 - ## Future proofing - -gencode=arch=compute_89,code=compute_89 -## Hopper - ## NVIDIA H100 (GH100) - # -gencode=arch=compute_90,code=sm_90 - # -gencode=arch=compute_90a,code=sm_90a + $<$: + -arch=native + > + + $<$>: + + # Maxwell + -gencode=arch=compute_50,code=sm_50 # Tesla/Quadro M series + -gencode=arch=compute_52,code=sm_52 # Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X + -gencode=arch=compute_53,code=sm_53 # Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano + + # Pascal + -gencode=arch=compute_60,code=sm_60 # GeForce 1000 series + -gencode=arch=compute_61,code=sm_61 # GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080 + -gencode=arch=compute_62,code=sm_62 # Drive Xavier, Jetson 
AGX Xavier, Jetson Xavier NX + + # Volta + -gencode=arch=compute_70,code=sm_70 # GV100, Tesla V100, Titan V + -gencode=arch=compute_72,code=sm_72 # Tesla V100 + -gencode=arch=compute_75,code=sm_75 # Turing + + # Ampere + -gencode=arch=compute_80,code=sm_80 # NVIDIA A100, DGX-A100 + -gencode=arch=compute_86,code=sm_86 # GeForce RTX 3000 series, NVIDIA A100 + -gencode=arch=compute_87,code=sm_87 # Jetson Orin + + # Lovelace + -gencode=arch=compute_89,code=sm_89 # NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40 + -gencode=arch=compute_89,code=compute_89 # Future proofing + > > $<${is_cuda_debug}: diff --git a/Harvester.cmake b/Harvester.cmake index d853a2db..692daa80 100644 --- a/Harvester.cmake +++ b/Harvester.cmake @@ -1,5 +1,5 @@ if(NOT ${BB_HARVESTER_STATIC}) - add_library(bladebit_harvester SHARED) + add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp) else() add_library(bladebit_harvester STATIC) endif() @@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE cuda/CudaF1.cu cuda/CudaMatch.cu cuda/CudaPlotUtil.cu + cuda/GpuQueue.cu - # TODO: Remove this, ought not be needed in harvester + # TODO: Does this have to be here? cuda/GpuStreams.cu + cuda/GpuDownloadStream.cu + src/plotting/DiskBuffer.cpp + src/plotting/DiskBucketBuffer.cpp + src/plotting/DiskBufferBase.cpp + src/plotting/DiskQueue.cpp > $<$: @@ -159,7 +165,7 @@ if(CUDAToolkit_FOUND) CUDA_RUNTIME_LIBRARY Static CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON - # CUDA_ARCHITECTURES OFF + CUDA_ARCHITECTURES OFF ) endif() diff --git a/README.md b/README.md index 9197014e..24d50f30 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,71 @@ -# BladeBit Chia Plotter +# Bladebit Chia Plotter [![Release Builds](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml/badge.svg?branch=master&event=push)](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml) -A high-performance **k32-only**, Chia (XCH) plotter supporting in-RAM and disk-based plotting. +A high-performance **k32-only**, Chia (XCH) plotter. + +Bladebit supports 3 plotting modes: + - Fully In-RAM (no drives required), CPU-based mode. + - GPU (CUDA-based) mode. Both fully in-RAM or disk-hybrid mode. + - Disk-based mode + +## Usage +Run `bladebit --help` to see general help. For command-specific help, use `bladebit help `. + +## Requirements + +**CUDA** + +An NVIDIA GPU is required for this mode. This mode is exposed via the `cudaplot` command in a separate executable "bladebit_cuda". This mode has mainly been tested on consumer cards from the **10xx** series and up. + +| Mode | OS | DRAM | VRAM | CUDA capability +|--------------------------------|----------------|------|------|---------------- +| In-RAM | Linux, Windows | 256G | 8G | 5.2 and up +| Disk-hybrid 128G | Linux, Windows | 128G | 8G | 5.2 and up +| Disk-hybrid 16G (WIP) | Linux | 16G | 8G | 5.2 and up + +> *NOTE: 16G mode currently a work in progress and at this stage it only works in Linux and direct I/O is unavailable in this mode.* + + +**CPU RAM-Only** + +Available on Linux, Windows and macOS. +Requires at least **416G** of system DRAM. + + +**Disk** + +Available on Linux, Windows and macOS. + +A minimum of **4 GiB of RAM** is required, with lower bucket counts requiring up to 12 GiB of RAM. Roughly **480 GiB of disk space** is required in the default mode, or around **390 GiB of disk space** with `--alternate` mode enabled. 
+ +The exact amounts of RAM and disk space required may vary slightly depending on the system's page size and the target disk file system block size (block-alignment is required for direct I/O). + +SSDs are highly recommended for disk-based plotting. + + +## Compressed Plots + +Compressed plots are supported in CUDA mode and in RAM-only mode. CPU Disk-based mode does **NOT** currently support compressed plots. + +Compressed plots are currently supported for compression levels from **C1** to **C7**. Note that bladebit compression levels are not compatible with other plotter compression levels. These compression levels are based on the *number of bits dropped from an entry excluding the minimum bits required to fully drop a table*. At `k=32` a the first table is fully excluded from the plot at 16 bits dropped. + +> *NOTE: Although higher compression levels are available, support for farming them has not been currently implemented and are therefore disabled. They will be implemented in the future.* + +Compression levels are currently roughly equivalent to the following plot sizes. + +| Compression Level | Plot Size +|-------------------|------------- +| C1 | 87.5 GiB +| C2 | 86.0 GiB +| C3 | 84.4 GiB +| C4 | 82.8 GiB +| C5 | 81.2 GiB +| C6 | 79.6 GiB +| C7 | 78.0 GiB + +These might be optimized in the future with further compression optimizations. + ## Requirements @@ -39,7 +102,7 @@ SSDs are highly recommended for disk-based plotting. ## Prerequisites -Linux, Windows and MacOS (both intel and ARM (Apple Silicon)) are supported. +Linux, Windows and macOS (both Intel and ARM) are supported. ### Linux @@ -83,8 +146,12 @@ cmake --build . --target bladebit --config Release The resulting binary will be found under the `build/` directory. On Windows it will be under `build/Release/`. +For **bladebit_cuda**, the CUDA toolkit must be installed. The target name is `bladebit_cuda`. + +For simplicity the `build.sh` or `build-cuda.sh` scripts can be used to build. On Windows this requires gitbash or similar bash-based shell to run. + ## Usage -Run **bladebit** with the `-h` for complete usage and command line options: +Run **bladebit** (or **bladebit_cuda**) with the `-h` for complete usage and command line options: ```bash # Linux & macOS @@ -93,18 +160,33 @@ build/bladebit -h # Windows build/Release/bladebit.exe -h ``` +The bladebit CLI uses the format `bladebit `. - -The bladebit CLI uses the format `bladebit `. - -Use the aforementioned `-h` parameter to get the full list of sub-commands and `GLOBAL_OPTIONS`. -The `sub_command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub command with the desired command as the parameter: +Use the aforementioned `-h` parameter to get the full list of commands and `GLOBAL_OPTIONS`. 
+The `command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub command with the desired command as the parameter: ```bash +bladebit help cudaplot bladebit help ramplot bladebit help diskplot ``` +### CUDA +Basic `cudaplot` usage: +```bash +# OG plots +./bladebit_cuda -f -p cudaplot + +# Portable plots +./bladebit_cuda -f -c cudaplot + +# Compressed plots +./bladebit_cuda -z -f -c cudaplot + +# 128G disk-hybrid mode +./bladebit_cuda -z -f -c cudaplot --disk-128 -t1 +``` + ### In-RAM Basic `ramplot` usage: ```bash @@ -113,6 +195,9 @@ Basic `ramplot` usage: # Portable plots ./bladebit -f -c ramplot + +# Compressed plots +./bladebit -z -f -c ramplot ``` ### Disk-Based diff --git a/Tests.cmake b/Tests.cmake index 577e541c..aaba51df 100644 --- a/Tests.cmake +++ b/Tests.cmake @@ -1,10 +1,15 @@ include(cmake_modules/FindCatch2.cmake) -add_executable(tests ${src_bladebit}) +add_executable(tests ${src_bladebit} + cuda/harvesting/CudaThresherDummy.cpp + tests/TestUtil.h + tests/TestDiskQueue.cpp +) + target_compile_definitions(tests PRIVATE BB_TEST_MODE=1 ) -target_link_libraries(tests PRIVATE bladebit_config Catch2::Catch2WithMain) +target_link_libraries(tests PRIVATE bladebit_config bladebit_core Catch2::Catch2WithMain) set_target_properties(tests PROPERTIES EXCLUDE_FROM_ALL ON diff --git a/VERSION b/VERSION index 4a36342f..0c6173b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1,2 @@ -3.0.0 +3.1.0 + diff --git a/build-cuda.sh b/build-cuda.sh new file mode 100755 index 00000000..d7a10154 --- /dev/null +++ b/build-cuda.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -e +_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +cd $_dir + +build_dir=build-release +mkdir -p ${build_dir} +cd ${build_dir} + +cmake .. -DCMAKE_BUILD_TYPE=Release +cmake --build . 
--target bladebit_cuda --config Release --clean-first -j24 diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake index 6ec042c0..1c346632 100644 --- a/cmake_modules/EmbedVersion.cmake +++ b/cmake_modules/EmbedVersion.cmake @@ -2,18 +2,25 @@ if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded})) message("Embedding local build version") - set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.") - - set(cmd_ver bash) + set(cmd_shell bash) + set(cmd_ext sh) if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") - set(cmd_ver bash.exe) + + find_program(bash_path NAMES bash.exe NO_CACHE) + + if(${bash_path} MATCHES "-NOTFOUND") + set(cmd_shell powershell) + set(cmd_ext ps1) + else() + set(cmd_shell "${bash_path}") + endif() endif() - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) - execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY) # Remove trailing whitespace incurred in windows gitbash string(STRIP "${bb_ver_maj}" bb_ver_maj) @@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI}) add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}") add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}") endif() + +set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.") \ No newline at end of file diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h index 80721e9f..a9afd81f 100644 --- a/cuda/CudaPlotConfig.h +++ b/cuda/CudaPlotConfig.h @@ -19,7 +19,7 @@ #define BBCU_TABLE_ENTRY_COUNT (1ull<<32) #define BBCU_BUCKET_ENTRY_COUNT (BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT) //#define BBCU_XTRA_ENTRIES_PER_SLICE (1024u*64u) -#define BBCU_XTRA_ENTRIES_PER_SLICE (4096u*1u) +#define BBCU_XTRA_ENTRIES_PER_SLICE (4096+1024) #define BBCU_MAX_SLICE_ENTRY_COUNT 
((BBCU_BUCKET_ENTRY_COUNT/BBCU_BUCKET_COUNT)+BBCU_XTRA_ENTRIES_PER_SLICE) #define BBCU_BUCKET_ALLOC_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_BUCKET_COUNT) #define BBCU_TABLE_ALLOC_ENTRY_COUNT (((uint64)BBCU_BUCKET_ALLOC_ENTRY_COUNT)*BBCU_BUCKET_COUNT) @@ -42,12 +42,12 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI #ifdef _WIN32 #define DBG_BBCU_DBG_DIR "D:/dbg/cuda/" #else - // #define DBG_BBCU_DBG_DIR "/home/harold/plot/dbg/cuda/" - #define DBG_BBCU_DBG_DIR "/home/harito/plot/dbg/cuda/" + #define DBG_BBCU_DBG_DIR "/home/harold/plotdisk/dbg/cuda/" + // #define DBG_BBCU_DBG_DIR "/home/harito/plots/dbg/cuda/" #endif - // #define DBG_BBCU_REF_DIR "/home/harold/plot/ref/" + // #define DBG_BBCU_REF_DIR "/home/harold/plots/ref/" + - // #define BBCU_DBG_SKIP_PHASE_1 1 // Skip phase 1 and load pairs from disk // #define BBCU_DBG_SKIP_PHASE_2 1 // Skip phase 1 and 2 and load pairs and marks from disk @@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI // #define DBG_BBCU_P2_WRITE_MARKS 1 // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1 + // #define DBG_BBCU_KEEP_TEMP_FILES 1 #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) ) diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h index f4e8d909..fc5884b3 100644 --- a/cuda/CudaPlotContext.h +++ b/cuda/CudaPlotContext.h @@ -7,11 +7,16 @@ #include "plotting/PlotTypes.h" #include "plotting/PlotWriter.h" #include "GpuStreams.h" +#include "GpuQueue.h" #include "util/StackAllocator.h" #include "fse/fse.h" #include "threading/Fence.h" #include "plotting/GlobalPlotConfig.h" #include "threading/ThreadPool.h" +#include "plotting/BufferChain.h" +#include "plotting/DiskBuffer.h" +#include "plotting/DiskBucketBuffer.h" +#include #include "cub/device/device_radix_sort.cuh" // #include @@ -29,7 +34,51 @@ using namespace cooperative_groups; #endif +struct CudaK32ParkContext +{ + Span table7Memory; // Memory buffer reserved for finalizing table7 and writing C parks + BufferChain* parkBufferChain; + uint32 maxParkBuffers; // Maximum number of park buffers + uint64* hostRetainedLinePoints; +}; + +struct CudaK32HybridMode +{ + // For clarity, these are the file names for the disk buffers + // whose disk space will be shared for temp data in both phase 1 and phase 3. + // The name indicates their usage and in which phase. 
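+    //
+    // Illustrative note (inferred from the aliases declared below, not additional behavior):
+    // each file is written as temp data in phase 1 and the same on-disk space is reused in
+    // phase 3, e.g. "p1y-p3index.tmp" holds the phase 1 Y buckets and later backs the
+    // phase 3 index buffer, so phase 3 requires no extra temp disk space beyond phase 1's.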
+ static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME = "p1y-p3index.tmp"; + static constexpr std::string_view META_DISK_BUFFER_FILE_NAME = "p1meta-p3rmap.tmp"; + static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp"; + + static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME = META_DISK_BUFFER_FILE_NAME; + static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME = Y_DISK_BUFFER_FILE_NAME; + static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME; + + DiskQueue* temp1Queue; // Tables Queue + DiskQueue* temp2Queue; // Metadata Queue (could be the same as temp1Queue) + DiskBucketBuffer* metaBuffer; // Enabled in < 128G mode + DiskBucketBuffer* yBuffer; // Enabled in < 128G mode + DiskBucketBuffer* unsortedL; // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries) + DiskBucketBuffer* unsortedR; // Unsorted R pairs in < 128G mode + + DiskBuffer* tablesL[7]; + DiskBuffer* tablesR[7]; + + GpuDownloadBuffer _tablesL[7]; + GpuDownloadBuffer _tablesR[7]; + + struct + { + // #NOTE: These buffers shared the same file-backed storage as + // with other buffers in phase 1. + DiskBucketBuffer* rMapBuffer; // Step 1 + DiskBucketBuffer* indexBuffer; // X-step/Step 2 + DiskBucketBuffer* lpAndLMapBuffer; // X-step/Step 2 (LP) | Step 3 (LMap) + + } phase3; +}; struct CudaK32Phase2 { @@ -64,11 +113,12 @@ struct CudaK32Phase3 }; uint64 pairsLoadOffset; - + + // Device buffers uint32* devBucketCounts; uint32* devPrunedEntryCount; - + // Host buffers union { RMap* hostRMap; uint32* hostIndices; @@ -79,12 +129,6 @@ struct CudaK32Phase3 uint64* hostLinePoints; }; - // #TODO: Remove this when we sort-out all of the buffer usage - // uint64* hostMarkingTables[6]; // Set by Phase 2 - - - // uint32* hostBucketCounts; - uint32 prunedBucketCounts[7][BBCU_BUCKET_COUNT]; uint64 prunedTableEntryCounts[7]; @@ -111,9 +155,10 @@ struct CudaK32Phase3 // Step 2 struct { GpuUploadBuffer rMapIn; // RMap from step 1 - GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or during L table 1, it is inlined x values + GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. 
Or, when L table is the first stored table, it is inlined x values GpuDownloadBuffer lpOut; // Output line points (uint64) GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value) + GpuDownloadBuffer parksOut; // Output P7 parks on the last table uint32* devLTable[2]; // Unpacked L table bucket uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT]; @@ -123,7 +168,7 @@ struct CudaK32Phase3 struct { GpuUploadBuffer lpIn; // Line points from step 2 GpuUploadBuffer indexIn; // Indices from step 2 - GpuDownloadBuffer mapOut; // lTable for next step 1 + GpuDownloadBuffer mapOut; // lTable for next step 2 GpuDownloadBuffer parksOut; // Downloads park buffers to host uint32* hostParkOverrunCount; @@ -137,7 +182,6 @@ struct CudaK32Phase3 FSE_CTable* devCTable; uint32* devParkOverrunCount; - Fence* parkFence; std::atomic parkBucket; uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT]; @@ -178,8 +222,9 @@ struct CudaK32PlotContext int32 cudaDevice = -1; cudaDeviceProp* cudaDevProps = nullptr; bool downloadDirect = false; + TableId firstStoredTable = TableId::Table2; // First non-dropped table that has back pointers ThreadPool* threadPool = nullptr; - + TableId table = TableId::Table1; // Current table being generated uint32 bucket = 0; // Current bucket being processed @@ -192,6 +237,7 @@ struct CudaK32PlotContext PlotRequest plotRequest; PlotWriter* plotWriter = nullptr; Fence* plotFence = nullptr; + Fence* parkFence = nullptr; // Root allocations size_t allocAlignment = 0; @@ -263,8 +309,6 @@ struct CudaK32PlotContext uint32* hostBucketSlices = nullptr; uint32* hostTableL = nullptr; uint16* hostTableR = nullptr; - uint32* hostTableSortedL = nullptr; - uint16* hostTableSortedR = nullptr; union { uint32* hostMatchCount = nullptr; @@ -279,6 +323,14 @@ struct CudaK32PlotContext CudaK32Phase2* phase2 = nullptr; CudaK32Phase3* phase3 = nullptr; + CudaK32HybridMode* diskContext = nullptr; + CudaK32ParkContext* parkContext = nullptr; + bool useParkContext = false; + + // Used when '--check' is enabled + struct GreenReaperContext* grCheckContext = nullptr; + class PlotChecker* plotChecker = nullptr; + struct { Duration uploadTime = Duration::zero(); // Host-to-device wait time @@ -359,7 +411,7 @@ inline uint32 CudaK32PlotGetOutputIndex( CudaK32PlotContext& cx ) } //----------------------------------------------------------- -inline bool CudaK32PlotIsOutputInterleaved( CudaK32PlotContext& cx ) +inline bool CudaK32PlotIsOutputVertical( CudaK32PlotContext& cx ) { return CudaK32PlotGetOutputIndex( cx ) == 0; } diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu index 93099d86..8d2d5094 100644 --- a/cuda/CudaPlotPhase2.cu +++ b/cuda/CudaPlotPhase2.cu @@ -20,8 +20,7 @@ static void CudaK32PlotAllocateBuffersTest( CudaK32PlotContext& cx ); #define MARK_TABLE_BLOCK_THREADS 128 -#define P2_BUCKET_COUNT BBCU_BUCKET_COUNT -#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull< -__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, byte* marks, const uint64* rTableMarks, const uint32 rOffset ) +__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, + byte* marks, const uint64* rTableMarks, const uint32 rOffset ) { const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x; @@ -39,11 +39,11 @@ __global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, c return; if constexpr ( useRMarks ) - { + { if( !CuBitFieldGet( 
rTableMarks, rOffset + gid ) ) return; } - + const uint32 l = lPairs[gid]; const uint32 r = l + rPairs[gid]; @@ -117,12 +117,12 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield, ASSERT( (uint64)blockCount * blockThreadCount * 64 == tableEntryCount ); -#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES + #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES #define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) ); -#else + #else #define G_PRUNED_COUNTS -#endif + #endif ASSERT_DOES_NOT_OVERLAP2( bitfield, bytefield, GetMarkingTableBitFieldSize(), GetMarkingTableByteSize() ); @@ -131,8 +131,11 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield, void LoadPairs( CudaK32PlotContext& cx, CudaK32Phase2& p2, const TableId rTable, const uint32 bucket ) { + if( bucket >= BBCU_BUCKET_COUNT ) + return; + const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable]; - const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;//(uint32)std::min( (uint64)BBCU_BUCKET_ENTRY_COUNT, tableEntryCount - p2.pairsLoadOffset );// cx.bucketCounts[(int)rTable][bucket]; + const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; // uint32* hostPairsL = cx.hostTableSortedL + p2.pairsLoadOffset; // uint16* hostPairsR = cx.hostTableSortedR + p2.pairsLoadOffset; @@ -163,42 +166,48 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) byte* devLMarks = p2.devMarkingTable; + if( cx.cfg.hybrid128Mode ) + { + cx.diskContext->tablesL[(int)rTable]->Swap(); + cx.diskContext->tablesR[(int)rTable]->Swap(); + + p2.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] ); + p2.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] ); + } + // Zero-out marks CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) ); // Load first bucket's worth of pairs LoadPairs( cx, p2, rTable, 0 ); - uint32 rOffset = 0; - for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ ) - { - const bool isLastBucket = bucket + 1 == P2_BUCKET_COUNT; + // Mark the table, buckey by bucket + uint32 rTableGlobalIndexOffset = 0; - // Load next set of pairs in the background - if( !isLastBucket ) - LoadPairs( cx, p2, rTable, bucket + 1 ); + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) + { + // Load next set of pairs in the background (if there is another bucket) + LoadPairs( cx, p2, rTable, bucket + 1 ); const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable]; - const uint32 entryCount = isLastBucket ? 
tableEntryCount - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)): BBCU_BUCKET_ENTRY_COUNT; - // const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; + const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; // Wait for pairs to be ready const uint32* devLPairs = p2.pairsLIn.GetUploadedDeviceBufferT( cx.computeStream ); const uint16* devRPairs = p2.pairsRIn.GetUploadedDeviceBufferT( cx.computeStream ); - // Mark const uint32 blockCount = (uint32)CDiv( entryCount, MARK_TABLE_BLOCK_THREADS ); if( rTable == TableId::Table7 ) CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, nullptr, 0 ); else - CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rOffset ); - + CudaMarkTables<<>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rTableGlobalIndexOffset ); + p2.pairsLIn.ReleaseDeviceBuffer( cx.computeStream ); p2.pairsRIn.ReleaseDeviceBuffer( cx.computeStream ); - rOffset += entryCount; + rTableGlobalIndexOffset += entryCount; } // Convert the bytefield marking table to a bitfield @@ -209,14 +218,14 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) // Download bitfield marks // uint64* hostBitField = p2.hostBitFieldAllocator->AllocT( GetMarkingTableBitFieldSize() ); uint64* hostBitField = cx.hostMarkingTables[(int)lTable]; - + // #TODO: Do download and copy again, for now just store all of them in this pinned buffer // cx.phase3->hostMarkingTables[(int)lTable] = hostBitField; p2.outMarks.Download( hostBitField, GetMarkingTableBitFieldSize(), cx.computeStream ); - + // p2.outMarks.DownloadAndCopy( hostBitField, cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize(), cx.computeStream ); // p2.outMarks.Download( cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize() ); - + #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES { @@ -370,6 +379,9 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx ) MarkTable( cx, p2 ); p2.outMarks.WaitForCompletion(); p2.outMarks.Reset(); + p2.pairsLIn.Reset(); + p2.pairsRIn.Reset(); + const auto elapsed = TimerEnd( timer ); Log::Line( "Marked Table %u in %.2lf seconds.", rTable, elapsed ); @@ -380,7 +392,7 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx ) } // Wait for everything to complete - + // p2.outMarks.WaitForCopyCompletion(); // #TODO: Re-activate this when re-enabling copy p2.outMarks.WaitForCompletion(); p2.outMarks.Reset(); @@ -392,30 +404,39 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx ) /// void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { - const size_t alignment = cx.allocAlignment; + GpuStreamDescriptor desc{}; + + desc.entriesPerSlice = P2_ENTRIES_PER_BUCKET; + desc.sliceCount = 1; + desc.sliceAlignment = cx.allocAlignment; + desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; + desc.deviceAllocator = acx.devAllocator; + desc.pinnedAllocator = nullptr; // Start in direct mode (no intermediate pinined buffers) + + if( cx.cfg.hybrid128Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize(); + } - IAllocator& devAllocator = *acx.devAllocator; - IAllocator& pinnedAllocator = *acx.pinnedAllocator; + if( !cx.downloadDirect ) + desc.pinnedAllocator = acx.pinnedAllocator; CudaK32Phase2& p2 = *cx.phase2; const size_t markingTableByteSize = GetMarkingTableByteSize(); const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize(); - p2.devPrunedCount = devAllocator.CAlloc( 1, alignment ); - p2.devMarkingTable = 
devAllocator.AllocT( markingTableByteSize, alignment ); - - p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( uint32 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun ); + // Device buffers + p2.devPrunedCount = acx.devAllocator->CAlloc( 1, acx.alignment ); + p2.devMarkingTable = acx.devAllocator->AllocT( markingTableByteSize, acx.alignment ); - p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( uint16 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun ); + // Upload/Download streams + p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); + p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - p2.outMarks = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( - markingTableBitFieldSize, devAllocator, alignment, acx.dryRun ); - - // These buffers are safe to use at this point - // p2.hostBitFieldAllocator = new StackAllocator( cx.hostTableR, sizeof( uint32 ) * BBCU_TABLE_ALLOC_ENTRY_COUNT ); + desc.entriesPerSlice = markingTableBitFieldSize; + p2.outMarks = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); } @@ -550,7 +571,7 @@ void DbgValidateTable( CudaK32PlotContext& cx ) { { uint64 totalCount = 0; - for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ ) + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) totalCount += cx.bucketCounts[(int)rt][bucket]; ASSERT( totalCount == cx.tableEntryCounts[(int)rt] ); @@ -562,7 +583,7 @@ void DbgValidateTable( CudaK32PlotContext& cx ) Pairs hostRTablePairs = cx.hostBackPointers[(int)rt]; - for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ ) + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { const uint32 rTableBucketEntryCount = cx.bucketCounts[(int)rt][bucket]; @@ -638,9 +659,13 @@ void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table ) { char path[512]; + std::string baseUrl = DBG_BBCU_DBG_DIR; + if( cx.cfg.hybrid128Mode ) + baseUrl += "disk/"; + Log::Line( "[DEBUG] Writing marking table %u to disk...", table+1 ); { - sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 ); + sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 ); const uint64* marks = cx.hostMarkingTables[(int)table]; diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu index b19d42c3..8fcdfe2a 100644 --- a/cuda/CudaPlotPhase3.cu +++ b/cuda/CudaPlotPhase3.cu @@ -53,7 +53,7 @@ __global__ void CudaConvertInlinedXsToLinePoints( { const Pair p = inXs[gid]; CUDA_ASSERT( p.left || p.right ); - + lp = CudaSquareToLinePoint64( p.left, p.right ); bucket = (uint32)(lp >> bucketShift); offset = atomicAdd( &sharedBuckets[bucket], 1 ); @@ -79,7 +79,6 @@ __global__ void CudaConvertInlinedXsToLinePoints( outIndices[dst] = rIndex; } - //----------------------------------------------------------- __global__ void CudaTestPrune( const uint64 entryCount, const uint32 rOffset, const uint64* rTableMarks, uint32* gPrunedEntryCount ) @@ -236,6 +235,14 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) } #endif + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + } + + const uint32 compressionLevel = cx.gCfg->compressionLevel; // Special case with the starting table, since it has the values inlined already @@ -259,11 +266,11 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) elapsed = TimerEnd( timer ); Log::Line( " Step 2 completed step in %.2lf 
seconds.", elapsed ); - const uint64 baseEntryCount = cx.tableEntryCounts[(int)cx.table]; const uint64 prunedEntryCount = cx.phase3->prunedTableEntryCounts[(int)cx.table]; Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).", cx.table, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 ); + } // else if( compressionLevel > 0 ) // { @@ -286,7 +293,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx ) Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 ); cx.table = rTable; - + #if BBCU_DBG_SKIP_PHASE_2 if( rTable < TableId::Table7 ) DbgLoadTablePairs( cx, rTable+1, false ); @@ -340,26 +347,22 @@ void Step1( CudaK32PlotContext& cx ) auto& p3 = *cx.phase3; auto& s1 = p3.step1; - const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT; + if( bucket == 0 && cx.cfg.hybrid128Mode ) + { + cx.diskContext->tablesL[(int)rTable]->Swap(); + cx.diskContext->tablesR[(int)rTable]->Swap(); + + s1.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] ); + s1.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] ); + } + + const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; //BBCU_BUCKET_ENTRY_COUNT; - // uint32* hostPairsL = cx.hostTableSortedL + p3.pairsLoadOffset; - // uint16* hostPairsR = cx.hostTableSortedR + p3.pairsLoadOffset; uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p3.pairsLoadOffset; uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p3.pairsLoadOffset; - // if( rTable < TableId::Table7 ) - // { - // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable + 1].left + p3.pairsLoadOffset; - // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable + 1].right + p3.pairsLoadOffset; - - // s1.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount ); - // s1.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount ); - // } - // else - { - s1.pairsLIn.UploadT( hostPairsL, entryCount ); - s1.pairsRIn.UploadT( hostPairsR, entryCount ); - } + s1.pairsLIn.UploadT( hostPairsL, entryCount ); + s1.pairsRIn.UploadT( hostPairsR, entryCount ); p3.pairsLoadOffset += entryCount; }; @@ -384,7 +387,6 @@ void Step1( CudaK32PlotContext& cx ) p3.pairsLoadOffset = 0; LoadBucket( cx, 0 ); - /// /// Process buckets /// @@ -403,9 +405,9 @@ void Step1( CudaK32PlotContext& cx ) const uint32* devLPairs = (uint32*)s1.pairsLIn.GetUploadedDeviceBuffer( cx.computeStream ); const uint16* devRPairs = (uint16*)s1.pairsRIn.GetUploadedDeviceBuffer( cx.computeStream ); - const uint32 entryCount = bucket == BBCU_BUCKET_COUNT-1 ? - ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket - BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth. + const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];// bucket == BBCU_BUCKET_COUNT-1 ? + // ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket + // BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth. 
auto* devRMap = (RMap*)s1.rMapOut.LockDeviceBuffer( cx.computeStream ); @@ -430,7 +432,7 @@ void Step1( CudaK32PlotContext& cx ) s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream ); } - + // Download slice counts cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream(); @@ -464,6 +466,15 @@ void Step1( CudaK32PlotContext& cx ) for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ ) p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i]; } + + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + } + + // #if _DEBUG + // DbgValidateRMap( cx ); + // #endif } //----------------------------------------------------------- @@ -478,17 +489,25 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) auto& p3 = *cx.phase3; auto& tx = p3.xTable; - if( bucket == 0 ) - p3.pairsLoadOffset = 0; - // Load inlined x's const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables; const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket]; + if( bucket == 0 ) + { + p3.pairsLoadOffset = 0; + + if( cx.cfg.hybrid128Mode ) + { + cx.diskContext->tablesL[(int)rTable]->Swap(); + tx.xIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] ); + } + } + const Pair* inlinedXs = ((Pair*)cx.hostBackPointers[(int)rTable].left) + p3.pairsLoadOffset; tx.xIn.UploadT( inlinedXs, entryCount, cx.computeStream ); - + p3.pairsLoadOffset += entryCount; }; @@ -511,8 +530,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) const bool isCompressed = cx.gCfg->compressionLevel > 0; const uint32 compressedLPBits = isCompressed ? GetCompressedLPBitCount( cx.gCfg->compressionLevel ) : 0; - const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1; - const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS; + const uint32 lpBits = isCompressed ? 
compressedLPBits : BBCU_K * 2 - 1; + const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS; uint64 tablePrunedEntryCount = 0; uint32 rTableOffset = 0; @@ -556,7 +575,7 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) rTableOffset += entryCount; } - + cudaStream_t downloadStream = tx.lpOut.GetQueue()->GetStream(); CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, @@ -592,11 +611,17 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i]; } -#if _DEBUG - // DbgValidateIndices( cx ); - // DbgValidateStep2Output( cx ); - // DbgDumpSortedLinePoints( cx ); -#endif + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + +// #if _DEBUG +// DbgValidateIndices( cx ); +// // DbgValidateStep2Output( cx ); +// // DbgDumpSortedLinePoints( cx ); +// #endif } @@ -606,22 +631,47 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) //----------------------------------------------------------- void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + static_assert( sizeof( LMap ) == sizeof( uint64 ) ); + auto& p3 = *cx.phase3; // Shared allocations - p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); - p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment ); + p3.devBucketCounts = acx.devAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); + p3.devPrunedEntryCount = acx.devAllocator->CAlloc( 1, acx.alignment ); // Host allocations - p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index - p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs - - if( !acx.dryRun ) + if( !cx.cfg.hybrid16Mode ) + { + p3.hostRMap = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index + p3.hostLinePoints = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs + } + else if( !cx.diskContext->phase3.rMapBuffer ) { - ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL ); - ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL ); + const size_t RMAP_SLICE_SIZE = sizeof( RMap ) * P3_PRUNED_SLICE_MAX; + const size_t INDEX_SLICE_SIZE = sizeof( uint32 ) * P3_PRUNED_SLICE_MAX; + const size_t LP_AND_LMAP_SLICE_SIZE = sizeof( uint64 ) * P3_PRUNED_SLICE_MAX; + + const FileFlags TMP2_QUEUE_FILE_FLAGS = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile; + + cx.diskContext->phase3.rMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_RMAP_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.rMapBuffer, "Failed to create R Map disk buffer." ); + + cx.diskContext->phase3.indexBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_INDEX_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, INDEX_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.indexBuffer, "Failed to create index disk buffer." 
); + + cx.diskContext->phase3.lpAndLMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS ); + FatalIf( !cx.diskContext->phase3.lpAndLMapBuffer, "Failed to create LP/LMap disk buffer." ); } - // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment ); + + #if _DEBUG + if( !acx.dryRun && !cx.cfg.hybrid128Mode ) + { + ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL ); + } + #endif if( acx.dryRun ) { @@ -687,74 +737,156 @@ void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte //----------------------------------------------------------- void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + GpuStreamDescriptor desc{}; + desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = acx.alignment; + desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; + desc.deviceAllocator = acx.devAllocator; + desc.pinnedAllocator = nullptr; + + GpuStreamDescriptor uploadDesc = desc; + if( cx.cfg.hybrid128Mode ) + { + uploadDesc.pinnedAllocator = acx.pinnedAllocator; + + if( cx.cfg.hybrid16Mode ) + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& tx = cx.phase3->xTable; tx.devRMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment ); - tx.xIn = cx.gpuUploadStream[0]->CreateUploadBuffer(sizeof(Pair) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, acx.alignment, acx.dryRun); - tx.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun ); - tx.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun ); + + tx.xIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); + tx.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); + tx.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + tx.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + } } //----------------------------------------------------------- void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + GpuStreamDescriptor desc{}; + desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = acx.alignment; + desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; + desc.deviceAllocator = acx.devAllocator; + desc.pinnedAllocator = nullptr; + + GpuStreamDescriptor uploadDesc = desc; + if( cx.cfg.hybrid128Mode ) + { + uploadDesc.pinnedAllocator = acx.pinnedAllocator; + + if( cx.cfg.hybrid16Mode ) + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& s1 = cx.phase3->step1; const size_t alignment = acx.alignment; - s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - - s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun 
); - - s1.rMapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( - sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); + s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); + s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( uploadDesc, acx.dryRun ); + s1.rMapOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); s1.rTableMarks = (uint64*)acx.devAllocator->AllocT( GetMarkingTableBitFieldSize(), acx.alignment ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer ); + } } //----------------------------------------------------------- void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + GpuStreamDescriptor desc{}; + desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = acx.alignment; + desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; + desc.deviceAllocator = acx.devAllocator; + desc.pinnedAllocator = nullptr; + + GpuStreamDescriptor uploadDesc = desc; + if( cx.cfg.hybrid16Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& s2 = cx.phase3->step2; const size_t alignment = acx.alignment; - s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); + s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + s2.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); + s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT (desc, acx.dryRun ); - s2.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( - sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - s2.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( - sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - + const size_t devParkAllocSize = P3_PARK_7_SIZE * P3_MAX_P7_PARKS_PER_BUCKET; + + GpuStreamDescriptor parksDesc = desc; + parksDesc.sliceCount = 1; + parksDesc.entriesPerSlice = devParkAllocSize; + parksDesc.sliceAlignment = RoundUpToNextBoundaryT( P3_PARK_7_SIZE, sizeof( uint64 ) ); + + s2.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( parksDesc, acx.dryRun ); + s2.devLTable[0] = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); s2.devLTable[1] = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer ); + s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + + s2.lpOut .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + } } //----------------------------------------------------------- void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { + GpuStreamDescriptor desc{}; + desc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = acx.alignment; + desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; + 
desc.deviceAllocator = acx.devAllocator; + desc.pinnedAllocator = nullptr; + + if( cx.cfg.hybrid16Mode ) + { + desc.pinnedAllocator = acx.pinnedAllocator; + } + auto& s3 = cx.phase3->step3; const size_t alignment = acx.alignment; s3.hostParkOverrunCount = acx.pinnedAllocator->CAlloc( 1 ); - const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET; + s3.lpIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); + s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBufferT( desc, acx.dryRun ); - s3.lpIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + s3.mapOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( desc, acx.dryRun ); - s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBuffer( - sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET; - s3.mapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( - sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); + GpuStreamDescriptor parksDesc = desc; + parksDesc.sliceCount = 1; + parksDesc.entriesPerSlice = devParkAllocSize; + parksDesc.sliceAlignment = RoundUpToNextBoundaryT( DEV_MAX_PARK_SIZE, sizeof( uint64 ) ); - s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBuffer(devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun); + s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( parksDesc, acx.dryRun ); if( acx.dryRun ) { @@ -774,11 +906,16 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex s3.devDeltaLinePoints = acx.devAllocator->CAlloc( linePointAllocCount, alignment ); s3.devIndices = acx.devAllocator->CAlloc( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment ); - // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment ); - // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment ); - s3.devCTable = acx.devAllocator->AllocT( P3_MAX_CTABLE_SIZE, alignment ); s3.devParkOverrunCount = acx.devAllocator->CAlloc( 1 ); + + if( !acx.dryRun && cx.cfg.hybrid16Mode ) + { + s3.lpIn .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer ); + + s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer ); + } } @@ -827,6 +964,9 @@ void DbgValidateRMap( CudaK32PlotContext& cx ) RMap* rMap = bbcvirtallocbounded( BBCU_BUCKET_ALLOC_ENTRY_COUNT ); + // blake3_hasher hasher; + // blake3_hasher_init( &hasher ); + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { const RMap* reader = p3.hostRMap + bucket * P3_PRUNED_BUCKET_MAX; @@ -838,7 +978,7 @@ void DbgValidateRMap( CudaK32PlotContext& cx ) { const uint32 copyCount = s1.prunedBucketSlices[slice][bucket]; bbmemcpy_t( writer, reader, copyCount ); - + writer += copyCount; entryCount += copyCount; @@ -858,13 +998,18 @@ void DbgValidateRMap( CudaK32PlotContext& cx ) const uint32 right = map.dstR - bucketOffset; ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT ); ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT ); - CUDA_ASSERT( left < right ); - + ASSERT( left < right ); } + + // Hash bucket + // blake3_hasher_update( &hasher, rMap, sizeof( RMap ) * entryCount ); } + // Print hash + // DbgFinishAndPrintHash( hasher, "r_map", (uint)cx.table + 1 ); + bbvirtfreebounded( rMap ); - Log::Line( "[DEBUG] CPU OK" 
); + Log::Line( " [DEBUG] CPU OK" ); } // Validate in CUDA @@ -899,10 +1044,12 @@ void DbgValidateRMap( CudaK32PlotContext& cx ) p3.step2.rMapIn.ReleaseDeviceBuffer( cx.computeStream ); } - Log::Line( "[DEBUG] CUDA OK" ); + Log::Line( " [DEBUG] CUDA OK" ); p3.step2.lMapIn.Reset(); } + + Log::Line( "[DEBUG] RMap validation OK" ); } //----------------------------------------------------------- @@ -922,23 +1069,45 @@ void DbgValidateIndices( CudaK32PlotContext& cx ) const uint32* reader = p3.hostIndices; const size_t readerStride = P3_PRUNED_SLICE_MAX * 3; - uint64 entryCount = 0; for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { - for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ ) + if( cx.cfg.hybrid16Mode ) + { + const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket]; + + cx.diskContext->phase3.indexBuffer->OverrideReadSlices( bucket, sizeof( uint32 ), sizeSlices, BBCU_BUCKET_COUNT ); + cx.diskContext->phase3.indexBuffer->ReadNextBucket(); + const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs(); + ASSERT( readBucket.Length() == p3.prunedBucketCounts[(int)cx.table][bucket] ); + + bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() ); + + idxWriter += readBucket.Length(); + entryCount += readBucket.Length(); + } + else { - const uint32 copyCount = s2.prunedBucketSlices[bucket][slice]; + for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ ) + { + const uint32 copyCount = s2.prunedBucketSlices[slice][bucket]; - bbmemcpy_t( idxWriter, reader, copyCount ); + bbmemcpy_t( idxWriter, reader, copyCount ); - idxWriter += copyCount; - entryCount += copyCount; - reader += readerStride; + idxWriter += copyCount; + entryCount += copyCount; + reader += readerStride; + } } } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.indexBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] ); RadixSort256::Sort( pool, indices, idxTmp, entryCount ); @@ -949,10 +1118,36 @@ void DbgValidateIndices( CudaK32PlotContext& cx ) ASSERT( indices[i] > indices[i-1] ); } + DbgHashDataT( indices, entryCount, "indices", (uint32)cx.table+1 ); + bbvirtfreebounded( indices ); bbvirtfreebounded( idxTmp ); - Log::Line( "[DEBUG] OK" ); + Log::Line( "[DEBUG] Index validation OK" ); +} + +//----------------------------------------------------------- +void DbgHashData( const void* data, size_t size, const char* name, uint32 index ) +{ + blake3_hasher hasher; + blake3_hasher_init( &hasher ); + blake3_hasher_update( &hasher, data, size ); + + DbgFinishAndPrintHash( hasher, name, index ); +} + +//----------------------------------------------------------- +void DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index ) +{ + constexpr size_t HASH_LEN = 256/8; + byte output[HASH_LEN]; + blake3_hasher_finalize( &hasher, output, HASH_LEN ); + + Log::Write( "[DEBUG] %s_%u hash: 0x", name, index ); + for( uint32 i = 0; i < HASH_LEN; i++ ) + Log::Write( "%02x", output[i] ); + + Log::NewLine(); } #endif diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h index 1a4bd7a8..34909123 100644 --- a/cuda/CudaPlotPhase3Internal.h +++ b/cuda/CudaPlotPhase3Internal.h @@ -10,8 +10,18 @@ #include "plotdisk/jobs/IOJob.h" #include "algorithm/RadixSort.h" #include "plotmem/ParkWriter.h" + #include "b3/blake3.h" void DbgValidateStep2Output( CudaK32PlotContext& cx ); + + void DbgHashData( const void* data, size_t size, const char* name, uint32 index ); + + void 
DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index ); + template + inline void DbgHashDataT( const T* data, uint64 count, const char* name, uint32 index ) + { + DbgHashData( data, (size_t)count * sizeof( T ), name, index ); + } #endif using LMap = CudaK32Phase3::LMap; @@ -27,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) ); #define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT) #define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2) -static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough - -//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1; -//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT; -//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS; - -// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT )); -//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ), - //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE ); -// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT; - -//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT; -// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ), -// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE ); -// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT; +static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough +static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; +static constexpr size_t P3_PARK_7_SIZE = CalculatePark7Size( BBCU_K ); +static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE ); static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 ); static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu index ac13e915..3a7a6449 100644 --- a/cuda/CudaPlotPhase3Step2.cu +++ b/cuda/CudaPlotPhase3Step2.cu @@ -248,7 +248,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) s2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts ); }; - + const TableId rTable = cx.table; const TableId lTable = rTable-1; @@ -309,7 +309,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) const auto* rMap = (RMap*)s2.rMapIn.GetUploadedDeviceBuffer( cx.computeStream ); const uint32 rEntryCount = p3.prunedBucketCounts[(int)rTable][bucket]; - + uint64* devOutLPs = (uint64*)s2.lpOut .LockDeviceBuffer( cx.computeStream ); uint32* devOutIndices = (uint32*)s2.indexOut.LockDeviceBuffer( cx.computeStream ); @@ -317,7 +317,6 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) s2.rMapIn.ReleaseDeviceBuffer( cx.computeStream ); rTableOffset += rEntryCount; - // Horizontal download (write 1 row) s2.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, 
P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream ); s2.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX*3, P3_PRUNED_SLICE_MAX, cx.computeStream ); @@ -354,7 +353,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); - + memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) ); CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); @@ -370,8 +369,15 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX ); } + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.rMapBuffer->Swap(); + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + // #if _DEBUG - // if( cx.table > TableId::Table3 ) + // // if( cx.table > TableId::Table3 ) // { // DbgValidateStep2Output( cx ); // } @@ -402,23 +408,26 @@ void WritePark7( CudaK32PlotContext& cx ) auto& p3 = *cx.phase3; auto& s2 = p3.step2; - + // Load initial bucket LoadBucket( cx, 0 ); // Begin park 7 table in plot cx.plotWriter->BeginTable( PlotTable::Table7 ); - constexpr size_t parkSize = CalculatePark7Size( BBCU_K ); + constexpr size_t parkSize = P3_PARK_7_SIZE; constexpr size_t parkFieldCount = parkSize / sizeof( uint64 ); static_assert( parkFieldCount * sizeof( uint64 ) == parkSize ); + GpuDownloadBuffer& parkDownloader = cx.useParkContext ? s2.parksOut : s2.lpOut; - GpuDownloadBuffer& parkDownloader = s2.lpOut; - - constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; + constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET; static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize ); + if( cx.useParkContext ) + { + cx.parkContext->parkBufferChain->Reset(); + } // Host stuff constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT; @@ -427,9 +436,10 @@ void WritePark7( CudaK32PlotContext& cx ) const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table]; const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark ); - byte* hostParks = hostAllocator.AllocT( totalParkCount * parkSize ); - byte* hostParkWriter = hostParks; - uint32* hostLastParkEntries = hostAllocator.CAlloc( kEntriesPerPark ); + byte* hostParks = cx.useParkContext ? nullptr : hostAllocator.AllocT( totalParkCount * parkSize ); + byte* hostParksWriter = cx.useParkContext ? nullptr : hostParks; + uint32* hostLastParkEntries = cx.useParkContext ? 
(uint32*)cx.parkContext->hostRetainedLinePoints : + hostAllocator.CAlloc( kEntriesPerPark ); static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 ); uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark; @@ -479,14 +489,38 @@ void WritePark7( CudaK32PlotContext& cx ) // Download parks & write to plot const size_t downloadSize = parkCount * parkSize; - parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize, + if( cx.useParkContext ) + { + ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() ); + + // Override the park buffer to be used when using a park context + hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket ); + + // Wait for the next park buffer to be available + parkDownloader.HostCallback([&cx]{ + (void)cx.parkContext->parkBufferChain->GetNextBuffer(); + }); + } + + parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize, []( void* parksBuffer, size_t size, void* userData ) { auto& cx = *reinterpret_cast( userData ); cx.plotWriter->WriteTableData( parksBuffer, size ); + + // Release the buffer after the plot writer is done with it. + if( cx.useParkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } + }, &cx, cx.computeStream ); - hostParkWriter += downloadSize; + hostParksWriter += downloadSize; + if( cx.useParkContext ) + hostParksWriter = nullptr; } // Wait for parks to complete downloading @@ -499,9 +533,19 @@ void WritePark7( CudaK32PlotContext& cx ) // Was there a left-over park? if( retainedEntryCount > 0 ) { + if( cx.useParkContext ) + hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer(); + // Submit last park to plot - TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter ); - cx.plotWriter->WriteTableData( hostParkWriter, parkSize ); + TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter ); + cx.plotWriter->WriteTableData( hostParksWriter, parkSize ); + + if( cx.useParkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } } cx.plotWriter->EndTable(); @@ -534,6 +578,7 @@ void _DbgValidateOutput( CudaK32PlotContext& cx ) auto& s2 = p3.step2; // Validate line points... + Log::Debug( "[DEBUG] Validating line points..." 
); uint64* refLinePoints = bbcvirtallocboundednuma( BBCU_TABLE_ALLOC_ENTRY_COUNT ); uint64* tmpLinePoints = bbcvirtallocboundednuma( BBCU_TABLE_ALLOC_ENTRY_COUNT ); uint32* indices = bbcvirtallocboundednuma( BBCU_TABLE_ALLOC_ENTRY_COUNT ); @@ -614,9 +659,13 @@ void _DbgValidateOutput( CudaK32PlotContext& cx ) } } + DbgHashDataT( refLinePoints, prunedEntryCount, "line_points", (uint32)cx.table+1 ); + bbvirtfreebounded( refLinePoints ); bbvirtfreebounded( tmpLinePoints ); bbvirtfreebounded( indices ); + + Log::Debug( "[DEBUG] Line point validation OK" ); } #endif @@ -659,6 +708,8 @@ void DbgDumpSortedLinePoints( CudaK32PlotContext& cx ) ThreadPool& pool = *cx.threadPool; //DbgGetThreadPool( cx ); RadixSort256::Sort( pool, sortedLinePoints, tmpLinePoints, prunedEntryCount ); + // DbgHashDataT( sortedLinePoints, prunedEntryCount, "sorted_line_points", (uint32)cx.table+1 ); + // Write to disk { char filePath[1024] = {}; diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu index 3949bd8c..c8f9337b 100644 --- a/cuda/CudaPlotPhase3Step3.cu +++ b/cuda/CudaPlotPhase3Step3.cu @@ -52,12 +52,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) // Load CTable const bool isCompressed = cx.gCfg->compressionLevel > 0 && lTable <= (TableId)cx.gCfg->numDroppedTables; - const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.subtSizeBits; + const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.stubSizeBits; const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables; - + + const bool isFirstSerializedTable = firstTable == rTable; + const size_t cTableSize = !isCompressed ? sizeof( CTable_0 ) : cx.gCfg->cTableSize; ASSERT( cTableSize <= P3_MAX_CTABLE_SIZE ); const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable; - + // (upload must be loaded before first bucket, on the same stream) CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice, s3.lpIn.GetQueue()->GetStream() ) ); @@ -75,13 +77,32 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) const size_t hostParkSize = isCompressed ? cx.gCfg->compressionInfo.tableParkSize : CalculateParkSize( lTable ); ASSERT( DEV_MAX_PARK_SIZE >= hostParkSize ); - // #TODO: Move this allocation to the beginning - if( s3.parkFence == nullptr ) - s3.parkFence = new Fence(); - byte* hostParksWriter = (byte*)cx.hostBackPointers[(int)rTable].left; //(byte*)cx.hostTableL; uint64* hostRetainedEntries = nullptr; + if( cx.cfg.hybrid128Mode ) + { + hostParksWriter = (byte*)cx.hostTableL; + + if( !isFirstSerializedTable && !cx.useParkContext ) + { + // Ensure the this buffer is no longer in use (the last table finished writing to disk.) + const bool willWaitForParkFence = cx.parkFence->Value() < BBCU_BUCKET_COUNT; + if( willWaitForParkFence ) + Log::Line( " Waiting for parks buffer to become available." ); + + Duration parkWaitTime; + cx.parkFence->Wait( BBCU_BUCKET_COUNT, parkWaitTime ); + + if( willWaitForParkFence ) + Log::Line( " Waited %.3lf seconds for the park buffer to be released.", TicksToSeconds( parkWaitTime ) ); + } + } + if( cx.useParkContext ) + { + cx.parkContext->parkBufferChain->Reset(); + } + // if( !isCompressed && lTable == TableId::Table1 ) // hostParksWriter = (byte*)cx.hostBackPointers[(int)TableId::Table2].left; @@ -101,7 +122,7 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) // Set initial event LP stream event as set. 
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); - s3.parkFence->Reset( 0 ); + cx.parkFence->Reset( 0 ); s3.parkBucket = 0; for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) @@ -200,7 +221,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) // No more buckets so we have to compress this last park on the CPU CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) ); - hostRetainedEntries = (uint64*)( hostParksWriter + hostParkSize * parkCount ); + hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints : + (uint64*)( hostParksWriter + hostParkSize * parkCount ); CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) ); } } @@ -209,6 +231,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) // Download parks + if( cx.useParkContext ) + { + ASSERT( hostParkSize * parkCount <= cx.parkContext->parkBufferChain->BufferSize() ); + + // Override the park buffer to be used when using a park context + hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket ); + + // Wait for the next park buffer to be available + s3.parksOut.HostCallback([&cx]{ + (void)cx.parkContext->parkBufferChain->GetNextBuffer(); + }); + } + s3.parksOut.Download2DWithCallback( hostParksWriter, hostParkSize, parkCount, hostParkSize, DEV_MAX_PARK_SIZE, []( void* parksBuffer, size_t size, void* userData ) { @@ -216,11 +251,22 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) auto& s3 = cx.phase3->step3; cx.plotWriter->WriteTableData( parksBuffer, size ); - cx.plotWriter->SignalFence( *s3.parkFence, ++s3.parkBucket ); + cx.plotWriter->SignalFence( *cx.parkFence, ++s3.parkBucket ); + + // Release the buffer after the plot writer is done with it. + if( cx.useParkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } }, &cx, lpStream, cx.downloadDirect ); hostParksWriter += hostParkSize * parkCount; + + if( cx.useParkContext ) + hostParksWriter = nullptr; } // Copy park overrun count @@ -242,18 +288,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) // Was there a left-over park? if( retainedLPCount > 0 ) { - ASSERT( hostRetainedEntries ); - + if( cx.useParkContext ) + hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer(); + uint64 lastParkEntries[kEntriesPerPark]; bbmemcpy_t( lastParkEntries, hostRetainedEntries, retainedLPCount ); WritePark( hostParkSize, retainedLPCount, lastParkEntries, hostParksWriter, stubBitSize, hostCTable ); cx.plotWriter->WriteTableData( hostParksWriter, hostParkSize ); + + if( cx.useParkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } } cx.plotWriter->EndTable(); - // Update buckets counts for L table - // #TODO: These should match Step 1 pruned entry count I believe, so just copy? 
memset( p3.prunedBucketCounts[(int)rTable], 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT ); for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ ) @@ -266,12 +318,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) s3.lpIn .Reset(); s3.indexIn.Reset(); + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->phase3.lpAndLMapBuffer->Swap(); + cx.diskContext->phase3.indexBuffer->Swap(); + } + // #if _DEBUG // //if( cx.table >= TableId::Table6 ) // //{ - // DbgValidateLMap( cx ); - // DbgValidateLMapData( cx ); + // // DbgValidateLMap( cx ); + // // DbgValidateLMapData( cx ); + // // DbgSaveLMap( cx ); // //} // #endif @@ -386,7 +445,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx ) char path[512]; sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 ); - + const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT; int err; FatalIf( !IOJob::WriteToFile( path, p3.hostLMap, writeSize, err ), @@ -399,7 +458,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx ) sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 ); FatalIf( !IOJob::WriteToFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ), "[DEBUG] Failed to write LMap buckets with error: %d", err ); - + Log::Line( " [DEBUG] OK" ); } @@ -410,7 +469,7 @@ void DbgLoadLMap( CudaK32PlotContext& cx ) char path[512]; sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 ); - + const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT; int err; FatalIf( !IOJob::ReadFromFile( path, p3.hostLMap, writeSize, err ), @@ -438,10 +497,12 @@ void DbgValidateLMap( CudaK32PlotContext& cx ) auto& p3 = *cx.phase3; auto& s3 = p3.step3; - LMap* lMap = bbcvirtallocbounded( BBCU_TABLE_ENTRY_COUNT ); + LMap* lMap = bbcvirtallocbounded( BBCU_BUCKET_ALLOC_ENTRY_COUNT ); - { + // blake3_hasher hasher; + // blake3_hasher_init( &hasher ); + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX; @@ -471,14 +532,18 @@ void DbgValidateLMap( CudaK32PlotContext& cx ) ASSERT( map.sourceIndex || map.sortedIndex ); ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket ); } + + // Hash bucket + // blake3_hasher_update( &hasher, lMap, sizeof( LMap ) * entryCount ); } - + // Print hash + // DbgFinishAndPrintHash( hasher, "l_map", (uint)cx.table + 1 ); } bbvirtfreebounded( lMap ); - Log::Line( "[DEBUG] OK" ); + Log::Line( "[DEBUG] LMap OK" ); } //----------------------------------------------------------- @@ -566,7 +631,7 @@ void _DbgValidateLMapData( CudaK32PlotContext& cx ) bbvirtfreebounded( dstIndices ); bbvirtfreebounded( tmpIndices ); - Log::Line( "[DEBUG] OK" ); + Log::Line( "[DEBUG] LMap uniqueness OK" ); } #endif diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu index 8e0458dd..80ba8b0e 100644 --- a/cuda/CudaPlotter.cu +++ b/cuda/CudaPlotter.cu @@ -9,6 +9,10 @@ #include "plotting/CTables.h" #include "plotting/TableWriter.h" #include "plotting/PlotTools.h" +#include "util/VirtualAllocator.h" +#include "harvesting/GreenReaper.h" +#include "tools/PlotChecker.h" + // TEST/DEBUG #if _DEBUG @@ -36,6 +40,7 @@ static void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStrea static void AllocBuffers( CudaK32PlotContext& cx ); static void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ); +static void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinnedAllocator, bool dryRun ); template static void UploadBucketToGpu( 
CudaK32PlotContext& context, TableId table, const uint32* hostPtr, T* devPtr, uint64 bucket, uint64 stride ); @@ -53,11 +58,37 @@ GPU-based (CUDA) plotter [OPTIONS]: -h, --help : Shows this help message and exits. -d, --device : Select the CUDA device index. (default=0) + + --disk-128 : Enable hybrid disk plotting for 128G system RAM. + Requires a --temp1 and --temp2 to be set. + + --disk-16 : (experimental) Enable hybrid disk plotting for 16G system RAM. + Requires a --temp1 and --temp2 to be set. + + -t1, --temp1 : Temporary directory 1. Used for longer-lived, sequential writes. + + -t2, --temp2 : Temporary directory 2. Used for temporary, shorted-lived read and writes. + NOTE: If only one of -t1 or -t2 is specified, both will be + set to the same directory. + + --check : Perform a plot check for proofs on the newly created plot. + + --check-threshold : Proof threshold rate below which the plots that don't pass + the check will be deleted. + That is, the number of proofs fetched / proof check count + must be above or equal to this threshold to pass. + (default=0.6). )"; /// /// CLI /// +//----------------------------------------------------------- +void CudaK32PlotterPrintHelp() +{ + Log::Line( USAGE ); +} + //----------------------------------------------------------- void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli ) { @@ -68,18 +99,70 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli ) { if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) ) continue; - if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) ) + if( cli.ReadSwitch( cfg.hybrid128Mode, "--disk-128" ) ) + continue; + if( cli.ReadSwitch( cfg.hybrid16Mode, "--disk-16" ) ) + { + cfg.hybrid128Mode = true; + continue; + } + if( cli.ReadStr( cfg.temp1Path, "-t1", "--temp1" ) ) + { + if( !cfg.temp2Path ) + cfg.temp2Path = cfg.temp1Path; + continue; + } + if( cli.ReadStr( cfg.temp2Path, "-t2", "--temp2" ) ) + { + if( !cfg.temp1Path ) + cfg.temp1Path = cfg.temp2Path; + continue; + } + if( cli.ReadUnswitch( cfg.temp1DirectIO, "--no-t1-direct" ) ) + continue; + if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) ) + continue; + + if( cli.ReadU64( cfg.plotCheckCount, "--check" ) ) + continue; + if( cli.ReadF64( cfg.plotCheckThreshhold, "--check-threshold" ) ) continue; + // if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) ) + // continue; if( cli.ArgMatch( "--help", "-h" ) ) { - Log::Line( USAGE ); + CudaK32PlotterPrintHelp(); exit( 0 ); } else break; // Let the caller handle it } - // The rest should be output directies, parsed by the global config parser. + + + if( cfg.hybrid128Mode && gCfg.compressionLevel <= 0 ) + { + Log::Error( "Error: Cannot plot classic (uncompressed) plots in 128G or 64G mode." ); + Exit( -1 ); + } + + if( cfg.hybrid16Mode ) + { + #if PLATFORM_IS_WINDOWS + Log::Error( "16G mode is currently unsupported on Windows." ); + Exit( -1 ); + #else + Log::Line( "Warning: 16G mode is experimental and still under development." ); + Log::Line( " Please use the --check parameter to validate plots when using this mode." ); + + if( cfg.temp1DirectIO || cfg.temp2DirectIO ) + { + Log::Line( " Direct I/O not supported in 16G mode at the moment. Disabing it." 
); + cfg.temp1DirectIO = cfg.temp2DirectIO = false; + } + + #endif + } } //----------------------------------------------------------- @@ -97,10 +180,25 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext ) auto& cx = *new CudaK32PlotContext{}; outContext = &cx; - cx.cfg = cfg; - cx.gCfg = cfg.gCfg; + cx.cfg = cfg; + cx.gCfg = cfg.gCfg; + + cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables; Log::Line( "[Bladebit CUDA Plotter]" ); + Log::Line( " Host RAM : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB ); + + if( cx.cfg.plotCheckCount == 0 ) + Log::Line( " Plot checks : disabled" ); + else + { + Log::Line( " Plot checks : enabled ( %llu )", (llu)cx.cfg.plotCheckCount ); + Log::Line( " Plot check threshold: %.3lf", cx.cfg.plotCheckThreshhold ); + } + + // Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" ); + Log::NewLine(); + CudaInit( cx ); CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) ); @@ -119,27 +217,89 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext ) } cx.threadPool = new ThreadPool( SysHost::GetLogicalCPUCount() ); + cx.plotFence = new Fence(); + cx.parkFence = new Fence(); - #if __linux__ - cx.downloadDirect = cfg.disableDirectDownloads ? false : true; + #if _WIN32 + // #MAYBE: Add a configurable option to enable direct downloads on windows? + // On windows always default to using intermediate pinned buffers + cx.downloadDirect = false; #else - // #TODO: One windows, check if we have enough memory, if so, default to true. - cx.downloadDirect = true ;//false; + cx.downloadDirect = cfg.disableDirectDownloads ? false : true; #endif // cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO ); // if( cx.gCfg->benchmarkMode ) // cx.plotWriter->EnableDummyMode(); - cx.plotFence = new Fence(); + // Need to do allocations for park serialization differently under the following conditions + if( cx.downloadDirect || cx.cfg.hybrid128Mode ) + { + cx.parkContext = new CudaK32ParkContext{}; - cx.phase2 = new CudaK32Phase2{}; - cx.phase3 = new CudaK32Phase3{}; + if( cx.cfg.hybrid16Mode ) + cx.useParkContext = true; + } + + // Check for hybrid mode + if( cx.cfg.hybrid128Mode ) + { + cx.diskContext = new CudaK32HybridMode{}; + cx.diskContext->temp1Queue = new DiskQueue( cx.cfg.temp1Path ); + + // Re-use the same queue for temp2 if temp1 and temp2 are pointing to the same path + auto t1Path = std::filesystem::canonical( cx.cfg.temp1Path ); + auto t2Path = std::filesystem::canonical( cx.cfg.temp2Path ); + if( t1Path.compare( t2Path ) == 0 ) + cx.diskContext->temp2Queue = cx.diskContext->temp1Queue; + else + cx.diskContext->temp2Queue = new DiskQueue( cx.cfg.temp2Path ); + } + + cx.phase2 = new CudaK32Phase2{}; + cx.phase3 = new CudaK32Phase3{}; // #TODO: Support non-warm starting Log::Line( "Allocating buffers (this may take a few seconds)..." ); AllocBuffers( cx ); InitFSEBitMask( cx ); + Log::Line( "Done." 
); + + + // Allocate GR Context if --check was specified + if( cfg.plotCheckCount > 0 ) + { + if( cfg.gCfg->compressionLevel > 0 ) + { + GreenReaperConfig grCfg{}; + grCfg.apiVersion = GR_API_VERSION; + grCfg.threadCount = 1; + grCfg.gpuRequest = GRGpuRequestKind_ExactDevice; + grCfg.gpuDeviceIndex = cfg.deviceIndex; + + auto grResult = grCreateContext( &cx.grCheckContext, &grCfg, sizeof( grCfg ) ); + FatalIf( grResult != GRResult_OK, "Failed to create decompression context for plot check with error '%s' (%d).", + grResultToString( grResult ), (int)grResult ); + + grResult = grPreallocateForCompressionLevel( cx.grCheckContext, BBCU_K, cfg.gCfg->compressionLevel ); + FatalIf( grResult != GRResult_OK, "Failed to preallocate memory for decompression context with error '%s' (%d).", + grResultToString( grResult ), (int)grResult ); + } + + PlotCheckerConfig checkerCfg{}; + checkerCfg.proofCount = cfg.plotCheckCount; + checkerCfg.noGpu = false; + checkerCfg.gpuIndex = cfg.deviceIndex; + checkerCfg.threadCount = 1; + checkerCfg.disableCpuAffinity = false; + checkerCfg.silent = false; + checkerCfg.hasSeed = false; + checkerCfg.deletePlots = true; + checkerCfg.deleteThreshold = cfg.plotCheckThreshhold; + checkerCfg.grContext = cx.grCheckContext; + + cx.plotChecker = PlotChecker::Create( checkerCfg ); + } } //----------------------------------------------------------- @@ -210,6 +370,8 @@ void CudaK32Plotter::Run( const PlotRequest& req ) cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO ); if( cx.gCfg->benchmarkMode ) cx.plotWriter->EnableDummyMode(); + if( cx.plotChecker ) + cx.plotWriter->EnablePlotChecking( *cx.plotChecker ); FatalIf( !cx.plotWriter->BeginPlot( cfg.gCfg->compressionLevel > 0 ? PlotVersion::v2_0 : PlotVersion::v1_0, req.outDir, req.plotFileName, req.plotId, req.memo, req.memoSize, cfg.gCfg->compressionLevel ), @@ -220,19 +382,43 @@ void CudaK32Plotter::Run( const PlotRequest& req ) cx.plotWriter->EndPlot( true ); - // #TODO: Ensure the last plot ended here for now + // Ensure the last plot has ended + // #TODO: Move it elsewhere, using different buffers for parks + // so that we can continue writing to disk until we get to + // actually writing the next plot in table 7 finalization. 
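+    // Note: as written, we block right here. WaitForPlotToComplete() waits for the
+    // plot we just finished to flush all of its table data, and DumpTables() is only
+    // run afterwards when the plot checker did not delete the plot.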
{ const auto pltoCompleteTimer = TimerBegin(); cx.plotWriter->WaitForPlotToComplete(); const double plotIOTime = TimerEnd( pltoCompleteTimer ); Log::Line( "Completed writing plot in %.2lf seconds", plotIOTime ); - cx.plotWriter->DumpTables(); + if( !cx.plotChecker || !cx.plotChecker->LastPlotDeleted() ) + { + cx.plotWriter->DumpTables(); + Log::NewLine(); + } } - Log::Line( "" ); - + delete cx.plotWriter; cx.plotWriter = nullptr; + + + // Delete any temporary files + #if !(DBG_BBCU_KEEP_TEMP_FILES) + if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode ) + { + if( cx.diskContext->yBuffer ) delete cx.diskContext->yBuffer; + if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer; + if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL; + if( cx.diskContext->unsortedR ) delete cx.diskContext->unsortedR; + + for( TableId t = TableId::Table1; t <= TableId::Table7; t++ ) + { + if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t]; + if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t]; + } + } + #endif } //----------------------------------------------------------- @@ -243,26 +429,51 @@ void MakePlot( CudaK32PlotContext& cx ) memset( cx.tableEntryCounts, 0, sizeof( cx.tableEntryCounts ) ); cx.table = TableId::Table1; + const auto plotTimer = TimerBegin(); const auto p1Timer = plotTimer; #if BBCU_DBG_SKIP_PHASE_1 DbgLoadContextAndPairs( cx ); #else - // F1 - Log::Line( "Generating F1" ); - const auto timer = TimerBegin(); - GenF1Cuda( cx ); - const auto elapsed = TimerEnd( timer ); - Log::Line( "Finished F1 in %.2lf seconds.", elapsed ); - // Time for FP + if( cx.cfg.hybrid128Mode ) + { + cx.sortedXPairsOut.AssignDiskBuffer( nullptr ); + cx.sortedPairsLOut.AssignDiskBuffer( nullptr ); + cx.sortedPairsROut.AssignDiskBuffer( nullptr ); + + if( !cx.plotRequest.isFirstPlot ) + { + for( TableId t = TableId::Table1; t <= TableId::Table7; t++ ) + { + if( cx.diskContext->tablesL[(int)t] ) cx.diskContext->tablesL[(int)t]->Swap(); + if( cx.diskContext->tablesR[(int)t] ) cx.diskContext->tablesR[(int)t]->Swap(); + } + + } + } + + /// Generate F1 entries + { + Log::Line( "Generating F1" ); + const auto timer = TimerBegin(); + + GenF1Cuda( cx ); + + const auto elapsed = TimerEnd( timer ); + Log::Line( "Finished F1 in %.2lf seconds.", elapsed ); + } + + /// Forward-propagate the rest of the tables for( TableId table = TableId::Table2; table <= TableId::Table7; table++ ) { cx.table = table; cx.bucket = 0; + FpTable( cx ); } + const auto p1Elapsed = TimerEnd( p1Timer ); Log::Line( "Completed Phase 1 in %.2lf seconds", p1Elapsed ); #endif @@ -294,6 +505,22 @@ void FpTable( CudaK32PlotContext& cx ) cx.prevTablePairOffset = 0; + if( cx.cfg.hybrid128Mode ) + { + auto* diskBufferL = cx.diskContext->tablesL[(int)inTable]; + auto* diskBufferR = cx.diskContext->tablesR[(int)inTable]; + + if( inTable == cx.firstStoredTable ) + { + cx.sortedXPairsOut.AssignDiskBuffer( diskBufferL ); + } + else if( inTable > cx.firstStoredTable ) + { + cx.sortedPairsLOut.AssignDiskBuffer( diskBufferL ); + cx.sortedPairsROut.AssignDiskBuffer( diskBufferR ); + } + } + // Clear slice counts CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); @@ -358,10 +585,28 @@ void FpTable( CudaK32PlotContext& cx ) cx.sortedPairsROut.WaitForCompletion();//cx.sortedPairsROut.WaitForCopyCompletion(); cx.sortedPairsROut.Reset(); - - if( cx.table < TableId::Table7 ) + if( cx.cfg.hybrid128Mode && inTable >= 
cx.firstStoredTable ) { + if( cx.diskContext->tablesL[(int)inTable] ) cx.diskContext->tablesL[(int)inTable]->Swap(); + if( cx.diskContext->tablesR[(int)inTable] ) cx.diskContext->tablesR[(int)inTable]->Swap(); + } + + if( cx.table < TableId::Table7 ) cx.metaOut.WaitForCompletion(); cx.metaOut.Reset(); + + if( cx.cfg.hybrid128Mode ) + { + if( cx.cfg.hybrid16Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 ) + { + cx.diskContext->unsortedL->Swap(); + } + + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->yBuffer->Swap(); + cx.diskContext->metaBuffer->Swap(); + cx.diskContext->unsortedR->Swap(); + } } cx.yIn .Reset(); @@ -373,23 +618,24 @@ void FpTable( CudaK32PlotContext& cx ) Log::Line( "Table %u completed in %.2lf seconds with %llu entries.", (uint32)cx.table+1, elapsed, cx.tableEntryCounts[(int)cx.table] ); + /// DEBUG #if DBG_BBCU_P1_WRITE_PAIRS // Write them sorted, so have to wait until table 3 completes if( cx.table > TableId::Table2 ) DbgWritePairs( cx, cx.table - 1 ); #endif - + if( cx.table == TableId::Table7 ) { FinalizeTable7( cx ); - #if DBG_BBCU_P1_WRITE_PAIRS + // DEBUG + #if DBG_BBCU_P1_WRITE_PAIRS DbgWritePairs( cx, TableId::Table7 ); - #endif - + #endif #if DBG_BBCU_P1_WRITE_CONTEXT DbgWriteContext( cx ); - #endif + #endif } } @@ -410,8 +656,8 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket ) cudaStream_t metaStream = cx.computeStream;//B; cudaStream_t pairsStream = cx.computeStream;//C; - uint32* sortKeyIn = (uint32*)cx.devMatches; - uint32* sortKeyOut = cx.devSortKey; + uint32* sortKeyIn = (uint32*)cx.devMatches; + uint32* sortKeyOut = cx.devSortKey; if( cx.table > TableId::Table2 ) { // Generate a sorting key @@ -447,7 +693,7 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket ) // Sort and download prev table's pairs const bool isLTableInlineable = cx.table == TableId::Table2 || (uint32)cx.table <= cx.gCfg->numDroppedTables+1; - + if( !isLTableInlineable ) { CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventC ) ); // Ensure sort key is ready @@ -463,35 +709,36 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket ) CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsIn, sortedPairs, pairsStream ); cx.xPairsIn.ReleaseDeviceBuffer( pairsStream ); - Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)cx.table-1].left) + cx.prevTablePairOffset; + Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)inTable].left) + cx.prevTablePairOffset; // Write sorted pairs back to host cx.sortedXPairsOut.DownloadT( hostPairs, entryCount, pairsStream, cx.downloadDirect ); } else { - uint32* hostPairsL, *hostPairsLFinal; - uint16* hostPairsR, *hostPairsRFinal; + // uint32* hostPairsL; + // uint16* hostPairsR; // Wait for pairs to complete loading and sort on Y (or do this before match? Giving us time to write to disk while matching?) 
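+            // Both pair halves are sorted with the key produced by the Y sort, then
+            // streamed back to the host back-pointer tables (or to the table's assigned
+            // DiskBuffer when hybrid disk mode is in use).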
uint32* pairsLIn = (uint32*)cx.pairsLIn .GetUploadedDeviceBuffer( pairsStream ); uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream ); CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream ); cx.pairsLIn.ReleaseDeviceBuffer( pairsStream ); - hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset; - hostPairsLFinal = cx.hostBackPointers[(int)cx.table-1].left + cx.prevTablePairOffset; + // hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset; + uint32* hostPairsLFinal = cx.hostBackPointers[(int)inTable].left + cx.prevTablePairOffset; cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, cx.downloadDirect ); // cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream ); - + // if( !isOutputCompressed ) { uint16* pairsRIn = (uint16*)cx.pairsRIn .GetUploadedDeviceBuffer( pairsStream ); uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream ); CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream ); cx.pairsRIn.ReleaseDeviceBuffer( pairsStream ); - hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset; - hostPairsRFinal = cx.hostBackPointers[(int)cx.table-1].right + cx.prevTablePairOffset; + // hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset; + + uint16* hostPairsRFinal = cx.hostBackPointers[(int)inTable].right + cx.prevTablePairOffset; cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, cx.downloadDirect ); // cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream ); @@ -557,7 +804,7 @@ void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket ) void FinalizeTable7( CudaK32PlotContext& cx ) { Log::Line( "Finalizing Table 7" ); - + const auto timer = TimerBegin(); cx.table = TableId::Table7+1; // Set a false table @@ -578,19 +825,41 @@ void FinalizeTable7( CudaK32PlotContext& cx ) const size_t c1TableSizeBytes = c1TotalEntries * sizeof( uint32 ); const size_t c2TableSizeBytes = c2TotalEntries * sizeof( uint32 ); + if( cx.cfg.hybrid128Mode ) + { + cx.sortedPairsLOut.AssignDiskBuffer( cx.diskContext->tablesL[(int)TableId::Table7] ); + cx.sortedPairsROut.AssignDiskBuffer( cx.diskContext->tablesR[(int)TableId::Table7] ); + } + + + // Re-use meta GPU downloader to download parks + GpuDownloadBuffer& parkDownloader = cx.metaOut; + + // Store disk buffer temporarily, if there is one, since we don't want to write to meta now + DiskBufferBase* metaDiskBuffer = parkDownloader.GetDiskBuffer(); + + // Reset park buffer chain, if we're using it + if( cx.parkContext ) + { + cx.parkContext->parkBufferChain->Reset(); + parkDownloader.AssignDiskBuffer( nullptr ); // We want direct downloads to the park buffers, which are pinned already + } // Prepare host allocations constexpr size_t c3ParkSize = CalculateC3Size(); const uint64 totalParkSize = CDivT( tableLength, (uint64)kCheckpoint1Interval ) * c3ParkSize; - StackAllocator hostAlloc( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 ); + StackAllocator hostAlloc = cx.parkContext + ? 
StackAllocator( cx.parkContext->table7Memory.Ptr(), cx.parkContext->table7Memory.Length() ) + : StackAllocator( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 ); + uint32* hostC1Buffer = hostAlloc.CAlloc( c1TotalEntries ); uint32* hostC2Buffer = hostAlloc.CAlloc( c2TotalEntries ); uint32* hostLastParkEntries = hostAlloc.CAlloc( kCheckpoint1Interval ); byte* hostLastParkBuffer = (byte*)hostAlloc.CAlloc( kCheckpoint1Interval ); - byte* hostCompressedParks = hostAlloc.AllocT( totalParkSize ); - + byte* hostCompressedParks = cx.parkContext ? nullptr : hostAlloc.AllocT( totalParkSize ); + byte* hostParkWriter = hostCompressedParks; uint32* hostC1Writer = hostC1Buffer; @@ -606,8 +875,6 @@ void FinalizeTable7( CudaK32PlotContext& cx ) const size_t parkBufferSize = kCheckpoint1Interval * sizeof( uint32 ); - GpuDownloadBuffer& parkDownloader = cx.metaOut; - cudaStream_t mainStream = cx.computeStream; cudaStream_t metaStream = cx.computeStream;//B; cudaStream_t pairsStream = cx.computeStream;//C; @@ -616,7 +883,7 @@ void FinalizeTable7( CudaK32PlotContext& cx ) // Load CTable FSE_CTable* devCTable = devAlloc.AllocT( sizeof( CTable_C3 ), sizeof( uint64 ) ); CudaErrCheck( cudaMemcpyAsync( devCTable, CTable_C3, sizeof( CTable_C3 ), cudaMemcpyHostToDevice, cx.computeStream ) ); - + CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); // Prepare plot tables cx.plotWriter->ReserveTableSize( PlotTable::C1, c1TableSizeBytes ); @@ -627,7 +894,6 @@ void FinalizeTable7( CudaK32PlotContext& cx ) uint32 retainedC3EntryCount = 0; uint32* devYSorted = cx.devYWork + kCheckpoint1Interval; - uint32* sortKeyIn = (uint32*)cx.devMatches; uint32* sortKeyOut = cx.devSortKey; @@ -732,13 +998,42 @@ void FinalizeTable7( CudaK32PlotContext& cx ) // Download compressed parks to host const size_t parkDownloadSize = c3ParkSize * parkCount; + + if( cx.parkContext ) + { + ASSERT( parkDownloadSize <= cx.parkContext->parkBufferChain->BufferSize() ); + + // Override the park buffer to be used when using a park context + hostParkWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket ); + + // Wait for the next park buffer to be available to be used for download + parkDownloader.HostCallback([&cx]{ + (void)cx.parkContext->parkBufferChain->GetNextBuffer(); + }); + } + + const bool directOverride = cx.parkContext != nullptr; + parkDownloader.DownloadWithCallback( hostParkWriter, parkDownloadSize, []( void* parksBuffer, size_t size, void* userData ) { auto& cx = *reinterpret_cast( userData ); + cx.plotWriter->WriteTableData( parksBuffer, size ); - }, &cx, mainStream ); + + // Release the buffer after the plot writer is done with it. 
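+                // The park buffer chain behaves roughly like a small ring of pinned
+                // staging buffers:
+                //   PeekBuffer( bucket )  - buffer this download will land in
+                //   GetNextBuffer()       - blocks until that buffer is free again
+                //   ReleaseNextBuffer()   - returns it once WriteTableData() has consumed it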
+ if( cx.parkContext ) + { + cx.plotWriter->CallBack([&cx](){ + cx.parkContext->parkBufferChain->ReleaseNextBuffer(); + }); + } + + }, &cx, mainStream, directOverride ); hostParkWriter += parkDownloadSize; + + if( cx.parkContext ) + hostParkWriter = nullptr; } // Download c1 entries @@ -788,8 +1083,6 @@ void FinalizeTable7( CudaK32PlotContext& cx ) // Cleanup - // cx.sortedPairsLOut.WaitForCopyCompletion(); - // cx.sortedPairsROut.WaitForCopyCompletion(); cx.sortedPairsLOut.WaitForCompletion(); cx.sortedPairsROut.WaitForCompletion(); cx.sortedPairsLOut.Reset(); @@ -797,6 +1090,18 @@ void FinalizeTable7( CudaK32PlotContext& cx ) cx.prevTablePairOffset = 0; + // Restore disk buffer on repurposed meta download stream + parkDownloader.AssignDiskBuffer( metaDiskBuffer ); + + if( cx.cfg.hybrid128Mode ) + { + cx.diskContext->tablesL[(int)TableId::Table7]->Swap(); + cx.diskContext->tablesR[(int)TableId::Table7]->Swap(); + + if( cx.cfg.hybrid16Mode ) + cx.diskContext->yBuffer->Swap(); + } + auto elapsed = TimerEnd( timer ); Log::Line( "Finalized Table 7 in %.2lf seconds.", elapsed ); } @@ -834,7 +1139,7 @@ __global__ void CudaCompressTable( const uint32* entryCount, const uint32* inLEn const uint32 x0 = inLEntries[pair.left ]; const uint32 x1 = inLEntries[pair.right]; - // Convert to linepoint + // Convert to linepoint if constexpr ( UseLP ) outREntries[gid] = (uint32)CudaSquareToLinePoint64( x1 >> bitShift, x0 >> bitShift ); else @@ -850,7 +1155,7 @@ void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t str const uint32 kthreads = 256; const uint32 kblocks = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)kthreads ); - + if( isCompressedInput ) { const bool isFinalTable = cx.table == TableId::Table1 + (TableId)cx.gCfg->numDroppedTables; @@ -870,7 +1175,7 @@ void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t str //----------------------------------------------------------- void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx ) { - const bool writeVertical = CudaK32PlotIsOutputInterleaved( cx ); + const bool writeVertical = CudaK32PlotIsOutputVertical( cx ); const size_t metaMultiplier = GetTableMetaMultiplier( cx.table ); const bool downloadCompressed = cx.table > TableId::Table1 && (uint32)cx.table <= cx.gCfg->numDroppedTables; @@ -879,8 +1184,8 @@ void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx ) uint32* hostY = cx.hostY; uint32* hostMeta = cx.hostMeta; - uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left; - uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right; + uint32* hostPairsL = cx.hostTableL; + uint16* hostPairsR = cx.hostTableR; Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left; const size_t startOffset = cx.bucket * ( writeVertical ? BBCU_MAX_SLICE_ENTRY_COUNT : BBCU_BUCKET_ALLOC_ENTRY_COUNT ); // vertical: offset to starting col. horizontal: to starting row @@ -896,7 +1201,7 @@ void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx ) { const size_t metaSizeMultiplier = metaMultiplier == 3 ? 
4 : metaMultiplier; const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier; - + const size_t metaSrcStride = srcStride * metaSize; const size_t metaDstStride = dstStride * sizeof( K32Meta4 ); const size_t metaWidth = width * metaSize; @@ -927,20 +1232,23 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket ) const TableId rTable = cx.table; const TableId inTable = rTable - 1; - uint32 metaMultiplier = GetTableMetaMultiplier( inTable ); + const uint32 metaMultiplier = GetTableMetaMultiplier( inTable ); const uint32 inIdx = CudaK32PlotGetInputIndex( cx ); - const bool readVertical = CudaK32PlotIsOutputInterleaved( cx ); + const bool readVertical = CudaK32PlotIsOutputVertical( cx ); const uint32* hostY = cx.hostY; const uint32* hostMeta = cx.hostMeta; - const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left; - const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right; + const uint32* hostPairsL = cx.hostTableL; + const uint16* hostPairsR = cx.hostTableR; const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables; const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2; const Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left; // Table 2 will use table 5, and overflow onto 6 + if( cx.cfg.hybrid128Mode ) + t2HostPairs = (Pair*)hostPairsL; + uint32 stride = BBCU_BUCKET_ALLOC_ENTRY_COUNT; // Start as vertical size_t offset = (size_t)bucket * BBCU_MAX_SLICE_ENTRY_COUNT; @@ -974,7 +1282,7 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket ) cx.pairsRIn.UploadArrayT( hostPairsR + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream ); } } - + // Meta if( metaMultiplier > 0 ) { @@ -982,11 +1290,13 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket ) const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier; auto actualMetaStream = inTable == TableId::Table1 ? cx.computeStream : metaStream; + cx.metaIn.UploadArray( hostMeta + offset * 4, BBCU_BUCKET_COUNT, metaSize, stride * sizeof( K32Meta4 ), BBCU_BUCKET_COUNT, counts, actualMetaStream ); } } + /// /// Allocations /// @@ -1002,13 +1312,19 @@ void AllocBuffers( CudaK32PlotContext& cx ) cx.hostTempAllocSize = 0; cx.devAllocSize = 0; + // If on <= 128G mode or not using direct downloads, + // we need to use a separate buffer for downloading parks, instead of re-using exisintg ones. 
+ // If on <= 64G mode or not using direct downloads, + const bool allocateParkBuffers = cx.downloadDirect || cx.cfg.hybrid128Mode; + size_t parksPinnedSize = 0; + // Gather the size needed first { CudaK32AllocContext acx = {}; acx.alignment = alignment; acx.dryRun = true; - + DummyAllocator pinnedAllocator; DummyAllocator hostTableAllocator; DummyAllocator hostTempAllocator; @@ -1020,7 +1336,6 @@ void AllocBuffers( CudaK32PlotContext& cx ) acx.devAllocator = &devAllocator; AllocateP1Buffers( cx, acx ); - cx.pinnedAllocSize = pinnedAllocator .Size(); cx.hostTableAllocSize = hostTableAllocator.Size(); cx.hostTempAllocSize = hostTempAllocator .Size(); @@ -1033,7 +1348,6 @@ void AllocBuffers( CudaK32PlotContext& cx ) devAllocator = {}; CudaK32PlotPhase2AllocateBuffers( cx, acx ); - cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() ); cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() ); cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() ); @@ -1046,15 +1360,23 @@ void AllocBuffers( CudaK32PlotContext& cx ) devAllocator = {}; CudaK32PlotPhase3AllocateBuffers( cx, acx ); - cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() ); cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() ); cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() ); cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() ); + + // May need to allocate extra pinned buffers for park buffers + if( allocateParkBuffers ) + { + pinnedAllocator = {}; + AllocateParkSerializationBuffers( cx, *acx.pinnedAllocator, acx.dryRun ); + parksPinnedSize = pinnedAllocator.Size(); + } } - size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize; - size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize; + + const size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize; + const size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize; Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize, (double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB ); @@ -1070,43 +1392,46 @@ void AllocBuffers( CudaK32PlotContext& cx ) Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize, (double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB ); - Log::Line( "Allocating buffers" ); // Now actually allocate the buffers + Log::Line( "Allocating buffers..." ); CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) ); #if _DEBUG cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize ); #else - #if !_WIN32 - // if( cx.downloadDirect ) + + bool allocateHostTablesPinned = cx.downloadDirect; + #if _WIN32 + // On windows we always force the use of intermediate buffers, so we allocate on the host + allocateHostTablesPinned = false; + #endif + + // Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" ); + if( allocateHostTablesPinned ) CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) ); - // else - // { - // // #TODO: On windows, first check if we have enough shared memory (512G)? - // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers. 
- #else + else cx.hostBufferTables = bbvirtallocboundednuma( cx.hostTableAllocSize ); - #endif - // } #endif - //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) ); - cx.hostBufferTemp = nullptr; -#if _DEBUG - cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize ); -#endif - if( cx.hostBufferTemp == nullptr ) + #if _DEBUG || _WIN32 + if( cx.hostTempAllocSize ) + cx.hostBufferTemp = bbvirtallocboundednuma( cx.hostTempAllocSize ); + #endif + + if( cx.hostBufferTemp == nullptr && cx.hostTempAllocSize ) CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) ); CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) ); // Warm start - if( true ) + if( true )// cx.gCfg->warmStart ) { - FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer, cx.pinnedAllocSize ); + FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer , cx.pinnedAllocSize ); FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize ); - FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize ); + + if( cx.hostTempAllocSize ) + FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize ); } { @@ -1114,7 +1439,7 @@ void AllocBuffers( CudaK32PlotContext& cx ) acx.alignment = alignment; acx.dryRun = false; - + StackAllocator pinnedAllocator ( cx.pinnedBuffer , cx.pinnedAllocSize ); StackAllocator hostTableAllocator( cx.hostBufferTables, cx.hostTableAllocSize ); StackAllocator hostTempAllocator ( cx.hostBufferTemp , cx.hostTempAllocSize ); @@ -1137,106 +1462,254 @@ void AllocBuffers( CudaK32PlotContext& cx ) hostTempAllocator .PopToMarker( 0 ); devAllocator .PopToMarker( 0 ); CudaK32PlotPhase3AllocateBuffers( cx, acx ); + + if( allocateParkBuffers ) + { + // Fine to leak. App-lifetime buffer + void* parksBuffer = nullptr; + CudaErrCheck( cudaMallocHost( &parksBuffer, parksPinnedSize, cudaHostAllocDefault ) ); + StackAllocator parkAllocator( parksBuffer, parksPinnedSize ); + AllocateParkSerializationBuffers( cx, parkAllocator, acx.dryRun ); + } } } //----------------------------------------------------------- void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) { - const size_t alignment = acx.alignment; + const size_t alignment = acx.alignment; + const bool isCompressed = cx.gCfg->compressionLevel > 0; + const TableId firstTable = cx.firstStoredTable; - const bool isCompressed = cx.gCfg->compressionLevel > 0; + const FileFlags tmp1FileFlags = cx.cfg.temp1DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile; + const FileFlags tmp2FileFlags = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile; - // #TODO: Re-optimize usage here again for windows running 256G /// Host allocations { // Temp allocations are pinned host buffers that can be re-used for other means in different phases. // This is roughly equivalent to temp2 dir during disk plotting. - cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment ); - const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize(); - cx.hostMarkingTables[0] = nullptr; - cx.hostMarkingTables[1] = isCompressed ? 
nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); - cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); - cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); - cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); - cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); + if( !cx.cfg.hybrid16Mode ) + { + cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); + cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment ); + } + else if( !cx.diskContext->metaBuffer ) + { + const size_t ySliceSize = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT; + const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT; - - // NOTE: The first table has their values inlines into the backpointers of the next table - cx.hostBackPointers[0] = {}; + cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::Y_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); + FatalIf( !cx.diskContext->yBuffer, "Failed to create y disk buffer." ); - const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables; - - Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); - cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr }; + cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::META_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags ); + FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." ); + } - for( TableId table = firstTable + 1; table <= TableId::Table7; table++ ) - cx.hostBackPointers[(int)table] = { acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ), acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ) }; + // Marking tables used to prune back pointers + { + const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize(); + + cx.hostMarkingTables[0] = nullptr; + cx.hostMarkingTables[1] = isCompressed ? 
nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); + cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); + cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); + cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); + cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment ); + } + + if( !cx.cfg.hybrid128Mode ) + { + // NOTE: The first table has their values inlined into the backpointers of the next table + cx.hostBackPointers[0] = {}; - cx.hostTableL = cx.hostBackPointers[6].left; // Also used for Table 7 - cx.hostTableR = cx.hostBackPointers[6].right; - cx.hostTableSortedL = cx.hostBackPointers[5].left; - cx.hostTableSortedR = cx.hostBackPointers[5].right; + Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); + + cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr }; + + for( TableId table = firstTable + 1; table <= TableId::Table7; table++ ) + { + cx.hostBackPointers[(int)table] = { + acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ), + acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ) + }; + } + + // These buffers, belonging to table 7, are re-used + // to store the unsorted back-pointers. + // For this to work, the reading ot table 7 must be horizontal (see CudaK32PlotIsOutputVertical()). + // This way, when we store the sorted pairs, we don't + // overwrite the unsorted data from other buckets. + cx.hostTableL = cx.hostBackPointers[6].left; + cx.hostTableR = cx.hostBackPointers[6].right; + } + else + { + char tableName[] = "table_l_000.tmp"; + + size_t multiplier = 2; // First table entries are Pair, not uint32s... + + #if BBCU_DBG_SKIP_PHASE_1 + const FileMode fileMode = FileMode::Open; + #else + const FileMode fileMode = FileMode::Create; + #endif + + for( TableId table = firstTable; table <= TableId::Table7; table++ ) + { + if( cx.diskContext->tablesL[(int)table] == nullptr ) + { + sprintf( tableName, "table_l_%d.tmp", (int32)table+1 ); + cx.diskContext->tablesL[(int)table] = DiskBuffer::Create( + *cx.diskContext->temp1Queue, tableName, BBCU_BUCKET_COUNT, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT * multiplier, + fileMode, FileAccess::ReadWrite, tmp1FileFlags ); + + FatalIf( !cx.diskContext->tablesL[(int)table], "Failed to create table %d L disk buffer.", (int)table+1 ); + } + + if( table > firstTable && cx.diskContext->tablesR[(int)table] == nullptr ) + { + sprintf( tableName, "table_r_%d.tmp", (int32)table+1 ); + cx.diskContext->tablesR[(int)table] = DiskBuffer::Create( + *cx.diskContext->temp1Queue, tableName, BBCU_BUCKET_COUNT, sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, + fileMode, FileAccess::ReadWrite, tmp1FileFlags ); + + FatalIf( !cx.diskContext->tablesR[(int)table], "Failed to create table %d R disk buffer.", (int)table+1 ); + } + + multiplier = 1; + } + + // When storing unsorted inlined x's, we don't have enough space in RAM, store i disk instead. + const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair ); + cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::LPAIRS_DISK_BUFFER_FILE_NAME.data(), + BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags ); + FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." 
); + + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp", + BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags ); + FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted R disk buffer." ); + } + else + { + // In 128G mode we can store intermediate pairs in the host + cx.hostTableL = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); + cx.hostTableR = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ); + } + } } /// Device & Pinned allocations { - // #NOTE: The R pair is allocated as uint32 because for table 2 we want to download them as inlined x's, so we need 2 uint32 buffers - /// Device/Pinned allocations - // cx.yOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - // cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - cx.yOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer ( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - cx.metaOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - - // These download buffers share the same backing buffers + GpuStreamDescriptor yDesc{}; + yDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT; + yDesc.sliceCount = BBCU_BUCKET_COUNT; + yDesc.sliceAlignment = alignment; + yDesc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT; + yDesc.deviceAllocator = acx.devAllocator; + yDesc.pinnedAllocator = nullptr; // Start in direct mode (no intermediate pinined buffers) + + // In disk-backed mode, we always have pinned buffers, + // which are the same buffers used to write and read from disk. 
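+
+        // The remaining descriptors start as copies of yDesc and are specialized below:
+        // pinned staging buffers and disk-block-size slice alignment are only attached for
+        // the streams that spill to temp1/temp2 in the hybrid modes, or for every stream
+        // when direct downloads are disabled.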
+ GpuStreamDescriptor descTablePairs = yDesc; + GpuStreamDescriptor descTableSortedPairs = yDesc; + GpuStreamDescriptor descXPairs = yDesc; + GpuStreamDescriptor descMeta = yDesc; + + if( cx.cfg.hybrid128Mode ) { + // Temp 1 Queue + descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator; + descTableSortedPairs.sliceAlignment = cx.diskContext->temp1Queue->BlockSize(); + + // Temp 2 Queue + descXPairs.pinnedAllocator = acx.pinnedAllocator; + descXPairs.sliceAlignment = cx.diskContext->temp2Queue->BlockSize(); + + if( cx.cfg.hybrid16Mode ) + { + yDesc.pinnedAllocator = acx.pinnedAllocator; + yDesc.sliceAlignment = cx.diskContext->temp2Queue->BlockSize(); + + descMeta.pinnedAllocator = acx.pinnedAllocator; + descMeta.sliceAlignment = cx.diskContext->temp2Queue->BlockSize(); + + descTablePairs.pinnedAllocator = acx.pinnedAllocator; + descTablePairs.sliceAlignment = cx.diskContext->temp2Queue->BlockSize(); + } + } + + if( !cx.downloadDirect ) + { + // Use intermediate pinned buffer for transfers to non-pinned destinations + yDesc.pinnedAllocator = acx.pinnedAllocator; + descTablePairs.pinnedAllocator = acx.pinnedAllocator; + descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator; + descXPairs.pinnedAllocator = acx.pinnedAllocator; + descMeta.pinnedAllocator = acx.pinnedAllocator; + } + + + /// + /// Downloads + /// + cx.yOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( yDesc, acx.dryRun ); + cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descMeta, acx.dryRun ); + + { + // These download buffers share the same backing buffers const size_t devMarker = acx.devAllocator->Size(); const size_t pinnedMarker = acx.pinnedAllocator->Size(); - cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); - cx.pairsROut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun ); + cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun ); + cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTablePairs, acx.dryRun ); acx.devAllocator->PopToMarker( devMarker ); acx.pinnedAllocator->PopToMarker( pinnedMarker ); // Allocate Pair at the end, to ensure we grab the highest value - cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun ); } - // These download buffers share the same backing buffers { + // These download buffers share the same backing buffers const size_t devMarker = acx.devAllocator->Size(); const size_t pinnedMarker = acx.pinnedAllocator->Size(); - cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun ); + cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descTableSortedPairs, acx.dryRun ); acx.devAllocator->PopToMarker( devMarker ); acx.pinnedAllocator->PopToMarker( pinnedMarker ); // Allocate Pair at the end, to ensure we grab the highest value - cx.sortedXPairsOut = 
cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( descXPairs, acx.dryRun ); } - cx.yIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + /// + /// Uploads + /// + cx.yIn = cx.gpuUploadStream[0]->CreateUploadBufferT( yDesc, acx.dryRun ); + cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descMeta, acx.dryRun ); // These uploaded buffers share the same backing buffers { const size_t devMarker = acx.devAllocator->Size(); const size_t pinnedMarker = acx.pinnedAllocator->Size(); - cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); - cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun ); + cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descTablePairs, acx.dryRun ); acx.devAllocator->PopToMarker( devMarker ); acx.pinnedAllocator->PopToMarker( pinnedMarker ); // Allocate Pair at the end, to ensure we grab the highest value - cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun ); + cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT( descXPairs, acx.dryRun ); } /// Device-only allocations @@ -1268,9 +1741,56 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) cx.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, alignment ); cx.hostBucketSlices = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, alignment ); } + + /// In disk-backed mode, assign disk buffers to gpu buffers + if( cx.cfg.hybrid128Mode && !acx.dryRun ) + { + cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL ); + cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL ); + + if( cx.cfg.hybrid16Mode ) + { + cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL ); + cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL ); + + cx.pairsROut.AssignDiskBuffer( cx.diskContext->unsortedR ); + cx.pairsRIn .AssignDiskBuffer( cx.diskContext->unsortedR ); + + cx.yOut.AssignDiskBuffer( cx.diskContext->yBuffer ); + cx.yIn .AssignDiskBuffer( cx.diskContext->yBuffer ); + + cx.metaOut.AssignDiskBuffer( cx.diskContext->metaBuffer ); + cx.metaIn .AssignDiskBuffer( cx.diskContext->metaBuffer ); + } + } +} + +//----------------------------------------------------------- +void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinnedAllocator, bool dryRun ) +{ + ASSERT( cx.parkContext ); + + auto& pc = *cx.parkContext; + pc.maxParkBuffers = 3; + + // Get the largest park size + const size_t maxParkSize = cx.cfg.gCfg->compressionLevel == 0 ? 
+ CalculateParkSize( TableId::Table1 ) : + GetLargestCompressedParkSize(); + + const size_t parksPerBuffer = CDivT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2; + // CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kCheckpoint1Interval ) + 1; // Need an extra park for left-over entries + const size_t bucketParkBufferSize = parksPerBuffer * maxParkSize; + const size_t alignment = 4096; + + // Allocate some extra space for C tables (see FinalizeTable7) + pc.hostRetainedLinePoints = pinnedAllocator.CAlloc( kEntriesPerPark ); + pc.table7Memory = pinnedAllocator.CAllocSpan( 8 MiB, alignment ); + pc.parkBufferChain = BufferChain::Create( pinnedAllocator, pc.maxParkBuffers, bucketParkBufferSize, alignment, dryRun ); } + /// /// Debug /// @@ -1278,6 +1798,9 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx ) void DbgWritePairs( CudaK32PlotContext& cx, const TableId table ) { + if( cx.cfg.hybrid128Mode ) + return; + const TableId earliestTable = TableId::Table1 + (TableId)cx.gCfg->numDroppedTables+1; if( table < earliestTable ) return; @@ -1332,7 +1855,7 @@ void DbgWriteContext( CudaK32PlotContext& cx ) Log::Line( "[DEBUG] Writing context file." ); FileStream contxetFile; sprintf( path, "%scontext.tmp", DBG_BBCU_DBG_DIR ); - FatalIf( !contxetFile.Open( path, FileMode::Create, FileAccess::Write ), "Failed to open context file." ); + FatalIf( !contxetFile.Open( path, FileMode::Create, FileAccess::Write ), "Failed to open context file at '%s'.", path ); FatalIf( contxetFile.Write( &cx, sizeof( CudaK32PlotContext ) ) != (ssize_t)sizeof( CudaK32PlotContext ), "Failed to write context data." ); contxetFile.Close(); @@ -1360,7 +1883,7 @@ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables ) memcpy( cx.bucketSlices, tmpCx.bucketSlices, sizeof( tmpCx.bucketSlices ) ); memcpy( cx.tableEntryCounts, tmpCx.tableEntryCounts, sizeof( tmpCx.tableEntryCounts ) ); } - + if( !loadTables ) return; @@ -1384,8 +1907,11 @@ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables ) } } -void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyToPinnedBuffer ) +void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool useDiskHybridData ) { + if( cx.cfg.hybrid128Mode ) + return; + char lPath[512]; char rPath[512]; @@ -1393,57 +1919,227 @@ void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyTo if( table < earliestTable ) return; - // for( TableId table = TableId::Table2; table <= TableId::Table7; table++ ) + const uint64 entryCount = cx.tableEntryCounts[(int)table]; + Pairs& pairs = cx.hostBackPointers[(int)table]; + { Log::Line( "[DEBUG] Loading table %d", (int)table + 1 ); sprintf( lPath, "%st%d.l.tmp", DBG_BBCU_DBG_DIR, (int)table + 1 ); sprintf( rPath, "%st%d.r.tmp", DBG_BBCU_DBG_DIR, (int)table + 1 ); - const uint64 entryCount = cx.tableEntryCounts[(int)table]; // cx.hostBackPointers[(int)table].left = bbcvirtallocbounded( entryCount ); // cx.hostBackPointers[(int)table].right = bbcvirtallocbounded( entryCount ); - Pairs& pairs = cx.hostBackPointers[(int)table]; int err; - if( table == earliestTable ) + static DiskQueue* diskQueue = nullptr; + + // Load disk-hybrid tables + // #NOTE: Enable (and disable the block below this one), to load tables from + // the disk-hybrid output. Also adjust path in the DiskQueue below. 
+ + // useDiskHybridData = true; + if( useDiskHybridData ) { - FatalIf( !IOJob::ReadFromFile( lPath, pairs.left, entryCount * sizeof( Pair ), err ), "Failed to read table X pairs: %d", err ); + if( diskQueue == nullptr ) + diskQueue = new DiskQueue( "/home/harold/plotdisk" ); + + char lname[64] = {}; + sprintf( lname, "table_l_%d.tmp", (int)table + 1 ); + + if( table == earliestTable ) + { + DiskBuffer* buf = DiskBuffer::Create( *diskQueue, lname, BBCU_BUCKET_COUNT, sizeof( Pair ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, + FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering ); + PanicIf( !buf, "No table file" ); + + VirtualAllocator valloc; + buf->ReserveBuffers( valloc ); + + Span pairsWriter( (Pair*)pairs.left, BBCU_TABLE_ALLOC_ENTRY_COUNT ); + buf->ReadNextBucket(); + + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) + { + const size_t bucketLength = cx.bucketCounts[(int)table][bucket]; + + buf->TryReadNextBucket(); + auto entries = buf->GetNextReadBufferAs().SliceSize( bucketLength ); + + entries.CopyTo( pairsWriter ); + pairsWriter = pairsWriter.Slice( entries.Length() ); + } + + delete buf; + } + else + { + char rname[64] = {}; + sprintf( rname, "table_r_%d.tmp", (int)table + 1 ); + + DiskBuffer* lBuf = DiskBuffer::Create( *diskQueue, lname, BBCU_BUCKET_COUNT, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, + FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering ); + DiskBuffer* rBuf = DiskBuffer::Create( *diskQueue, rname, BBCU_BUCKET_COUNT, sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, + FileMode::Open, FileAccess::Read, FileFlags::LargeFile | FileFlags::NoBuffering ); + PanicIf( !lBuf, "No table L file" ); + PanicIf( !rBuf, "No table R file" ); + + VirtualAllocator valloc; + lBuf->ReserveBuffers( valloc ); + rBuf->ReserveBuffers( valloc ); + + Span lWriter( pairs.left , BBCU_TABLE_ALLOC_ENTRY_COUNT ); + Span rWriter( pairs.right, BBCU_TABLE_ALLOC_ENTRY_COUNT ); + + lBuf->ReadNextBucket(); + rBuf->ReadNextBucket(); + + for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) + { + const size_t bucketLength = cx.bucketCounts[(int)table][bucket]; + + lBuf->TryReadNextBucket(); + rBuf->TryReadNextBucket(); + + auto lEntries = lBuf->GetNextReadBufferAs().SliceSize( bucketLength ); + lEntries.CopyTo( lWriter ); + + auto rEntries = rBuf->GetNextReadBufferAs().SliceSize( bucketLength ); + rEntries.CopyTo( rWriter ); + + lWriter = lWriter.Slice( lEntries.Length() ); + rWriter = rWriter.Slice( rEntries.Length() ); + } + + delete lBuf; + delete rBuf; + } } else { - FatalIf( !IOJob::ReadFromFile( lPath, pairs.left , entryCount * sizeof( uint32 ), err ), "Failed to read table L pairs: %d", err ); - - // if( (uint32)table > cx.gCfg->numDroppedTables ) - FatalIf( !IOJob::ReadFromFile( rPath, pairs.right, entryCount * sizeof( uint16 ), err ), "Failed to read table R pairs: %d", err ); - } - - // We expect table 7 to also be found in these buffers, so copy it - // if( table == TableId::Table7 ) - if( copyToPinnedBuffer ) - { - bbmemcpy_t( cx.hostTableSortedL, pairs.left , entryCount ); - bbmemcpy_t( cx.hostTableSortedR, pairs.right, entryCount ); + if( table == earliestTable ) + { + FatalIf( !IOJob::ReadFromFile( lPath, pairs.left, entryCount * sizeof( Pair ), err ), "Failed to read table X pairs: %d", err ); + } + else + { + FatalIf( !IOJob::ReadFromFile( lPath, pairs.left , entryCount * sizeof( uint32 ), err ), "Failed to read table L pairs: %d", err ); + + // if( (uint32)table > cx.gCfg->numDroppedTables ) + FatalIf( 
!IOJob::ReadFromFile( rPath, pairs.right, entryCount * sizeof( uint16 ), err ), "Failed to read table R pairs: %d", err ); + } } } + + // if( table == earliestTable && !useDiskHybridData ) + // { + // uint64* tmpBucket = bbcvirtallocboundednuma( BBCU_BUCKET_ALLOC_ENTRY_COUNT ); + + // std::vector hashesRam{}; + // std::vector hashesDisk{}; + + // byte hash[32]; + // char hashstr[sizeof(hash)*2+1] = {}; + + // for( uint32 run = 0; run < 2; run++ ) + // { + // auto& hashes = run == 0 ? hashesRam : hashesDisk; + + // uint64* xs = (uint64*)pairs.left; + + // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ ) + // { + // const uint64 bucketEntryCount = cx.bucketCounts[(int)table][b]; + + // RadixSort256::Sort( DbgGetThreadPool( cx ), xs, tmpBucket, bucketEntryCount ); + + // // Hash + // { + // blake3_hasher hasher; + // blake3_hasher_init( &hasher ); + // blake3_hasher_update( &hasher, xs, bucketEntryCount * sizeof( uint64 ) ); + // blake3_hasher_finalize( &hasher, hash, sizeof( hash ) ); + + // size_t _; + // BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ ); + // Log::Line( "[%3u] : 0x%s", b, hashstr ); + + // hashes.push_back( hashstr ); + + // // DbgPrintHash( " :", xs, sizeof( uint64 ) * bucketEntryCount ); + // } + + // xs += bucketEntryCount; + // } + + // if( run == 0 ) + // { + // DbgLoadTablePairs( cx, table, true ); + // } + // } + + // // Compare hashes + // { + // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ ) + // { + // if( hashesRam[b] != hashesDisk[b] ) + // { + // Panic( "Hash mismatch at bucket %u. %s != %s", b, hashesRam[b].c_str(), hashesDisk[b].c_str() ); + // } + // } + // Log::Line( "All hashes match!" ); + // } + + + // // DbgPrintHash( "Inlined X Table", cx.hostBackPointers[(int)table].left, sizeof( Pair ) * cx.tableEntryCounts[(int)table] ); + // Log::Line( "" ); + // bbvirtfreebounded( tmpBucket ); + // Exit( 0 ); + // } + // else + // { + // // DbgPrintHash( "L Table", cx.hostBackPointers[(int)table].left, sizeof( uint32 ) * cx.tableEntryCounts[(int)table] ); + // // DbgPrintHash( "R Table", cx.hostBackPointers[(int)table].right, sizeof( uint16 ) * cx.tableEntryCounts[(int)table] ); + // // Log::Line( "" ); + // } + + // Sort inlined xs + // if( table == earliestTable ) + // { + // uint64* tmpBucket = bbcvirtallocboundednuma( BBCU_BUCKET_ALLOC_ENTRY_COUNT ); + // uint64* xs = (uint64*)pairs.left; + + // for( uint32 b = 0; b < BBCU_BUCKET_COUNT; b++ ) + // { + // const uint64 bucketEntryCount = cx.bucketCounts[(int)table][b]; + // RadixSort256::Sort( DbgGetThreadPool( cx ), xs, tmpBucket, bucketEntryCount ); + // xs += bucketEntryCount; + // } + + // DbgPrintHash( "pre_sorted_xs", pairs.left, sizeof( uint64 ) * entryCount ); + // } + Log::Line( "[DEBUG] Done." 
); } - void DbgLoadMarks( CudaK32PlotContext& cx ) { char path[512]; + std::string baseUrl = DBG_BBCU_DBG_DIR; + if( cx.cfg.hybrid128Mode ) + baseUrl += "disk/"; + // const size_t tableSize = ((1ull << BBCU_K) / 64) * sizeof(uint64); Log::Line( "[DEBUG] Loadinging marking tables" ); - const TableId startTable = TableId::Table2 + cx.gCfg->numDroppedTables; + const TableId startTable = cx.firstStoredTable; for( TableId table = startTable; table < TableId::Table7; table++ ) { - sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 ); + sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 ); int err = 0; cx.hostMarkingTables[(int)table] = (uint64*)IOJob::ReadAllBytesDirect( path, err ); diff --git a/cuda/CudaPlotter.h b/cuda/CudaPlotter.h index ebe30f67..ddcbfed2 100644 --- a/cuda/CudaPlotter.h +++ b/cuda/CudaPlotter.h @@ -9,10 +9,22 @@ struct CudaK32PlotConfig { const GlobalPlotConfig* gCfg = nullptr; - uint32 deviceIndex = 0; // Which CUDA device to use when plotting// - bool disableDirectDownloads = false; // Don't allocate host tables using pinned buffers, instead - // download to intermediate pinned buffers then copy to the final host buffer. - // May be necessarry on Windows because of shared memory limitations (usual 50% of system memory) + uint32 deviceIndex = 0; // Which CUDA device to use when plotting/ + bool disableDirectDownloads = false; // Don't allocate host tables using pinned buffers, instead + // download to intermediate pinned buffers then copy to the final host buffer. + // May be necessarry on Windows because of shared memory limitations (usual 50% of system memory) + + bool hybrid128Mode = false; // Enable hybrid disk-offload w/ 128G of RAM. + bool hybrid16Mode = false; // Enable hybrid disk-offload w/ 64G of RAM. 
+ + const char* temp1Path = nullptr; // For 128G RAM mode + const char* temp2Path = nullptr; // For 64G RAM mode + + bool temp1DirectIO = true; // Use direct I/O for temp1 files + bool temp2DirectIO = true; // Use direct I/O for temp2 files + + uint64 plotCheckCount = 0; // For performing plot check command after plotting + double plotCheckThreshhold = 0.6; // Proof/check threshhold below which plots will be deleted }; class CudaK32Plotter : public IPlotter @@ -28,4 +40,6 @@ class CudaK32Plotter : public IPlotter private: CudaK32PlotConfig _cfg = {}; struct CudaK32PlotContext* _cx = nullptr;; -}; \ No newline at end of file +}; + +void CudaK32PlotterPrintHelp(); diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu new file mode 100644 index 00000000..3d06973c --- /dev/null +++ b/cuda/GpuDownloadStream.cu @@ -0,0 +1,385 @@ +#include "GpuStreams.h" +#include "GpuQueue.h" +#include "plotting/DiskBucketBuffer.h" +#include "plotting/DiskBuffer.h" + + +/// +/// DownloadBuffer +/// +void* GpuDownloadBuffer::GetDeviceBuffer() +{ + const uint32 index = self->outgoingSequence % self->bufferCount; + + CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); + + return self->deviceBuffer[index]; +} + +void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream ) +{ + ASSERT( self->lockSequence >= self->outgoingSequence ); + ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount ); + + const uint32 index = self->lockSequence % self->bufferCount; + self->lockSequence++; + + // Wait for the device buffer to be free to be used by kernels + CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); + return self->deviceBuffer[index]; +} + +void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size ) +{ + Download2D( hostBuffer, size, 1, size, size ); +} + +void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride ) +{ + Download2D( hostBuffer, size, 1, size, size, workStream, directOverride ); +} + +void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream ) +{ + Panic( "Unavailable" ); + // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT ); + // ASSERT( hostBuffer ); + // ASSERT( workStream ); + // ASSERT( self->lockSequence > 0 ); + // ASSERT( self->outgoingSequence < self->lockSequence ); + // ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount ); + + // auto& cpy = self->copies[self->outgoingSequence]; + // cpy.self = self; + // cpy.sequence = self->outgoingSequence; + // cpy.copy.hostBuffer = finalBuffer; + // cpy.copy.srcBuffer = hostBuffer; + // cpy.copy.size = size; + + + // const uint32 index = self->outgoingSequence % self->bufferCount; + // self->outgoingSequence++; + + // void* pinnedBuffer = self->pinnedBuffer[index]; + // const void* devBuffer = self->deviceBuffer[index]; + + // // Signal from the work stream when it has finished doing kernel work with the device buffer + // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) ); + + + // // Ensure the work stream has completed writing data to the device buffer + // cudaStream_t stream = self->queue->_stream; + + // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) ); + + // // Copy + // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) ); + + // // Signal that the device buffer is free to be re-used + // CudaErrCheck( cudaEventRecord( self->events[index], stream ) 
); + + // // Launch copy command + // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ + + // const CopyInfo& c = *reinterpret_cast( userData ); + // IGpuBuffer* self = c.self; + + // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy ); + // cmd.copy.info = &c; + + // self->queue->SubmitCommands(); + + // // Signal the download completed + // self->fence.Signal( ++self->completedSequence ); + // }, &cpy ) ); +} + +void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride ) +{ + Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride ); +} + +void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, cudaStream_t workStream, bool directOverride ) +{ + Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride ); +} + +void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, + GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride ) +{ + PerformDownload2D( hostBuffer, width, height, dstStride, srcStride, + callback, userData, + workStream, directOverride ); +} + +void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, + GpuDownloadCallback postCallback, void* postUserData, + cudaStream_t workStream, bool directOverride ) +{ + PanicIf( !(hostBuffer || self->pinnedBuffer[0] ), "" ); + ASSERT( workStream ); + ASSERT( self->lockSequence > 0 ); + ASSERT( self->outgoingSequence < self->lockSequence ); + ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount ); + + const uint32 index = self->outgoingSequence++ % self->bufferCount; + + void* pinnedBuffer = self->pinnedBuffer[index]; + void* finalHostBuffer = hostBuffer; + const void* devBuffer = self->deviceBuffer[index]; + + const bool isDirect = (directOverride || self->pinnedBuffer[0] == nullptr) && !self->diskBuffer; ASSERT( isDirect || self->pinnedBuffer[0] ); + const bool isSequentialCopy = dstStride == srcStride; + const size_t totalSize = height * width; + + + // Signal from the work stream when it has finished doing kernel work with the device buffer + CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) ); + + // From the download stream, wait for the work stream to finish + cudaStream_t downloadStream = self->queue->_stream; + CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) ); + + + if( self->diskBuffer ) + { + // Wait until the next disk buffer is ready for use. 
+ // This also signals that the pinned buffer is ready for re-use + CallHostFunctionOnStream( downloadStream, [this](){ + self->diskBuffer->GetNextWriteBuffer(); + }); + + pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 ); + } + + if( !isDirect ) + { + // Ensure that the pinned buffer is ready for use + // (we signal pinned buffers are ready when using disks without events) + if( !self->diskBuffer ) + CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->pinnedEvent[index] ) ); + + // Set host buffer as the pinned buffer + hostBuffer = pinnedBuffer; + } + + + // Copy from device to host buffer + // #NOTE: Since the pinned buffer is simply the same size (a full bucket) as the device buffer + // we also always copy as 1D if we're copying to our pinned buffer. + ASSERT( hostBuffer ); + if( isSequentialCopy || hostBuffer == pinnedBuffer ) + CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, downloadStream ) ); + else + CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, downloadStream ) ); + + // Dispatch a host callback if one was set + if( postCallback ) + { + CallHostFunctionOnStream( downloadStream, [=](){ + (*postCallback)( finalHostBuffer, totalSize, postUserData ); + }); + } + + + // Signal that the device buffer is free to be re-used + CudaErrCheck( cudaEventRecord( self->deviceEvents[index], downloadStream ) ); + + if( self->diskBuffer ) + { + // If it's a disk-based copy, then write the pinned buffer to disk + CallHostFunctionOnStream( downloadStream, [=]() { + + auto* diskBucketBuffer = dynamic_cast( self->diskBuffer ); + if( diskBucketBuffer != nullptr ) + diskBucketBuffer->Submit( srcStride ); + else + static_cast( self->diskBuffer )->Submit( totalSize ); + }); + + // #NOTE: We don't need to signal that the pinned buffer is ready for re-use here as + // we do that implicitly with DiskBuffer::GetNextWriteBuffer (see above). + } + else if( !isDirect ) + { + // #TODO: Do this in a different host copy stream, and signal from there. + // #MAYBE: Perhaps use multiple host threads/streams to do host-to-host copies. + // for now do it on the same download stream, but we will be blocking the download stream, + // unless other download streams are used by other buffers. 
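+
+        // In stream order, the device-to-host copy above has already staged the bucket in the
+        // pinned buffer; copy it on to the caller's final (non-pinned) destination and record
+        // pinnedEvent so the staging buffer can be reused by a later download.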
+ + + ASSERT( hostBuffer == pinnedBuffer ); + if( isSequentialCopy ) + CudaErrCheck( cudaMemcpyAsync( finalHostBuffer, hostBuffer, totalSize, cudaMemcpyHostToHost, downloadStream ) ); + else + CudaErrCheck( cudaMemcpy2DAsync( finalHostBuffer, dstStride, hostBuffer, srcStride, width, height, cudaMemcpyHostToHost, downloadStream ) ); + + // Signal the pinned buffer is free to be re-used + CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], downloadStream ) ); + } +} + +void GpuDownloadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function func ) +{ + auto* fnCpy = new std::function( std::move( func ) ); + CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) { + + auto& fn = *reinterpret_cast*>( userData ); + fn(); + delete& fn; + + }, fnCpy ) ); +} + +void GpuDownloadBuffer::HostCallback( std::function func ) +{ + CallHostFunctionOnStream( self->queue->GetStream(), func ); +} + +void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, + uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData ) +{ + ASSERT( width ); + ASSERT( height ); + ASSERT( hostBuffer ); + + const uint32 index = self->outgoingSequence % self->bufferCount; + + // We need to block until the pinned buffer is available. + if( self->outgoingSequence > self->bufferCount-1 ) + self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 ); + + void* pinnedBuffer = self->pinnedBuffer[index]; + const void* devBuffer = self->deviceBuffer[index]; + + //auto& cmd = self->commands[index]; + //cmd.type = GpuQueue::CommandType::Copy2D; + //cmd.sequenceId = self->outgoingSequence++; + //cmd.finishedSignal = &self->fence; + //cmd.dstBuffer = hostBuffer; + //cmd.srcBuffer = pinnedBuffer; + //cmd.copy2d.width = width; + //cmd.copy2d.height = height; + //cmd.copy2d.dstStride = dstStride; + //cmd.copy2d.srcStride = srcStride; + //cmd.copy2d.callback = callback; + //cmd.copy2d.userData = userData; + + outIndex = index; + outPinnedBuffer = pinnedBuffer; + outDevBuffer = devBuffer; +} + + +void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize ) +{ + ASSERT( length ); + ASSERT( elementSize ); + ASSERT( counts ); + + uint32 totalElements = 0; + for( uint32 i = 0; i < length; i++ ) + totalElements += counts[i]; + + const size_t totalSize = (size_t)totalElements * elementSize; + + uint32 index; + void* pinnedBuffer; + const void* devBuffer; + GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer ); + + + srcStride *= elementSize; + + byte* dst = (byte*)pinnedBuffer; + const byte* src = (byte*)devBuffer; + + cudaStream_t stream = self->queue->_stream; + + // Copy all buffers from device to pinned buffer + for( uint32 i = 0; i < length; i++ ) + { + const size_t copySize = counts[i] * (size_t)elementSize; + + // #TODO: Determine if there's a cuda (jagged) array copy + CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) ); + + src += srcStride; + dst += copySize; + } + + // Signal that the device buffer is free + CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); + + // Submit command to do the final copy from pinned to host + CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) ); +} + +void GpuDownloadBuffer::WaitForCompletion() +{ + if( self->outgoingSequence > 0 ) + { + 
//const uint32 index = (self->outgoingSequence - 1) % self->bufferCount; + + // cudaEvent_t event = self->completedEvents[index]; + //const cudaError_t r = cudaEventQuery( event ); + + //if( r == cudaSuccess ) + // return; + + //if( r != cudaErrorNotReady ) + // CudaErrCheck( r ); + + //CudaErrCheck( cudaEventSynchronize( event ) ); + + + cudaStream_t downloadStream = self->queue->_stream; + // this->self->fence.Reset( 0 ); + CallHostFunctionOnStream( downloadStream, [this](){ + this->self->fence.Signal( this->self->outgoingSequence ); + }); + self->fence.Wait( self->outgoingSequence ); + + } +} + +void GpuDownloadBuffer::WaitForCopyCompletion() +{ + if( self->outgoingSequence > 0 ) + { + self->copyFence.Wait( self->outgoingSequence ); + } +} + +void GpuDownloadBuffer::Reset() +{ + self->lockSequence = 0; + self->outgoingSequence = 0; + self->completedSequence = 0; + self->copySequence = 0; + self->fence.Reset( 0 ); + self->copyFence.Reset( 0 ); +} + +GpuQueue* GpuDownloadBuffer::GetQueue() const +{ + return self->queue; +} + +void GpuDownloadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer ) +{ + // ASSERT( self->pinnedBuffer[0] ); + + void* nullBuffers[2] = { nullptr, nullptr }; + if( self->diskBuffer ) + self->diskBuffer->AssignWriteBuffers( nullBuffers ); + + self->diskBuffer = diskBuffer; + if( self->diskBuffer ) + self->diskBuffer->AssignWriteBuffers( self->pinnedBuffer ); +} + +DiskBufferBase* GpuDownloadBuffer::GetDiskBuffer() const +{ + return self->diskBuffer; +} diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu new file mode 100644 index 00000000..399a0fbf --- /dev/null +++ b/cuda/GpuQueue.cu @@ -0,0 +1,432 @@ +#include "GpuQueue.h" +#include "util/IAllocator.h" +#include "plotting/DiskBucketBuffer.h" +#include "plotting/DiskBuffer.h" + +/// +/// Shared GpuStream Inteface +/// +GpuQueue::GpuQueue( Kind kind ) : _kind( kind ) + , _bufferReadySignal( BBCU_BUCKET_COUNT ) +{ + CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) ); + CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) ); + CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) ); + + _queueThread.Run( QueueThreadEntryPoint, this ); +} + +GpuQueue::~GpuQueue() +{ + _exitQueueThread.store( true, std::memory_order_release ); + _bufferReadySignal.Release(); + _waitForExitSignal.Wait(); + + + if( _stream ) cudaStreamDestroy( _stream ); + if( _preloadStream ) cudaStreamDestroy( _preloadStream ); + if( _callbackStream ) cudaStreamDestroy( _callbackStream ); + + _stream = nullptr; + _preloadStream = nullptr; + _callbackStream = nullptr; +} + +GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun ) +{ + FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue." ); + GpuDownloadBuffer r = { CreateGpuBuffer( desc, dryRun ) }; + + if( !dryRun ) + r.Reset(); + + return r; +} + +GpuDownloadBuffer GpuQueue::CreateDirectDownloadBuffer( const size_t size, IAllocator& devAllocator, const size_t alignment, const bool dryRun ) +{ + FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); + + ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one. 
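+    // Deprecated shim: only the allocator and alignment arguments are carried over into a
+    // GpuStreamDescriptor, which is then forwarded to the descriptor-based overload.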
+ GpuStreamDescriptor desc{}; + desc.entrySize = 1; + desc.entriesPerSlice = 1; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = alignment; + desc.bufferCount = 2; + desc.deviceAllocator = &devAllocator; + desc.pinnedAllocator = nullptr; + + return CreateDownloadBuffer( desc, dryRun ); +} + +GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +{ + FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); + GpuDownloadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) }; + + if( !dryRun ) + r.Reset(); + + return r; +} + +GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, const uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +{ + FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); + + ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one. + GpuStreamDescriptor desc{}; + desc.entrySize = 1; + desc.entriesPerSlice = 1; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = alignment; + desc.bufferCount = bufferCount; + desc.deviceAllocator = &devAllocator; + desc.pinnedAllocator = &pinnedAllocator; + + GpuDownloadBuffer r = { CreateGpuBuffer( desc, dryRun ) }; + + if( !dryRun ) + r.Reset(); + + return r; +} + +GpuUploadBuffer GpuQueue::CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +{ + Panic( "Deprecated" ); + FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on an DownloadQueue" ); + + GpuUploadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) }; + + if( !dryRun ) + r.Reset(); + + return r; +} + +GpuUploadBuffer GpuQueue::CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun ) +{ + FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on an DownloadQueue." ); + + GpuUploadBuffer r = { CreateGpuBuffer( desc, dryRun ) }; + + if( !dryRun ) + r.Reset(); + + return r; +} + + + +struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +{ + Panic( "Deprecated" ); + // ASSERT( 0 ); // #TODO: Deprecated function. Replace with the new one. + GpuStreamDescriptor desc{}; + desc.entrySize = 1; + desc.entriesPerSlice = size; + desc.sliceCount = BBCU_BUCKET_COUNT; + desc.sliceAlignment = alignment; + desc.bufferCount = 2; + desc.deviceAllocator = &devAllocator; + desc.pinnedAllocator = &pinnedAllocator; + + return CreateGpuBuffer( desc, dryRun ); +} + +struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun ) +{ + PanicIf( desc.bufferCount > BBCU_GPU_BUFFER_MAX_COUNT || !desc.bufferCount, "Invalid GPUBuffer buffer count." ); + PanicIf( !desc.deviceAllocator, "Null device allocator." ); + PanicIf( !desc.entrySize, "Invalid entry size." ); + PanicIf( !desc.entriesPerSlice, "Invalid entries per slice." ); + PanicIf( !desc.sliceCount || desc.sliceCount > BBCU_BUCKET_COUNT, "Invalid slice count." ); + PanicIf( !desc.sliceAlignment, "Invalid slice alignment." ); + PanicIf( desc.diskQueue && (!desc.diskFileName || !*desc.diskFileName), "Invalid disk offload config." ); + PanicIf( desc.diskQueue && !desc.pinnedAllocator, "A pinned allocator must be set in disk offload mode." 
); + + const size_t allocSize = CalculateBufferSizeFromDescriptor( desc ); + + void* devBuffers [BBCU_GPU_BUFFER_MAX_COUNT] = {}; + void* pinnedBuffers[BBCU_GPU_BUFFER_MAX_COUNT] = {}; + + for( int32 i = 0; i < desc.bufferCount; i++ ) + { + devBuffers[i] = desc.deviceAllocator->Alloc( allocSize, desc.sliceAlignment ); + + if( desc.pinnedAllocator ) + pinnedBuffers[i] = desc.pinnedAllocator->Alloc( allocSize, desc.sliceAlignment ); + } + + struct IGpuBuffer* buf = nullptr; + + if( !dryRun ) + { + buf = new IGpuBuffer{}; + + for( int32 i = 0; i < desc.bufferCount; i++ ) + { + CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) ); + CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) ); + CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) ); + // CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) ); + CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) ); + + CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) ); + CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) ); + + buf->deviceBuffer[i] = devBuffers[i]; + buf->pinnedBuffer[i] = pinnedBuffers[i]; + } + + buf->size = allocSize; + buf->bufferCount = desc.bufferCount; + buf->queue = this; + } + + // Disk offload mode? + if( desc.diskQueue ) + { + const size_t sliceSize = CalculateSliceSizeFromDescriptor( desc ); + + if( !dryRun ) + { + if( desc.bucketedDiskBuffer ) + { + buf->diskBuffer = DiskBucketBuffer::Create( + *desc.diskQueue, desc.diskFileName, + desc.sliceCount, sliceSize, + FileMode::Create, FileAccess::ReadWrite, + desc.directIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::None ); + } + else + { + buf->diskBuffer = DiskBuffer::Create( + *desc.diskQueue, desc.diskFileName, + desc.sliceCount, allocSize, + FileMode::Create, FileAccess::ReadWrite, + desc.directIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::None ); + } + + PanicIf( !buf->diskBuffer, "Failed to create DiskBuffer for GpuBuffer." ); + + void* readBuffers [2] = { nullptr, nullptr }; + void* writeBuffers[2] = { pinnedBuffers[0], pinnedBuffers[1] }; + + buf->diskBuffer->AssignBuffers( readBuffers, writeBuffers ); + } + else + { + size_t diskAllocSize = 0; + if( desc.bucketedDiskBuffer ) + { + diskAllocSize = DiskBucketBuffer::GetReserveAllocSize( *desc.diskQueue, desc.sliceCount, sliceSize ); + } + else + { + diskAllocSize = DiskBuffer::GetReserveAllocSize( *desc.diskQueue, allocSize ); + } + + ASSERT( diskAllocSize == allocSize * 4 ); + } + } + + return buf; +} + +void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent ) +{ + // #MAYBE: Perhaps support having multiple callback streams, and multiple copy streams. + + // Signal from the work stream into the callback stream that we are ready for callback + CudaErrCheck( cudaEventRecord( lockEvent, stream ) ); + + // Wait on the callback stream until it's ready to dsitpatch + CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) ); + + // #MAYBE: Use a bump allocator perhaps later to avoid locking here by new/delete if needed for performance. 
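+    // The callback is copied to the heap because cudaLaunchHostFunc only receives a raw
+    // user-data pointer and runs asynchronously; the wrapper lambda below invokes the copy
+    // and then deletes it.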
+ auto* fnCpy = new std::function( std::move( func ) ); + CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){ + + auto& fn = *reinterpret_cast*>( userData ); + fn(); + delete &fn; + + }, fnCpy ) ); + + // Signal from the callback stream that the callback finished + CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) ); + + // Wait on work stream for the callback to complete + CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) ); +} + +size_t GpuQueue::CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc ) +{ + const size_t alignment = desc.diskQueue ? desc.diskQueue->BlockSize() : desc.sliceAlignment; + return RoundUpToNextBoundaryT( desc.entrySize * desc.entriesPerSlice, alignment ); +} + +size_t GpuQueue::CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc ) +{ + return CalculateSliceSizeFromDescriptor( desc ) * desc.sliceCount; +} + +void GpuQueue::CopyPendingDownloadStream( void* userData ) +{ + auto* buf = reinterpret_cast( userData ); + + GpuQueue* queue = buf->queue; + + //const uint32 index = buf->completedSequence % buf->bufferCount; + buf->completedSequence++; + + //queue->GetCommand( CommandType::Download2D ) = buf->commands[index]; + queue->SubmitCommands(); +} + +void GpuQueue::SubmitCommands() +{ + const uint64 ticket = _commitTicketOut++; + + // Wait for our ticket to come up + while( _commitTicketIn.load( std::memory_order_relaxed ) != ticket ); + + _queue.Commit(); + _bufferReadySignal.Release(); + //_bufferReadySignal.Signal(); + + // Use our ticket + _commitTicketIn.store( ticket+1, std::memory_order_release ); +} + +GpuQueue::Command& GpuQueue::GetCommand( CommandType type ) +{ + const uint64 ticket = _cmdTicketOut++; + + // Wait for our ticket to come up + while( _cmdTicketIn.load( std::memory_order_relaxed ) != ticket ); + + Command* cmd; + while( !_queue.Write( cmd ) ) + { + Log::Line( "[GpuQueue] Queue is depleted. Waiting for copies to complete." 
); + auto waitTimer = TimerBegin(); + + // Block and wait until we have commands free in the buffer + _bufferCopiedSignal.Wait(); + + Log::Line( "[GpuQueue] Waited %.6lf seconds for availability.", TimerEnd( waitTimer ) ); + } + + // Use our ticket + _cmdTicketIn.store( ticket+1, std::memory_order_release ); + + ZeroMem( cmd ); + cmd->type = type; + + return *cmd; +} + +/// +/// Command thread +/// +void GpuQueue::QueueThreadEntryPoint( GpuQueue* self ) +{ + ASSERT( self ); + self->QueueThreadMain(); + self->_waitForExitSignal.Signal(); +} + +void GpuQueue::QueueThreadMain() +{ + const int32 CMD_BUF_SIZE = 256; + Command buffers[CMD_BUF_SIZE]; + + for( ;; ) + { + _bufferReadySignal.Wait(); + + if( ShouldExitQueueThread() ) + return; + + // 1 command per semaphore release + int32 bufCount; + while( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) ) + // if( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) ) + { + ASSERT( bufCount <= CMD_BUF_SIZE ); + _bufferCopiedSignal.Signal(); + + for( int i = 0; i < bufCount; i++ ) + ExecuteCommand( buffers[i] ); + } + } +} + +void GpuQueue::ExecuteCommand( const Command& cmd ) +{ + + // const uint32 index = cmd.sequenceId % BBCU_GPU_BUFFER_MAX_COUNT; + + if( cmd.type == CommandType::Copy ) + { + auto& cpy = *cmd.copy; + + const bool isSequentialCopy = cpy.dstStride == cpy.srcStride; + const size_t totalSize = cpy.height * cpy.width; + + byte* dst = (byte*)cpy.dstBuffer; + const byte* src = (byte*)cpy.srcBuffer; + + if( isSequentialCopy ) + memcpy( cpy.dstBuffer, cpy.srcBuffer, totalSize ); + else + { + const byte* src = (byte*)cpy.srcBuffer; + byte* dst = (byte*)cpy.dstBuffer; + + for( size_t i = 0; i < cpy.height; i++ ) + { + memcpy( dst, src, cpy.width ); + + dst += cpy.dstStride; + src += cpy.srcStride; + } + } + + cpy.self->fence.Signal( cpy.sequence+1 ); + cpy.self->copyFence.Signal( cpy.sequence+1 ); + + if( cpy.callback ) + cpy.callback( cpy.dstBuffer, totalSize, cpy.userData ); + } + else if( cmd.type == CommandType::CopyArray ) + { + + } + else if( cmd.type == CommandType::Callback ) + { + cmd.callback.callback( cmd.callback.dstbuffer, cmd.callback.copySize, cmd.callback.userData ); + } + // else if( cmd.type == CommandType::Sync ) + // { + // _syncFence.Signal(); + // return; + // } + else + { + ASSERT( 0 ); + } + + // Signal that the pinned buffer is free + //cpy.finishedSignal->Signal( cpy.sequenceId + 1 ); +} + +inline bool GpuQueue::ShouldExitQueueThread() +{ + return _exitQueueThread.load( std::memory_order_acquire ); +} + diff --git a/cuda/GpuQueue.h b/cuda/GpuQueue.h new file mode 100644 index 00000000..8adf41e5 --- /dev/null +++ b/cuda/GpuQueue.h @@ -0,0 +1,188 @@ +#pragma once + +#include "GpuStreams.h" +#include + +class DiskQueue; + +struct GpuStreamDescriptor +{ + size_t entrySize; + size_t entriesPerSlice; + uint32 sliceCount; + uint32 sliceAlignment; + uint32 bufferCount; + IAllocator* deviceAllocator; + IAllocator* pinnedAllocator; + DiskQueue* diskQueue; // DiskQueue to use when disk offload mode is enabled. + const char* diskFileName; // File name to use when disk offload mode is enabled. The diskQueue must be set. + bool bucketedDiskBuffer; // If true, a DiskBucketBuffer will be used instead of a DiskBuffer. + bool directIO; // If true, direct I/O will be used when using disk offload mode. 
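+
+    // Typical setup (see AllocateP1Buffers in CudaPlotter.cu): fill entriesPerSlice, sliceCount,
+    // sliceAlignment, bufferCount and deviceAllocator; leave pinnedAllocator null for direct
+    // transfers; entrySize is normally supplied via the CreateDownloadBufferT/CreateUploadBufferT
+    // helpers; diskQueue and diskFileName are only set when disk offload mode is used.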
+}; + +typedef std::function GpuCallbackDispath; + +class GpuQueue +{ + friend struct IGpuBuffer; + friend struct GpuDownloadBuffer; + friend struct GpuUploadBuffer; + + enum class CommandType + { + None = 0, + Copy, + CopyArray, + Callback, + }; + + struct Command + { + CommandType type; + + union + { + struct CopyInfo* copy; + + struct { + GpuDownloadCallback callback; + size_t copySize; + void* dstbuffer; + void* userData; + } callback; + }; + }; + +public: + + enum Kind + { + Downloader, + Uploader + }; + + GpuQueue( Kind kind ); + virtual ~GpuQueue(); + + static size_t CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc ); + static size_t CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc ); + + //GpuDownloadBuffer CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false ); + //GpuDownloadBuffer CreateDownloadBuffer( const size_t size, bool dryRun = false ); + GpuDownloadBuffer CreateDirectDownloadBuffer( size_t size, IAllocator& devAllocator, size_t alignment, bool dryRun = false ); + GpuDownloadBuffer CreateDownloadBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ); + GpuDownloadBuffer CreateDownloadBuffer( size_t size, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ); + + GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false ); + + /// Create with descriptor and override entry size + inline GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, size_t entrySize, bool dryRun = false ) + { + GpuStreamDescriptor copy = desc; + copy.entrySize = entrySize; + + return CreateDownloadBuffer( copy, dryRun ); + } + + template + inline GpuDownloadBuffer CreateDownloadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false ) + { + return CreateDownloadBuffer( desc, sizeof( T ), dryRun ); + } + + /// Create with descriptor and override entry size + GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false ); + + // inline GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool size_t entrySize, bool dryRun = false ) + // { + // GpuStreamDescriptor copy = desc; + // copy.entrySize = entrySize; + + // return CreateUploadBuffer( copy, dryRun ); + // } + + template + inline GpuUploadBuffer CreateUploadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false ) + { + GpuStreamDescriptor copy = desc; + copy.entrySize = sizeof(T); + + return CreateUploadBuffer( copy, dryRun ); + // return CreateUploadBuffer( desc, sizeof( T ), dryRun ); + } + + + template + inline GpuDownloadBuffer CreateDirectDownloadBuffer( const size_t count, IAllocator& devAllocator, size_t alignment = alignof( T ), bool dryRun = false ) + { + return CreateDirectDownloadBuffer( count * sizeof( T ), devAllocator, alignment, dryRun ); + } + + template + inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false ) + { + return CreateDownloadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun ); + } + + template + inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false ) + { + return CreateDownloadBuffer( count * sizeof( T ), 
bufferCount, devAllocator, pinnedAllocator, alignment, dryRun ); + } + + //GpuUploadBuffer CreateUploadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false ); + //GpuUploadBuffer CreateUploadBuffer( const size_t size, bool dryRun = false ); + GpuUploadBuffer CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ); + + template + inline GpuUploadBuffer CreateUploadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ) + { + return CreateUploadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun ); + } + + inline cudaStream_t GetStream() const { return _stream; } + +protected: + + struct IGpuBuffer* CreateGpuBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ); + struct IGpuBuffer* CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun ); + + void DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent ); + + static void CopyPendingDownloadStream( void* userData ); + + [[nodiscard]] + Command& GetCommand( CommandType type ); + void SubmitCommands(); + + // Copy threads + static void QueueThreadEntryPoint( GpuQueue* self ); + void QueueThreadMain(); + + void ExecuteCommand( const Command& cpy ); + + bool ShouldExitQueueThread(); + +protected: + cudaStream_t _stream = nullptr; + cudaStream_t _preloadStream = nullptr; + cudaStream_t _callbackStream = nullptr; + + + Thread _queueThread; + //Fence _bufferReadySignal; + Semaphore _bufferReadySignal; + Fence _bufferCopiedSignal; + Fence _syncFence; + SPCQueue _queue; + Kind _kind; + + AutoResetSignal _waitForExitSignal; + std::atomic _exitQueueThread = false; + + // Support multiple threads to grab commands + std::atomic _cmdTicketOut = 0; + std::atomic _cmdTicketIn = 0; + std::atomic _commitTicketOut = 0; + std::atomic _commitTicketIn = 0; +}; diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu index e5dcfd66..63700c9c 100644 --- a/cuda/GpuStreams.cu +++ b/cuda/GpuStreams.cu @@ -1,137 +1,105 @@ #include "GpuStreams.h" -#include "util/StackAllocator.h" +#include "GpuQueue.h" +#include "plotting/DiskBucketBuffer.h" +#include "plotting/DiskBuffer.h" -struct PackedCopy -{ - struct IGpuBuffer* self; - const byte* src; - uint32 sequence; - uint32 length; - uint32 stride; - uint32 elementSize; - uint32 counts[BBCU_BUCKET_COUNT]; -}; - -struct CopyInfo -{ - struct IGpuBuffer* self; - uint32 sequence; - - const void* srcBuffer; - void* dstBuffer; - size_t width; - size_t height; - size_t dstStride; - size_t srcStride; - - // Callback data - GpuDownloadCallback callback; - void* userData; -}; - -struct IGpuBuffer -{ - size_t size; - uint32 bufferCount; // Number of pinned/device buffers this instance contains - void* deviceBuffer [BBCU_GPU_BUFFER_MAX_COUNT]; - void* pinnedBuffer [BBCU_GPU_BUFFER_MAX_COUNT]; // Pinned host buffer - cudaEvent_t events [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use - cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the buffer is ready for consumption by the device or buffer - cudaEvent_t readyEvents [BBCU_GPU_BUFFER_MAX_COUNT]; // User must signal this event when the device buffer is ready for download - // GpuQueue::Command commands [BBCU_GPU_BUFFER_MAX_COUNT]; // Pending copy command for downloads - Fence fence; // Signals the pinned buffer is 
ready for use - Fence copyFence; - - cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT]; - - CopyInfo copies[BBCU_BUCKET_COUNT]; - PackedCopy packedCopeis[BBCU_BUCKET_COUNT]; // For uplad buffers - // #TODO: Remove atomic again - uint32 lockSequence; // Index of next buffer to lock - uint32 outgoingSequence; // Index of locked buffer that will be downoaded/uploaded - std::atomic completedSequence; // Index of buffer that finished downloading/uploading - std::atomic copySequence; - - GpuQueue* queue; -}; /// -/// DownloadBuffer +/// UploadBuffer /// -void* GpuDownloadBuffer::GetDeviceBuffer() +void* GpuUploadBuffer::GetNextPinnedBuffer() { + // Wait for the pinned host buffer to be available + //if( self->outgoingSequence > self->bufferCount-1 ) + // self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 ); + // const uint32 index = self->outgoingSequence % self->bufferCount; - CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); + void* pinnedBuffer = self->pinnedBuffer[index]; - return self->deviceBuffer[index]; + return pinnedBuffer; } -void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream ) +void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t workStream, bool directOverride ) { - ASSERT( self->lockSequence >= self->outgoingSequence ); - ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount ); + ASSERT( size ); - const uint32 index = self->lockSequence % self->bufferCount; - self->lockSequence++; + const bool isDirect = (!self->pinnedBuffer[0] || directOverride) && !self->diskBuffer; + PanicIf( isDirect && !hostBuffer, "No host buffer provided for direct upload." ); - // Wait for the device buffer to be free to be used by kernels - CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); - return self->deviceBuffer[index]; -} + const uint32 index = SynchronizeOutgoingSequence(); -void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size ) -{ - Download2D( hostBuffer, size, 1, size, size ); -} + auto uploadStream = self->queue->GetStream(); -void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride ) -{ - Download2D( hostBuffer, size, 1, size, size, workStream, directOverride ); -} - -void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream ) -{ - ASSERT( 0 ); - // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT ); - // ASSERT( hostBuffer ); - // ASSERT( workStream ); - // ASSERT( self->lockSequence > 0 ); - // ASSERT( self->outgoingSequence < self->lockSequence ); - // ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount ); - - // auto& cpy = self->copies[self->outgoingSequence]; - // cpy.self = self; - // cpy.sequence = self->outgoingSequence; - // cpy.copy.hostBuffer = finalBuffer; - // cpy.copy.srcBuffer = hostBuffer; - // cpy.copy.size = size; + DiskBuffer* diskBuffer = nullptr; + if( self->diskBuffer ) + { + // Preload data from disk into pinned buffer + + diskBuffer = dynamic_cast( self->diskBuffer ); + PanicIf( !diskBuffer, "Not a DiskBucketBuffer" ); + ASSERT( diskBuffer->GetAlignedBufferSize() >= size ); + + hostBuffer = self->pinnedBuffer[index]; + ASSERT( hostBuffer == diskBuffer->PeekReadBufferForBucket( self->outgoingSequence - 1 ) ); + ASSERT( self->outgoingSequence <= BBCU_BUCKET_COUNT ); + + CallHostFunctionOnStream( uploadStream, [=](){ + // Read on disk queue's thread + diskBuffer->ReadNextBucket(); + + // Block until the buffer is 
fully read from disk + // #TODO: Also should not do this here, but in a host-to-host background stream, + // so that the next I/O read can happen in the background while + // the previous upload to disk is happening, if needed. + (void)diskBuffer->GetNextReadBuffer(); + }); + } + else if( !isDirect ) + { + // Copy from unpinned to pinned first + // #TODO: This should be done in a different backgrund host-to-host copy stream + CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) ); + CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) ); + hostBuffer = self->pinnedBuffer[index]; + } - // const uint32 index = self->outgoingSequence % self->bufferCount; - // self->outgoingSequence++; + // Ensure the device buffer is ready for use + CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); - // void* pinnedBuffer = self->pinnedBuffer[index]; - // const void* devBuffer = self->deviceBuffer[index]; + // Upload to the device buffer + CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) ); - // // Signal from the work stream when it has finished doing kernel work with the device buffer - // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) ); + if( !isDirect ) + { + // Signal that the pinned buffer is ready for re-use + CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); + } + // Signal work stream that the device buffer is ready to be used + CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); +} - // // Ensure the work stream has completed writing data to the device buffer - // cudaStream_t stream = self->queue->_stream; +void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize ) +{ + ASSERT(0); + // ASSERT( size >= copySize ); - // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) ); + // Upload( hostBuffer, size, nullptr ); - // // Copy - // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) ); - - // // Signal that the device buffer is free to be re-used - // CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); + // // Add callback for copy + // const uint32 sequence = self->outgoingSequence - 1; + // auto& cpy = self->copies[sequence]; + // cpy.self = self; + // cpy.sequence = sequence; + // cpy.copy.hostBuffer = hostBuffer; + // cpy.copy.srcBuffer = copyBufferSrc; + // cpy.copy.size = copySize; // // Launch copy command - // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ + // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){ // const CopyInfo& c = *reinterpret_cast( userData ); // IGpuBuffer* self = c.self; @@ -140,438 +108,113 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co // cmd.copy.info = &c; // self->queue->SubmitCommands(); - - // // Signal the download completed - // self->fence.Signal( ++self->completedSequence ); // }, &cpy ) ); } -void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride ) -{ - Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride ); -} - -void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t 
srcStride, cudaStream_t workStream, bool directOverride ) -{ - Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride ); -} - -void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, - GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride ) +void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStride, + uint32 countStride, const uint32* counts, cudaStream_t workStream ) { - ASSERT( hostBuffer ); - ASSERT( workStream ); - ASSERT( self->lockSequence > 0 ); - ASSERT( self->outgoingSequence < self->lockSequence ); - ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount ); - - const uint32 index = self->outgoingSequence % self->bufferCount; + const uint32 index = SynchronizeOutgoingSequence(); + const bool isDirect = self->pinnedBuffer[0] == nullptr && !self->diskBuffer; - void* pinnedBuffer = self->pinnedBuffer[index]; - const void* devBuffer = self->deviceBuffer[index]; + auto uploadStream = self->queue->GetStream(); - const bool isDirect = directOverride || self->pinnedBuffer[0] == nullptr; ASSERT( isDirect || self->pinnedBuffer[0] ); + DiskBucketBuffer* diskBuffer = nullptr; + size_t totalBufferSize = 0; - // Signal from the work stream when it has finished doing kernel work with the device buffer - CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) ); - - // Ensure the work stream has completed writing data to the device buffer - cudaStream_t stream = self->queue->_stream; - - CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) ); - - // Ensure the pinned buffer is ready for use - if( !isDirect ) + if( self->diskBuffer ) { - // CudaErrCheck( cudaStreamWaitEvent( stream, self->completedEvents[index] ) ); - CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ - - IGpuBuffer* self = reinterpret_cast( userData ); - if( self->copySequence++ > 1 ) - { - self->copyFence.Wait( self->copySequence-1 ); - } - }, self ) ); - } + diskBuffer = dynamic_cast( self->diskBuffer ); + PanicIf( !diskBuffer, "Not a DiskBucketBuffer" ); - // Copy from device to pinned host buffer - const bool isSequentialCopy = dstStride == srcStride; - const size_t totalSize = height * width; - - if( isDirect ) - { - if( isSequentialCopy ) - CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, stream ) ); - else - CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, stream ) ); + hostBuffer = diskBuffer->PeekReadBufferForBucket( self->outgoingSequence-1 ); + ASSERT( self->outgoingSequence <= BBCU_BUCKET_COUNT ); - // Signal direct download completed - auto& cpy = self->copies[self->outgoingSequence]; - cpy.self = self; - cpy.sequence = self->outgoingSequence; - cpy.dstBuffer = hostBuffer; - cpy.callback = callback; - cpy.userData = userData; - cpy.height = height; - cpy.width = width; + // if( nextReadBucket < BBCU_BUCKET_COUNT ) + { + // Override the input slice sizes with the correct ones (as we wrote them with fixed size) + + // Preload the bucket buffer from disk + CallHostFunctionOnStream( uploadStream, [=](){ - CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ + const uint32 nextReadBucket = diskBuffer->GetNextReadBucketId(); + diskBuffer->OverrideReadSlices( nextReadBucket, elementSize, counts, countStride ); - CopyInfo& 
cpy = *reinterpret_cast( userData ); - IGpuBuffer* self = cpy.self; //reinterpret_cast( userData ); + // Preloads in the background + diskBuffer->ReadNextBucket(); - self->fence.Signal( ++self->completedSequence ); + // Upload the next one too, if needed + // #NOTE: This is a hacky way to do it for now. + // We ought to have a synchronized, separate, disk stream later + // if( nextReadBucket < BBCU_BUCKET_COUNT ) + // diskBuffer->ReadNextBucket(); + }); + } - // Dispatch callback, if one was set - if( cpy.callback ) - cpy.callback( cpy.dstBuffer, cpy.height * cpy.width, cpy.userData ); + // Wait for disk buffer to be ready + CallHostFunctionOnStream( uploadStream, [diskBuffer](){ - }, &cpy ) ); + // Wait until next buffer is ready + (void)diskBuffer->GetNextReadBuffer(); + }); } else { - CudaErrCheck( cudaMemcpyAsync( pinnedBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, stream ) ); - } - - // Signal that the device buffer is free to be re-used - CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); + // Perform fragmented uploads + const auto waitEvent = isDirect ? self->deviceEvents[index] : self->pinnedEvent[index]; + const auto copyMode = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost; - // If not a direct copy, we need to do another copy from the pinned buffer to the unpinned host buffer - if( !isDirect ) - { - // Signal the copy stream that the pinned buffer is ready to be copied to the unpinned host buffer - CudaErrCheck( cudaEventRecord( self->preloadEvents[index], stream ) ); + // Wait on device or pinned buffer to be ready (depending if a direct copy or not) + CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) ); - // Ensure the pinned buffer is ready for use - cudaStream_t copyStream = self->queue->_preloadStream; - - CudaErrCheck( cudaStreamWaitEvent( copyStream, self->preloadEvents[index] ) ); + const byte* src = (byte*)hostBuffer; + byte* dst = (byte*)( isDirect ? 
self->deviceBuffer[index] : self->pinnedBuffer[index] ); + const uint32* sizes = counts; + for( uint32 i = 0; i < length; i++ ) { - auto& cpy = self->copies[self->outgoingSequence]; - cpy.self = self; - cpy.sequence = self->outgoingSequence; - - cpy.dstBuffer = hostBuffer; - cpy.srcBuffer = pinnedBuffer; - cpy.width = width; - cpy.height = height; - cpy.srcStride = srcStride; - cpy.dstStride = dstStride; - cpy.callback = callback; - cpy.userData = userData; - - CudaErrCheck( cudaLaunchHostFunc( copyStream, []( void* userData ){ + const size_t size = *sizes * (size_t)elementSize; - CopyInfo& cpy = *reinterpret_cast( userData ); - IGpuBuffer* self = cpy.self; //reinterpret_cast( userData ); + CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) ); - auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy ); - cmd.copy = &cpy; - self->queue->SubmitCommands(); - - }, &cpy ) ); + dst += size; + src += srcStride; + sizes += countStride; } - // Signal the pinned buffer is free to be re-used - // CudaErrCheck( cudaEventRecord( self->completedEvents[index], copyStream ) ); - } - - - // Signal the download completed - // { - // auto& cpy = self->copies[self->outgoingSequence]; - // cpy.self = self; - // cpy.sequence = self->outgoingSequence; - - // cpy.copy2d.dstBuffer = hostBuffer; - // cpy.copy2d.srcBuffer = pinnedBuffer; - // cpy.copy2d.width = width; - // cpy.copy2d.height = height; - // cpy.copy2d.srcStride = srcStride; - // cpy.copy2d.dstStride = dstStride; - - // CudaErrCheck( cudaLaunchHostFunc( copyStream, []( void* userData ){ - - // CopyInfo& cpy = *reinterpret_cast( userData ); - // IGpuBuffer* self = cpy.self; //reinterpret_cast( userData ); - - // const uint32 idx = cpy.sequence & self->bufferCount; - - // const byte* src = (byte*)cpy.copy2d.srcBuffer; - // byte* dst = (byte*)cpy.copy2d.dstBuffer; - - // const size_t width = cpy.copy2d.width; - // const size_t height = cpy.copy2d.height; - // const size_t dstStride = cpy.copy2d.dstStride; - // const size_t srcStride = cpy.copy2d.srcStride; - - // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Download2D ); - // cmd.sequenceId = cpy.sequence; - // cmd.srcBuffer = src; - // cmd.dstBuffer = dst; - // cmd.download2d.buf = self; - // cmd.download2d.width = width; - // cmd.download2d.height = height; - // cmd.download2d.srcStride = srcStride; - // cmd.download2d.dstStride = dstStride; - // self->queue->SubmitCommands(); - - // // for( size_t i = 0; i < height; i++ ) - // // { - // // memcpy( dst, src, width ); - - // // dst += dstStride; - // // src += srcStride; - // // } - - // // self->fence.Signal( ++self->completedSequence ); - // }, &cpy ) ); - // } - // CudaErrCheck( cudaEventRecord( self->completedEvents[index], copyStream ) ); - - // if( callback ) - // { - // ASSERT( width <= srcStride ); - // ASSERT( width <= dstStride ); - - // auto& cpy = self->copies[self->outgoingSequence]; - // cpy.self = self; - // cpy.sequence = self->outgoingSequence; - // cpy.callback.hostBuffer = hostBuffer; - // cpy.callback.size = width * height; - // cpy.callback.callback = callback; - // cpy.callback.userData = userData; - - // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ - - // auto& cpy = *reinterpret_cast( userData ); - // auto* self = cpy.self; - - // // Fire callback command - // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Callback ); - // cmd.dstBuffer = cpy.callback.hostBuffer; - // cmd.callback.copySize = cpy.callback.size; - // cmd.callback.callback = 
cpy.callback.callback; - // cmd.callback.userData = cpy.callback.userData; - // self->queue->SubmitCommands(); - - // // Signal the download completed - // self->fence.Signal( ++self->completedSequence ); - // }, &cpy ) ); - // } - // else - // { - // // Signal the download completed - // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ - - // IGpuBuffer* self = reinterpret_cast( userData ); - // self->fence.Signal( ++self->completedSequence ); - // }, self ) ); - // } - - self->outgoingSequence++; -} - -void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, - uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData ) -{ - ASSERT( width ); - ASSERT( height ); - ASSERT( hostBuffer ); - - const uint32 index = self->outgoingSequence % self->bufferCount; - - // We need to block until the pinned buffer is available. - if( self->outgoingSequence > self->bufferCount-1 ) - self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 ); - - void* pinnedBuffer = self->pinnedBuffer[index]; - const void* devBuffer = self->deviceBuffer[index]; - - //auto& cmd = self->commands[index]; - //cmd.type = GpuQueue::CommandType::Copy2D; - //cmd.sequenceId = self->outgoingSequence++; - //cmd.finishedSignal = &self->fence; - //cmd.dstBuffer = hostBuffer; - //cmd.srcBuffer = pinnedBuffer; - //cmd.copy2d.width = width; - //cmd.copy2d.height = height; - //cmd.copy2d.dstStride = dstStride; - //cmd.copy2d.srcStride = srcStride; - //cmd.copy2d.callback = callback; - //cmd.copy2d.userData = userData; - - outIndex = index; - outPinnedBuffer = pinnedBuffer; - outDevBuffer = devBuffer; -} - - -void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize ) -{ - ASSERT( length ); - ASSERT( elementSize ); - ASSERT( counts ); - - uint32 totalElements = 0; - for( uint32 i = 0; i < length; i++ ) - totalElements += counts[i]; - - const size_t totalSize = (size_t)totalElements * elementSize; - - uint32 index; - void* pinnedBuffer; - const void* devBuffer; - GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer ); - - - srcStride *= elementSize; - - byte* dst = (byte*)pinnedBuffer; - const byte* src = (byte*)devBuffer; - - cudaStream_t stream = self->queue->_stream; - - // Copy all buffers from device to pinned buffer - for( uint32 i = 0; i < length; i++ ) - { - const size_t copySize = counts[i] * (size_t)elementSize; - - // #TODO: Determine if there's a cuda (jagged) array copy - CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) ); - - src += srcStride; - dst += copySize; + if( !isDirect ) + { + // Set the pinned buffer as the host buffer so that we can do a sequential copy to the device now + hostBuffer = self->pinnedBuffer[index]; + } } - // Signal that the device buffer is free - CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); - - // Submit command to do the final copy from pinned to host - CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) ); -} - -void GpuDownloadBuffer::WaitForCompletion() -{ - if( self->outgoingSequence > 0 ) + // Upload to device buffer if in non-direct mode + if( !isDirect ) { - //const uint32 index = (self->outgoingSequence - 1) % self->bufferCount; - - // cudaEvent_t event = self->completedEvents[index]; - //const cudaError_t r = 
cudaEventQuery( event ); - - //if( r == cudaSuccess ) - // return; - - //if( r != cudaErrorNotReady ) - // CudaErrCheck( r ); + for( uint32 i = 0; i < length; i++ ) + { + ASSERT( *counts ); + totalBufferSize += *counts * (size_t)elementSize; + counts += countStride; + } - //CudaErrCheck( cudaEventSynchronize( event ) ); - - self->fence.Wait( self->outgoingSequence ); - } -} + // #TODO: This should be done in a copy stream to perform the copies in the background + CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); + CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) ); -void GpuDownloadBuffer::WaitForCopyCompletion() -{ - if( self->outgoingSequence > 0 ) - { - self->copyFence.Wait( self->outgoingSequence ); + if( !self->diskBuffer ) + CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); } -} - -void GpuDownloadBuffer::Reset() -{ - self->lockSequence = 0; - self->outgoingSequence = 0; - self->completedSequence = 0; - self->copySequence = 0; - self->fence.Reset( 0 ); - self->copyFence.Reset( 0 ); -} - -GpuQueue* GpuDownloadBuffer::GetQueue() const -{ - return self->queue; -} - - -/// -/// UploadBuffer -/// -void* GpuUploadBuffer::GetNextPinnedBuffer() -{ - // Wait for the pinned host buffer to be available - //if( self->outgoingSequence > self->bufferCount-1 ) - // self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 ); - // - const uint32 index = self->outgoingSequence % self->bufferCount; - - void* pinnedBuffer = self->pinnedBuffer[index]; - - return pinnedBuffer; -} - -void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t workStream ) -{ - ASSERT( hostBuffer ); - ASSERT( size ); - ASSERT( self->outgoingSequence - self->lockSequence < 2 ); - // ASSERT( workStream ); - - const uint32 index = self->outgoingSequence % self->bufferCount; - self->outgoingSequence++; - - auto stream = self->queue->GetStream(); - - // Ensure the device buffer is ready for use - CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); - - // Upload to device buffer - CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, stream ) ); // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) ); + CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); } -void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize ) -{ - ASSERT(0); - // ASSERT( size >= copySize ); - - // Upload( hostBuffer, size, nullptr ); - - // // Add callback for copy - // const uint32 sequence = self->outgoingSequence - 1; - // auto& cpy = self->copies[sequence]; - // cpy.self = self; - // cpy.sequence = sequence; - // cpy.copy.hostBuffer = hostBuffer; - // cpy.copy.srcBuffer = copyBufferSrc; - // cpy.copy.size = copySize; - - // // Launch copy command - // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){ - - // const CopyInfo& c = *reinterpret_cast( userData ); - // IGpuBuffer* self = c.self; - - // auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy ); - // cmd.copy.info = &c; - - // self->queue->SubmitCommands(); - // }, &cpy ) ); -} - -void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStride, - uint32 countStride, const uint32* counts, cudaStream_t workStream ) +void 
GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length, + uint32 elementSize, uint32 srcStride, uint32 countStride, const uint32* counts ) { ASSERT( hostBuffer ); - ASSERT( self->outgoingSequence - self->lockSequence < 2 ); - - const uint32 index = self->outgoingSequence % self->bufferCount; - self->outgoingSequence++; auto stream = self->queue->GetStream(); @@ -632,28 +275,6 @@ void* GpuUploadBuffer::GetUploadedDeviceBuffer( cudaStream_t workStream ) return self->deviceBuffer[index]; } -void* GpuUploadBuffer::GetUploadedDeviceBuffer() -{ASSERT(0); // Not allowed for now - if( self->outgoingSequence < 1 ) - { - ASSERT( 0 ); - return nullptr; - } - ASSERT( 0 ); - const uint32 index = self->completedSequence % self->bufferCount; - - // #TODO: Make this spin way. - // #TODO: Find a better way to do this instead of having to wait on both primitives. - // Can't check the cuda event until we're sure it's been - // added to the stream - self->fence.Wait( self->completedSequence + 1 ); - CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); - - self->completedSequence++; - - return self->deviceBuffer[index]; -} - void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream ) { ASSERT( self->outgoingSequence > self->lockSequence ); @@ -663,7 +284,7 @@ void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream ) const uint32 index = self->lockSequence % self->bufferCount; self->lockSequence++; - CudaErrCheck( cudaEventRecord( self->events[index], workStream ) ); + CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) ); } void GpuUploadBuffer::WaitForPreloadsToComplete() @@ -674,6 +295,17 @@ void GpuUploadBuffer::WaitForPreloadsToComplete() } } +uint32 GpuUploadBuffer::SynchronizeOutgoingSequence() +{ + PanicIf( self->outgoingSequence < self->lockSequence || self->outgoingSequence - self->lockSequence >= 2, + "Invalid outgoing synchro sequence state." ); + + const uint32 index = self->outgoingSequence % self->bufferCount; + self->outgoingSequence++; + + return index; +} + void GpuUploadBuffer::Reset() { self->lockSequence = 0; @@ -689,362 +321,32 @@ GpuQueue* GpuUploadBuffer::GetQueue() const return self->queue; } - -/// -/// Shared GpuStream Inteface -/// -GpuQueue::GpuQueue( Kind kind ) : _kind( kind ) - , _bufferReadySignal( BBCU_BUCKET_COUNT ) -{ - CudaErrCheck( cudaStreamCreateWithFlags( &_stream, cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream, cudaStreamNonBlocking ) ); - - _copyThread.Run( CopyThreadEntryPoint, this ); -} - -GpuQueue::~GpuQueue() -{ - _exitCopyThread.store( true, std::memory_order_release ); - _bufferReadySignal.Release(); - _waitForExitSignal.Wait(); -} - -//void GpuQueue::Synchronize() -//{ -// (void)GetCommand( CommandType::Sync ); -// SubmitCommands(); -// -// _syncFence.Wait(); -//} - - -//GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size, bool dryRun ) -//{ -// FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); -// if( dryRun ) return { nullptr }; -// -// // #TODO: Set size? 
-// return { CreateGpuBuffer( dev0, dev1, pinned0, pinned1, size ) }; -//} - -//GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, bool dryRun ) -//{ -// FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); -// if( dryRun ) return { nullptr }; -// return { CreateGpuBuffer( size ) }; -//} - -GpuDownloadBuffer GpuQueue::CreateDirectDownloadBuffer( const size_t size, IAllocator& devAllocator, const size_t alignment, const bool dryRun ) -{ - FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); - GpuDownloadBuffer r = { CreateGpuBuffer( size, BBCU_DEFAULT_GPU_BUFFER_COUNT, &devAllocator, nullptr, alignment, dryRun ) }; - - if( !dryRun ) - r.Reset(); - - return r; -} - -GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +void GpuUploadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer ) { - FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); - GpuDownloadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) }; + ASSERT( self->pinnedBuffer[0] ); - if( !dryRun ) - r.Reset(); + void* nullBuffers[2] = { nullptr, nullptr }; + if( self->diskBuffer ) + self->diskBuffer->AssignReadBuffers( nullBuffers ); - return r; + self->diskBuffer = diskBuffer; + if( self->diskBuffer ) + self->diskBuffer->AssignReadBuffers( self->pinnedBuffer ); } -GpuDownloadBuffer GpuQueue::CreateDownloadBuffer( const size_t size, const uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +DiskBufferBase* GpuUploadBuffer::GetDiskBuffer() const { - FatalIf( _kind != Downloader, "Attempted to create GpuDownloadBuffer on an UploadQueue" ); - GpuDownloadBuffer r = { CreateGpuBuffer( size, bufferCount, &devAllocator, &pinnedAllocator, alignment, dryRun ) }; - - if( !dryRun ) - r.Reset(); - - return r; + return self->diskBuffer; } -GpuUploadBuffer GpuQueue::CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) +void GpuUploadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function func ) { - FatalIf( _kind != Uploader, "Attempted to create GpuUploadBuffer on an DownloadQueue" ); - GpuUploadBuffer r = { CreateGpuBuffer( size, devAllocator, pinnedAllocator, alignment, dryRun ) }; - - if( !dryRun ) - r.Reset(); + auto* fnCpy = new std::function( std::move( func ) ); + CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) { - return r; -} + auto& fn = *reinterpret_cast*>( userData ); + fn(); + delete& fn; - -struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ) -{ - return CreateGpuBuffer( size, BBCU_DEFAULT_GPU_BUFFER_COUNT, &devAllocator, &pinnedAllocator, alignment, dryRun ); -} - -struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size, const uint32 bufferCount, IAllocator* devAllocator, IAllocator* pinnedAllocator, size_t alignment, bool dryRun ) -{ - FatalIf( bufferCount > BBCU_GPU_BUFFER_MAX_COUNT, "GPU Buffer count overflow." 
); - - const size_t allocSize = RoundUpToNextBoundaryT( size, alignment ); - - void* devBuffers [BBCU_GPU_BUFFER_MAX_COUNT] = {}; - void* pinnedBuffers[BBCU_GPU_BUFFER_MAX_COUNT] = {}; - - for( int32 i = 0; i < bufferCount; i++ ) - { - devBuffers[i] = devAllocator->Alloc( allocSize, alignment ); - - if( pinnedAllocator ) - pinnedBuffers[i] = pinnedAllocator->Alloc( allocSize, alignment ); - } - - if( dryRun ) return nullptr; - - struct IGpuBuffer* buf = new IGpuBuffer{}; - - for( int32 i = 0; i < bufferCount; i++ ) - { - CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) ); - - buf->deviceBuffer[i] = devBuffers[i]; - buf->pinnedBuffer[i] = pinnedBuffers[i]; - // buf->commands[i] = {}; - - // Events have to be disabled initially for uploads - //if( _kind == Uploader ) - //{ - // CudaErrCheck( cudaEventSynchronize( buf->events[i] ) ); - // CudaErrCheck( cudaEventSynchronize( buf->completedEvents[i] ) ); - // CudaErrCheck( cudaEventSynchronize( buf->readyEvents[i] ) ); - //} - } - - buf->size = size; - buf->bufferCount = bufferCount; - buf->queue = this; - - return buf; -} - -//struct IGpuBuffer* GpuQueue::CreateGpuBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, const size_t size ) -//{ -// ASSERT( dev0 ); -// ASSERT( dev1 ); -// ASSERT( pinned0 ); -// ASSERT( pinned1 ); -// -// ASSERT( dev0 != dev1 ); -// ASSERT( pinned0 != pinned1 ); -// -//#if _DEBUG -// if( size ) -// { -// ASSERT_DOES_NOT_OVERLAP( dev0 , dev1 , size ); -// ASSERT_DOES_NOT_OVERLAP( dev0 , pinned0, size ); -// ASSERT_DOES_NOT_OVERLAP( dev0 , pinned1, size ); -// ASSERT_DOES_NOT_OVERLAP( dev1 , pinned0, size ); -// ASSERT_DOES_NOT_OVERLAP( dev1 , pinned1, size ); -// ASSERT_DOES_NOT_OVERLAP( pinned0, pinned1, size ); -// } -//#endif -// -// struct IGpuBuffer* buf = new IGpuBuffer(); -// -// CudaErrCheck( cudaEventCreateWithFlags( &buf->events[0], cudaEventDisableTiming ) ); -// CudaErrCheck( cudaEventCreateWithFlags( &buf->events[1], cudaEventDisableTiming ) ); -// -// buf->deviceBuffer[0] = dev0; -// buf->deviceBuffer[1] = dev1; -// -// buf->pinnedBuffer[0] = pinned0; -// buf->pinnedBuffer[1] = pinned1; -// -// buf->size = size; -// buf->fence.Reset( 0 ); -// -// buf->commands[0] = {}; -// buf->commands[1] = {}; -// -// buf->outgoingSequence = 0; -// buf->completedSequence = 0; -// -// buf->queue = this; -// -// return buf; -//} - -//struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const size_t size ) -//{ -// ASSERT( size ); -// -// void* dev0; -// void* dev1; -// void* pinned0; -// void* pinned1; -// -// CudaErrCheck( cudaMalloc( &dev0, size ) ); -// CudaErrCheck( cudaMalloc( &dev1, size ) ); -// CudaErrCheck( cudaMallocHost( &pinned0, size ) ); -// CudaErrCheck( cudaMallocHost( &pinned1, size ) ); -// -// return CreateGpuBuffer( dev0, dev1, pinned0, pinned1, size ); -//} - -void GpuQueue::CopyPendingDownloadStream( void* userData ) -{ - auto* buf = reinterpret_cast( userData ); - - GpuQueue* queue = buf->queue; - - //const uint32 index = buf->completedSequence % buf->bufferCount; - buf->completedSequence++; - - //queue->GetCommand( CommandType::Download2D ) = buf->commands[index]; - queue->SubmitCommands(); -} - -void GpuQueue::SubmitCommands() -{ - const uint64 ticket = _commitTicketOut++; - - // 
Wait for our ticket to come up - while( _commitTicketIn.load( std::memory_order_relaxed ) != ticket ); - - _queue.Commit(); - _bufferReadySignal.Release(); - //_bufferReadySignal.Signal(); - - // Use our ticket - _commitTicketIn.store( ticket+1, std::memory_order_release ); -} - -GpuQueue::Command& GpuQueue::GetCommand( CommandType type ) -{ - const uint64 ticket = _cmdTicketOut++; - - // Wait for our ticket to come up - while( _cmdTicketIn.load( std::memory_order_relaxed ) != ticket ); - - Command* cmd; - while( !_queue.Write( cmd ) ) - { - Log::Line( "[GpuQueue] Queue is depleted. Waiting for copies to complete." ); - auto waitTimer = TimerBegin(); - - // Block and wait until we have commands free in the buffer - _bufferCopiedSignal.Wait(); - - Log::Line( "[GpuQueue] Waited %.6lf seconds for availability.", TimerEnd( waitTimer ) ); - } - - // Use our ticket - _cmdTicketIn.store( ticket+1, std::memory_order_release ); - - ZeroMem( cmd ); - cmd->type = type; - - return *cmd; -} - - -/// -/// Command thread -/// -void GpuQueue::CopyThreadEntryPoint( GpuQueue* self ) -{ - ASSERT( self ); - self->CopyThreadMain(); - self->_waitForExitSignal.Signal(); -} - -void GpuQueue::CopyThreadMain() -{ - const int32 CMD_BUF_SIZE = 256; - Command buffers[CMD_BUF_SIZE]; - - for( ;; ) - { - _bufferReadySignal.Wait(); - - if( ShouldExitCopyThread() ) - return; - - // 1 command per semaphore release - int32 bufCount; - while( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) ) - // if( ( ( bufCount = _queue.Dequeue( buffers, CMD_BUF_SIZE ) ) ) ) - { - ASSERT( bufCount <= CMD_BUF_SIZE ); - _bufferCopiedSignal.Signal(); - - for( int i = 0; i < bufCount; i++ ) - ExecuteCommand( buffers[i] ); - } - } -} - -void GpuQueue::ExecuteCommand( const Command& cmd ) -{ - - // const uint32 index = cmd.sequenceId % BBCU_GPU_BUFFER_MAX_COUNT; - - if( cmd.type == CommandType::Copy ) - { - auto& cpy = *cmd.copy; - - const bool isSequentialCopy = cpy.dstStride == cpy.srcStride; - const size_t totalSize = cpy.height * cpy.width; - - byte* dst = (byte*)cpy.dstBuffer; - const byte* src = (byte*)cpy.srcBuffer; - - if( isSequentialCopy ) - memcpy( cpy.dstBuffer, cpy.srcBuffer, totalSize ); - else - { - const byte* src = (byte*)cpy.srcBuffer; - byte* dst = (byte*)cpy.dstBuffer; - - for( size_t i = 0; i < cpy.height; i++ ) - { - memcpy( dst, src, cpy.width ); - - dst += cpy.dstStride; - src += cpy.srcStride; - } - } - - cpy.self->fence.Signal( cpy.sequence+1 ); - cpy.self->copyFence.Signal( cpy.sequence+1 ); - - if( cpy.callback ) - cpy.callback( cpy.dstBuffer, totalSize, cpy.userData ); - } - else if( cmd.type == CommandType::Callback ) - { - cmd.callback.callback( cmd.callback.dstbuffer, cmd.callback.copySize, cmd.callback.userData ); - } - // else if( cmd.type == CommandType::Sync ) - // { - // _syncFence.Signal(); - // return; - // } - else - { - ASSERT( 0 ); - } - - // Signal that the pinned buffer is free - //cpy.finishedSignal->Signal( cpy.sequenceId + 1 ); -} - -inline bool GpuQueue::ShouldExitCopyThread() -{ - return _exitCopyThread.load( std::memory_order_acquire ); + }, fnCpy ) ); } diff --git a/cuda/GpuStreams.h b/cuda/GpuStreams.h index ae1a5b63..2a310059 100644 --- a/cuda/GpuStreams.h +++ b/cuda/GpuStreams.h @@ -5,22 +5,127 @@ #include "threading/Fence.h" #include "threading/Semaphore.h" #include "util/SPCQueue.h" +#include "util/StackAllocator.h" +#include -//#define GPU_BUFFER_COUNT +class DiskBufferBase; +class DiskBuffer; +class DiskBucketBuffer; +struct GpuDownloadBuffer; +struct GpuUploadBuffer; +struct 
GpuQueue;
+typedef std::function<void()> GpuStreamCallback;
+typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
+
+struct PackedCopy
+{
+    struct IGpuBuffer* self;
+    const byte*        src;
+    uint32             sequence;
+    uint32             length;
+    uint32             stride;
+    uint32             elementSize;
+    uint32             counts[BBCU_BUCKET_COUNT];
+};
+
+struct DiskDataInfo
+{
+    DiskBufferBase* diskBuffer;
+
+    union {
+        struct {
+            GpuUploadBuffer* self;
+            uint32           sequence;
+        } uploadInfo;
+
+        struct {
+            size_t srcStride;
+        } download2DInfo;
+
+        struct {
+            size_t size;
+        } downloadSequentialInfo;
+    };
+};
+
+struct CopyInfo
+{
+    struct IGpuBuffer* self;
+    uint32             sequence;
+
+    const void* srcBuffer;
+    void*       dstBuffer;
+    size_t      width;
+    size_t      height;
+    size_t      dstStride;
+    size_t      srcStride;
+
+    // Callback data
+    GpuDownloadCallback callback;
+    void*               userData;
+};
 
 // Represents a double-buffered device buffer, which can be used with a GpuStreamQueue to
 // make fast transfers (via intermediate pinned memory)
-class IAllocator;
-
 enum class GpuStreamKind : uint32
 {
     Download = 0,
     Upload
 };
 
-typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
+struct IGpuBuffer
+{
+    size_t size;
+    uint32 bufferCount;                              // Number of pinned/device buffers this instance contains
+    void*  deviceBuffer[BBCU_GPU_BUFFER_MAX_COUNT];
+    void*  pinnedBuffer[BBCU_GPU_BUFFER_MAX_COUNT];  // Pinned host buffer
+
+
+    cudaEvent_t pinnedEvent[BBCU_GPU_BUFFER_MAX_COUNT];      // Signals that the pinned buffer is ready for use
+
+    union {
+        cudaEvent_t deviceEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the device buffer is ready for use
+        cudaEvent_t events      [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use
+    };
+
+
+    union {
+        cudaEvent_t workEvent  [BBCU_GPU_BUFFER_MAX_COUNT];  // Signals that the work stream is done w/ the device buffer, and it's ready for use
+        cudaEvent_t readyEvents[BBCU_GPU_BUFFER_MAX_COUNT];  // User must signal this event when the device buffer is ready for download
+    };
+    cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT];  // Signals the buffer is ready for consumption by the device or buffer
+
+    // For dispatching host callbacks.
+    // Each buffer uses its own function?
+    cudaEvent_t callbackLockEvent;
+    cudaEvent_t callbackCompletedEvent;
+
+    Fence fence;       // Signals the pinned buffer is ready for use
+    Fence copyFence;
+
+    cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT];
+
+
+    CopyInfo copies[BBCU_BUCKET_COUNT];
+    // union {
+    //     PackedCopy packedCopeis[BBCU_BUCKET_COUNT];  // For upload buffers
+    DiskDataInfo diskData[BBCU_BUCKET_COUNT];
+    // };
+    // DiskBucketBuffer* diskBucketBuffer = nullptr;
+
+    // #TODO: Remove atomic again
+    uint32              lockSequence;       // Index of next buffer to lock
+    uint32              outgoingSequence;   // Index of locked buffer that will be downloaded/uploaded
+    std::atomic<uint32> completedSequence;  // Index of buffer that finished downloading/uploading
+    std::atomic<uint32> copySequence;
+
+    GpuQueue*       queue;        // Queue associated with this buffer
+    DiskBufferBase* diskBuffer;   // DiskBuffer, if any, used when using disk offload mode.
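+
+    // The sequence counters above index the ring of bufferCount buffers: a slot is always
+    // addressed as (sequence % bufferCount). lockSequence and outgoingSequence record how many
+    // buffers have been locked/released against the work stream and how many transfers have
+    // been issued, while completedSequence and copySequence advance as the copy thread or host
+    // callbacks finish with a buffer; fence and copyFence are signaled and waited on with
+    // these same sequence values.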
+}; + + struct GpuDownloadBuffer { @@ -79,7 +184,7 @@ struct GpuDownloadBuffer } void DownloadWithCallback( void* hostBuffer, size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream = nullptr, bool directOverride = false ); - + // Performs a direct host-to-pinned buffer copy, // and then a 2-dimensional copy from pinned buffer to host buffer // - width : Size in bytes of each row to copy @@ -98,6 +203,15 @@ struct GpuDownloadBuffer Download2D( hostBuffer, width * sizeof( T ), height, dstStride * sizeof( T ), srcStride * sizeof( T ), workStream, directOverride ); } + template + inline void Download2DWithCallbackT( T* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, + GpuDownloadCallback callback, void* userData, cudaStream_t workStream = nullptr, bool directOverride = false ) + { + Download2DWithCallback( + hostBuffer, width * sizeof( T ), height, dstStride * sizeof( T ), srcStride * sizeof( T ), + callback, userData, workStream, directOverride ); + } + // Performs several gpu-to-pinned downloads, then copies the pinned data as a contiguous buffer // to the destination host buffer void DownloadAndPackArray( void* hostBuffer, uint32 length, size_t srcStride, const uint32* counts, uint32 elementSize ); @@ -120,25 +234,37 @@ struct GpuDownloadBuffer class GpuQueue* GetQueue() const; + DiskBufferBase* GetDiskBuffer() const; + void AssignDiskBuffer( DiskBufferBase* diskBuffer ); + + void HostCallback( std::function func ); + //private: struct IGpuBuffer* self; private: + + void PerformDownload2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, + GpuDownloadCallback postCallback, void* postUserData, + cudaStream_t workStream, bool directOverride ); + void PerformDownload( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, struct CopyInfo* copy = nullptr ); void GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback = nullptr, void* userData = nullptr ); + + void CallHostFunctionOnStream( cudaStream_t stream, std::function func ); }; struct GpuUploadBuffer { - void Upload( const void* hostBuffer, size_t size, cudaStream_t workStream ); + void Upload( const void* hostBuffer, size_t size, cudaStream_t workStream, bool directOverride = false ); template - inline void UploadT( const T* hostBuffer, size_t count, cudaStream_t workStream ) + inline void UploadT( const T* hostBuffer, size_t count, cudaStream_t workStream, bool directOverride = false ) { - Upload( hostBuffer, count * sizeof( T ), workStream ); + Upload( hostBuffer, count * sizeof( T ), workStream, directOverride ); } void Upload( const void* hostBuffer, size_t size ); @@ -152,7 +278,7 @@ struct GpuUploadBuffer // Upload the host buffer, then copy the copyBufferSrc to the host buffer. Preloading // data into that hostBuffer (should be pinned) as soon as it is free so that memory is ready for the next upload. 
void UploadAndPreLoad( void* hostBuffer, size_t size, const void* copyBufferSrc, size_t copySize ); - + template inline void UploadAndPreLoadT( T* hostBuffer, const size_t count, const T* copyBufferSrc, const size_t copyCount ) { @@ -170,6 +296,9 @@ struct GpuUploadBuffer void UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStrideBytes, uint32 countStride, const uint32* counts ); + void UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length, + uint32 elementSize, uint32 srcStride, uint32 countStride, const uint32* counts ); + // srcStride here is in element count template inline void UploadArrayT( const T* hostBuffer, uint32 length, uint32 srcStride, uint32 countStride, const uint32* counts ) @@ -177,18 +306,12 @@ struct GpuUploadBuffer UploadArray( hostBuffer, length, (uint32)sizeof( T ), srcStride * (uint32)sizeof( T ), countStride, counts ); } - - void* GetUploadedDeviceBuffer( cudaStream_t workStream ); - - template - inline T* GetUploadedDeviceBufferT( cudaStream_t workStream ) { return (T*)GetUploadedDeviceBuffer( workStream ); } - // Waits until the earliest buffer has been uploaded to the GPU // and returns the device buffer. - void* GetUploadedDeviceBuffer(); + void* GetUploadedDeviceBuffer( cudaStream_t workStream ); template - inline T* GetUploadedDeviceBufferT() { return (T*)GetUploadedDeviceBuffer(); } + inline T* GetUploadedDeviceBufferT( cudaStream_t workStream ) { return (T*)GetUploadedDeviceBuffer( workStream ); } // #TODO: Pass in the buffer used as a reference so that it can be nullified, for safety. void ReleaseDeviceBuffer( cudaStream_t workStream ); @@ -205,131 +328,17 @@ struct GpuUploadBuffer class GpuQueue* GetQueue() const; + void AssignDiskBuffer( DiskBufferBase* diskBuffer ); + DiskBufferBase* GetDiskBuffer() const; + + void CallHostFunctionOnStream( cudaStream_t stream, std::function func ); + + //private: struct IGpuBuffer* self; private: + uint32 SynchronizeOutgoingSequence(); void* GetNextPinnedBuffer(); }; - -class GpuQueue -{ - friend struct IGpuBuffer; - friend struct GpuDownloadBuffer; - friend struct GpuUploadBuffer; - - enum class CommandType - { - None = 0, - Copy, - Callback, - }; - - struct Command - { - CommandType type; - - union - { - struct CopyInfo* copy; - - struct { - GpuDownloadCallback callback; - size_t copySize; - void* dstbuffer; - void* userData; - } callback; - }; - }; - -public: - - enum Kind - { - Downloader, - Uploader - }; - - GpuQueue( Kind kind ); - virtual ~GpuQueue(); - - //void Synchronize(); - - //GpuDownloadBuffer CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false ); - //GpuDownloadBuffer CreateDownloadBuffer( const size_t size, bool dryRun = false ); - GpuDownloadBuffer CreateDirectDownloadBuffer( size_t size, IAllocator& devAllocator, size_t alignment, bool dryRun = false ); - GpuDownloadBuffer CreateDownloadBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ); - GpuDownloadBuffer CreateDownloadBuffer( size_t size, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ); - - template - inline GpuDownloadBuffer CreateDirectDownloadBuffer( const size_t count, IAllocator& devAllocator, size_t alignment = alignof( T ), bool dryRun = false ) - { - return CreateDirectDownloadBuffer( count * sizeof( T ), devAllocator, alignment, dryRun ); - } - - template - inline GpuDownloadBuffer 
CreateDownloadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false ) - { - return CreateDownloadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun ); - } - - template - inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false ) - { - return CreateDownloadBuffer( count * sizeof( T ), bufferCount, devAllocator, pinnedAllocator, alignment, dryRun ); - } - - //GpuUploadBuffer CreateUploadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false ); - //GpuUploadBuffer CreateUploadBuffer( const size_t size, bool dryRun = false ); - GpuUploadBuffer CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ); - - template - inline GpuUploadBuffer CreateUploadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false ) - { - return CreateUploadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun ); - } - - inline cudaStream_t GetStream() const { return _stream; } - -protected: - - struct IGpuBuffer* CreateGpuBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun ); - struct IGpuBuffer* CreateGpuBuffer( size_t size, uint32 bufferCount, IAllocator* devAllocator, IAllocator* pinnedAllocator, size_t alignment, bool dryRun ); - //struct IGpuBuffer* CreateGpuBuffer( const size_t size ); - //struct IGpuBuffer* CreateGpuBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size ); - - static void CopyPendingDownloadStream( void* userData ); - - [[nodiscard]] - Command& GetCommand( CommandType type ); - void SubmitCommands(); - - // Copy threads - static void CopyThreadEntryPoint( GpuQueue* self ); - virtual void CopyThreadMain(); - - void ExecuteCommand( const Command& cpy ); - - bool ShouldExitCopyThread(); - -protected: - cudaStream_t _stream; - cudaStream_t _preloadStream; - Thread _copyThread; - //Fence _bufferReadySignal; - Semaphore _bufferReadySignal; - Fence _bufferCopiedSignal; - Fence _syncFence; - SPCQueue _queue; - Kind _kind; - - AutoResetSignal _waitForExitSignal; - std::atomic _exitCopyThread = false; - - // Support multiple threads to grab commands - std::atomic _cmdTicketOut = 0; - std::atomic _cmdTicketIn = 0; - std::atomic _commitTicketOut = 0; - std::atomic _commitTicketIn = 0; -}; diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu index ffa4e5fb..7fb7c5d0 100644 --- a/cuda/chacha8.cu +++ b/cuda/chacha8.cu @@ -1,5 +1,6 @@ #include "pos/chacha8.h" #include "CudaPlotContext.h" +#include "plotting/DiskBucketBuffer.h" // #TEST #if _DEBUG @@ -247,6 +248,12 @@ void GenF1Cuda( CudaK32PlotContext& cx ) cx.metaOut.WaitForCompletion(); cx.yOut .Reset(); cx.metaOut.Reset(); + + if( cx.cfg.hybrid16Mode ) + { + cx.diskContext->yBuffer->Swap(); + cx.diskContext->metaBuffer->Swap(); + } } /// diff --git a/extract-version.ps1 b/extract-version.ps1 new file mode 100644 index 00000000..c26d1c70 --- /dev/null +++ b/extract-version.ps1 @@ -0,0 +1,60 @@ +# Navigate to the script's directory +$scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent +Set-Location -Path $scriptPath + +# Arguments +$ver_component = $args[0] # The user-specified component from the full version + +# Read the 
version from the file +$version_str = (Get-Content 'VERSION' | Select-Object -First 1 | Out-String).Trim() +$bb_version_suffix = (Get-Content 'VERSION' | Select-Object -Last 1 | Out-String).Trim() +$version_header = 'src\Version.h' + +if ($version_str -eq $bb_version_suffix) { + $bb_version_suffix = "" +} + +# Prepend a '-' to the suffix, if necessary +if (-Not [string]::IsNullOrEmpty($bb_version_suffix) -and $bb_version_suffix[0] -ne '-') { + $bb_version_suffix = "-$bb_version_suffix" +} + +# Parse the major, minor, and revision numbers +$bb_ver_maj, $bb_ver_min, $bb_ver_rev = $version_str -split '\.' | ForEach-Object { $_.Trim() } + +# Get the Git commit hash +$bb_git_commit = $env:GITHUB_SHA +if ([string]::IsNullOrEmpty($bb_git_commit)) { + $bb_git_commit = & git rev-parse HEAD +} + +if ([string]::IsNullOrEmpty($bb_git_commit)) { + $bb_git_commit = "unknown" +} + +# Check if the user wants a specific component +if (-Not [string]::IsNullOrEmpty($ver_component)) { + switch ($ver_component) { + "major" { + Write-Host -NoNewline $bb_ver_maj + } + "minor" { + Write-Host -NoNewline $bb_ver_min + } + "revision" { + Write-Host -NoNewline $bb_ver_rev + } + "suffix" { + Write-Host -NoNewline $bb_version_suffix + } + "commit" { + Write-Host -NoNewline $bb_git_commit + } + default { + Write-Error "Invalid version component '$ver_component'" + exit 1 + } + } + exit 0 +} + diff --git a/src/PlotContext.h b/src/PlotContext.h index 9cf78630..7465493e 100644 --- a/src/PlotContext.h +++ b/src/PlotContext.h @@ -8,10 +8,11 @@ struct PlotRequest { - const byte* plotId; // Id of the plot we want to create - const char* outDir; // Output plot directory - const char* plotFileName; // .plot.tmp file name - const byte* memo; // Plot memo + const byte* plotId; // Id of the plot we want to create + const char* outDir; // Output plot directory + const char* plotFileName; // .plot.tmp file name + const char* plotOutPath; // Full output path for the final .plot.tmp file + const byte* memo; // Plot memo uint16 memoSize; bool isFirstPlot; bool IsFinalPlot; diff --git a/src/PlotWriter.h b/src/PlotWriter.h index 71e9e954..0255532b 100644 --- a/src/PlotWriter.h +++ b/src/PlotWriter.h @@ -3,6 +3,7 @@ #include "threading/Thread.h" #include "threading/Semaphore.h" + /** * Handles writing the final plot to disk * diff --git a/src/Types.h b/src/Types.h index 44d20992..d2364cc3 100644 --- a/src/Types.h +++ b/src/Types.h @@ -1,5 +1,7 @@ #pragma once +#include + typedef uint8_t byte; typedef uint8_t uint8; typedef uint16_t uint16; @@ -67,3 +69,13 @@ typedef uint128_t uint128; typedef std::chrono::steady_clock::duration Duration; typedef std::chrono::steady_clock::time_point TimePoint; typedef std::chrono::nanoseconds NanoSeconds; + + +template +using ptr = std::unique_ptr; + +template +using sptr = std::shared_ptr; + +template +using wptr = std::weak_ptr; \ No newline at end of file diff --git a/src/commands/CmdPlotCheck.cpp b/src/commands/CmdPlotCheck.cpp index a05beeb8..0ead02b7 100644 --- a/src/commands/CmdPlotCheck.cpp +++ b/src/commands/CmdPlotCheck.cpp @@ -1,26 +1,31 @@ -#include "Commands.h" -#include "plotting/GlobalPlotConfig.h" #include "threading/MTJob.h" +#include "util/CliParser.h" #include "tools/PlotReader.h" +#include "plotting/GlobalPlotConfig.h" #include "plotting/PlotValidation.h" #include "plotting/f1/F1Gen.h" +#include "tools/PlotChecker.h" +#include "harvesting/GreenReaper.h" -struct Config +struct PlotCheckConfig { GlobalPlotConfig* gCfg = nullptr; - uint64 proofCount = 100; - const char* plotPath = ""; + 
uint64 proofCount = 100; + std::vector plotPaths{}; + byte seed[BB_PLOT_ID_LEN]{}; + bool hasSeed = false; + bool noGpu = false; + int32 gpuIndex = -1; }; void CmdPlotsCheckHelp(); - //----------------------------------------------------------- void CmdPlotsCheckMain( GlobalPlotConfig& gCfg, CliParser& cli ) { - Config cfg = {}; + PlotCheckConfig cfg = {}; cfg.gCfg = &gCfg; while( cli.HasArgs() ) @@ -30,111 +35,75 @@ void CmdPlotsCheckMain( GlobalPlotConfig& gCfg, CliParser& cli ) CmdPlotsCheckHelp(); Exit( 0 ); } + if( cli.ReadHexStrAsBytes( cfg.seed, sizeof( cfg.seed ), "-s", "--seed" ) ) + { + cfg.hasSeed = true; + } else if( cli.ReadU64( cfg.proofCount, "-n", "--iterations" ) ) continue; + else if( cli.ReadSwitch( cfg.noGpu, "-g", "--no-gpu" ) ) continue; + else if( cli.ReadI32( cfg.gpuIndex, "-d", "--device" ) ) continue; else break; } FatalIf( !cli.HasArgs(), "Expected a path to a plot file." ); + do { - cfg.plotPath = cli.Arg(); + cfg.plotPaths.push_back( cli.Arg() ); cli.NextArg(); - - if( cli.HasArgs() ) - { - Fatal( "Unexpected argument '%s'.", cli.Arg() ); - Exit( 1 ); - } - } - - cfg.proofCount = std::max( cfg.proofCount, (uint64)1 ); - - FilePlot plot; - FatalIf( !plot.Open( cfg.plotPath ), "Failed to open plot file at '%s' with error %d.", cfg.plotPath, plot.GetError() ); - - const uint32 threadCount = gCfg.threadCount == 0 ? SysHost::GetLogicalCPUCount() : - std::min( (uint32)MAX_THREADS, std::min( gCfg.threadCount, SysHost::GetLogicalCPUCount() ) ); - - PlotReader reader( plot ); - reader.ConfigDecompressor( threadCount, gCfg.disableCpuAffinity ); - - const uint32 k = plot.K(); - - byte AlignAs(8) seed[BB_PLOT_ID_LEN] = {}; - SysHost::Random( seed, sizeof( seed ) ); - - { - std::string seedHex = BytesToHexStdString( seed, sizeof( seed ) ); - Log::Line( "Checking %llu random proofs with seed 0x%s...", (llu)cfg.proofCount, seedHex.c_str() ); } - Log::Line( "Plot compression level: %u", plot.CompressionLevel() ); - - const uint64 f7Mask = (1ull << k) - 1; - - uint64 prevF7 = 0; - uint64 proofCount = 0; - - uint64 proofXs[BB_PLOT_PROOF_X_COUNT]; - - uint64 nextPercentage = 10; - - for( uint64 i = 0; i < cfg.proofCount; i++ ) + while( cli.HasArgs() ); + + + // GreenReaperContext* grContext = nullptr; + // { + // // Pre-create decompressor here? + // grCreateContext( &grcontext, grCfg, sizeof( GreenReaperConfig ) ) + // } + + // const bool hasGPU = grHasGpuDecompressor( reader.GetDecompressorContext() ); + // if( hasGPU && !cfg.silent ) + // Log::Line( "Using GPU for decompression." ); + // else if( !cfg.silent ) + // Log::Line( "No GPU was selected for decompression." 
); + + PlotCheckerConfig checkerCfg{ + .proofCount = cfg.proofCount, + .noGpu = cfg.noGpu, + .gpuIndex = cfg.gpuIndex, + .threadCount = gCfg.threadCount, + .disableCpuAffinity = gCfg.disableCpuAffinity, + .silent = false, + .hasSeed = cfg.hasSeed, + .deletePlots = false, + .deleteThreshold = 0.0 + }; + + static_assert( sizeof( checkerCfg.seed ) == sizeof( cfg.seed ) ); + if( cfg.hasSeed ) + memcpy( checkerCfg.seed, cfg.seed, sizeof( checkerCfg.seed ) ); + + ptr checker( PlotChecker::Create( checkerCfg ) ); + + for( auto* plotPath : cfg.plotPaths ) { - const uint64 f7 = F1GenSingleForK( k, seed, prevF7 ) & f7Mask; - prevF7 = f7; - - uint64 startP7Idx = 0; - const uint64 nF7Proofs = reader.GetP7IndicesForF7( f7, startP7Idx ); - - for( uint64 j = 0; j < nF7Proofs; j++ ) + PlotCheckResult result{}; + checker->CheckPlot( plotPath, &result ); + if( !result.error.empty() ) { - uint64 p7Entry; - if( !reader.ReadP7Entry( startP7Idx + j, p7Entry ) ) - { - // #TODO: Handle error - continue; - } - - const auto r = reader.FetchProof( p7Entry, proofXs ); - if( r == ProofFetchResult::OK ) - { - // Convert to - uint64 outF7 = 0; - if( PlotValidation::ValidateFullProof( k, plot.PlotId(), proofXs, outF7 ) ) - { - if( f7 == outF7 ) - { - proofCount++; - } - else {}// #TODO: Handle error - } - else - { - // #TODO: Handle error - } - - } - else - { - // #TODO: Handle error - continue; - } + Fatal( result.error.c_str() ); } - const double percent = i / (double)cfg.proofCount * 100.0; - if( (uint64)percent == nextPercentage ) - { - Log::Line( " %llu%%...", (llu)nextPercentage ); - nextPercentage += 10; - } + Log::NewLine(); + + // Log::Line( "%llu / %llu (%.2lf%%) valid proofs found.", + // (llu)result.proofCount, (llu)cfg.proofCount, ((double)result.proofCount / cfg.proofCount) * 100.0 ); } - Log::Line( "%llu / %llu (%.2lf%%) valid proofs found.", - (llu)proofCount, (llu)cfg.proofCount, ((double)proofCount / cfg.proofCount) * 100.0 ); } //----------------------------------------------------------- void CmdPlotsCheckHelp() { -} \ No newline at end of file +} diff --git a/src/harvesting/GreenReaper.cpp b/src/harvesting/GreenReaper.cpp index 325a4982..3aae4cdd 100644 --- a/src/harvesting/GreenReaper.cpp +++ b/src/harvesting/GreenReaper.cpp @@ -353,7 +353,7 @@ GRResult grGetCompressionInfo( GRCompressionInfo* outInfo, const size_t infoStru auto c = GetCompressionInfoForLevel( compressionLevel ); outInfo->entrySizeBits = c.entrySizeBits; - outInfo->subtSizeBits = c.subtSizeBits; + outInfo->stubSizeBits = c.stubSizeBits; outInfo->tableParkSize = c.tableParkSize; outInfo->ansRValue = c.ansRValue; diff --git a/src/harvesting/GreenReaper.h b/src/harvesting/GreenReaper.h index 3fdfa6c9..499e755d 100644 --- a/src/harvesting/GreenReaper.h +++ b/src/harvesting/GreenReaper.h @@ -69,7 +69,7 @@ typedef enum GRResult typedef struct GRCompressionInfo { uint32_t entrySizeBits; - uint32_t subtSizeBits; + uint32_t stubSizeBits; size_t tableParkSize; double ansRValue; } GRCompressionInfo; @@ -165,6 +165,22 @@ GR_API GRBool grHasGpuDecompressor( GreenReaperContext* context ); GR_API GRResult grGetCompressionInfo( GRCompressionInfo* outInfo, size_t infoStructSize, uint32_t k, uint32_t compressionLevel ); +inline const char* grResultToString( const GRResult r ) +{ + switch( r ) + { + case GRResult_Failed : return "GRResult_Failed"; + case GRResult_OK : return "GRResult_OK"; + case GRResult_OutOfMemory : return "GRResult_OutOfMemory"; + case GRResult_NoProof : return "GRResult_NoProof"; + case GRResult_WrongVersion : return 
"GRResult_WrongVersion"; + case GRResult_InvalidGPU : return "GRResult_InvalidGPU"; + case GRResult_InvalidArg : return "GRResult_InvalidArg"; + } + + return "Unknown"; +} + #ifdef __cplusplus } #endif diff --git a/src/harvesting/HarvesterDummy.cpp b/src/harvesting/HarvesterDummy.cpp new file mode 100644 index 00000000..e2d8f69e --- /dev/null +++ b/src/harvesting/HarvesterDummy.cpp @@ -0,0 +1 @@ +// Only here to make CMake happy \ No newline at end of file diff --git a/src/io/FileStream.h b/src/io/FileStream.h index 3521faa1..e67d2544 100644 --- a/src/io/FileStream.h +++ b/src/io/FileStream.h @@ -31,7 +31,28 @@ class FileStream : public IStream { public: inline FileStream() {} - inline ~FileStream() + + inline FileStream( FileStream&& other ) noexcept + : _position ( other._position ) + , _access ( other._access ) + , _flags ( other._flags ) + , _error ( other._error ) + , _blockSize ( other._blockSize ) + , _fd ( other._fd ) + { + other._position = 0; + other._access = FileAccess::None; + other._flags = FileFlags::None; + other._error = 0; + other._blockSize = 0; + #if PLATFORM_IS_UNIX + other._fd = -1; + #else + other._fd = INVALID_WIN32_HANDLE; + #endif + } + + virtual inline ~FileStream() { Close(); } diff --git a/src/main.cpp b/src/main.cpp index ae568d1c..c510ed5c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -135,6 +135,7 @@ int main( int argc, const char* argv[] ) req.memoSize = plotMemoSize; req.outDir = plotOutFolder; req.plotFileName = plotFileName; + req.plotOutPath = plotOutPath; req.isFirstPlot = i == 0; req.IsFinalPlot = i == plotCount-1; @@ -177,9 +178,11 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c // The next parameter is potentially the compression level if( IsNumber( cli.Peek() ) ) cfg.compressionLevel = (uint32)cli.ReadU64(); - + continue; } + else if( cli.ReadSwitch( cfg.disableOutputDirectIO, "--no-direct-io" ) ) + continue; else if( cli.ReadStr( cfg.plotMemoStr, "--memo" ) ) continue; else if( cli.ReadSwitch( cfg.showMemo, "--show-memo" ) ) @@ -325,8 +328,10 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c DiskPlotter::PrintUsage(); else if( cli.ArgMatch( "ramplot" ) ) Log::Line( "bladebit -f ... -p/c ... ramplot " ); + #if BB_CUDA_ENABLED else if( cli.ArgMatch( "cudaplot" ) ) - Log::Line( "bladebit_cuda -f ... -p/c ... cudaplot [-d=device] " ); + CudaK32PlotterPrintHelp(); + #endif else if( cli.ArgMatch( "iotest" ) ) IOTestPrintUsage(); else if( cli.ArgMatch( "memtest" ) ) @@ -362,7 +367,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c // The remainder should be output folders, which we parse after the plotter consumes it's config /// - /// Validate global conifg + /// Validate global config /// FatalIf( farmerPublicKey == nullptr, "A farmer public key must be specified." ); FatalIf( !KeyTools::HexPKeyToG1Element( farmerPublicKey, *(cfg.farmerPublicKey = new bls::G1Element()) ), @@ -391,7 +396,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c { // #TODO: Remove this when added if( cfg.compressionLevel > 7 ) - Log::Line( "[WARNING] Compression levels greater than 7 are only for testing purposes and are not configured to the final plot size." ); + Log::Line( "WARNING: Compression levels greater than 7 are only for testing purposes and are not configured to the final plot size." 
); cfg.compressedEntryBits = 17 - cfg.compressionLevel; cfg.ctable = CreateCompressionCTable( cfg.compressionLevel, &cfg.cTableSize ); @@ -477,7 +482,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c Log::Line( " Benchmark mode : %s", cfg.benchmarkMode ? "enabled" : "disabled" ); // Log::Line( " Output path : %s", cfg.outputFolder ); // Log::Line( "" ); - + FatalIf( plotter == nullptr, "No plotter type chosen." ); @@ -486,7 +491,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c // Parse plotter-specific CLI plotter->ParseCLI( cfg, cli ); - + // Parse remaining args as output directories cfg.outputFolderCount = (uint32)cli.RemainingArgCount(); FatalIf( cfg.outputFolderCount < 1, "At least one output folder must be specified." ); @@ -498,6 +503,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c while( cli.HasArgs() ) { outPath = cli.Arg(); + FatalIf( outPath[0] == '-', "Unrecognized argument '%s'.", outPath.c_str() ); // Add trailing slash? const char endChar = outPath.back(); @@ -541,7 +547,7 @@ R"( -t, --threads : Maximum number of threads to use. By default, this is set to the maximum number of logical cpus present. - + -n, --count : Number of plots to create. Default = 1. -f, --farmer-key : Farmer public key, specified in hexadecimal format. @@ -560,7 +566,11 @@ R"( Current compression levels supported are from 0 to 7 (inclusive). Where 0 means no compression, and 7 is the highest compression. Higher compression means smaller plots, but more CPU usage during harvesting. - + + --no-direct-io : Disable direct I/O when writing plot files. + Enable this if writing to a storage destination + that does not support direct I/O. + --benchmark : Enables benchmark mode. This is meant to test plotting without actually writing a final plot to disk. @@ -582,10 +592,10 @@ R"( This is useful when running multiple simultaneous instances of Bladebit as you can manually assign thread affinity yourself when launching Bladebit. - + --memory : Display system memory available, in bytes, and the required memory to run Bladebit, in bytes. - + --memory-json : Same as --memory, but formats the output as json. --version : Display current version. 
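
As a concrete illustration of the options documented in the help text above, a hypothetical invocation combining them might look like the following; the farmer key, pool contract address, and output directory are placeholders, not values taken from this changeset:

    bladebit -t 16 -n 1 -f <farmer_public_key> -c <pool_contract_address> --no-direct-io ramplot <out_dir>

Here -t/--threads, -n/--count, and -f/--farmer-key come straight from the help text above, --no-direct-io is the new switch added in this change for write destinations that do not support direct I/O, and ramplot is one of the plotter sub-commands listed earlier; all <...> values are placeholders.
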
diff --git a/src/pch.h b/src/pch.h index 44e56636..eae8251c 100644 --- a/src/pch.h +++ b/src/pch.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "Platform.h" // Defined in Util.cpp diff --git a/src/platform/win32/SysHost_Win32.cpp b/src/platform/win32/SysHost_Win32.cpp index b1744027..cb05c0d6 100644 --- a/src/platform/win32/SysHost_Win32.cpp +++ b/src/platform/win32/SysHost_Win32.cpp @@ -684,8 +684,7 @@ bool EnableLockMemoryPrivilege() // Still have to check if it actually adjusted the privilege // #See: https://devblogs.microsoft.com/oldnewthing/20211126-00/?p=105973 - DWORD r = ::GetLastError(); - if( r != ERROR_SUCCESS ) + if( ::GetLastError() != ERROR_SUCCESS ) goto Failed; _enabledState = 1; diff --git a/src/plotdisk/DiskPlotPhase3.cpp b/src/plotdisk/DiskPlotPhase3.cpp index 6be14d56..5fac28f3 100644 --- a/src/plotdisk/DiskPlotPhase3.cpp +++ b/src/plotdisk/DiskPlotPhase3.cpp @@ -1065,7 +1065,7 @@ class P3StepTwo auto info = GetCompressionInfoForLevel( _context.cfg->globalCfg->compressionLevel ); outParkSize = info.tableParkSize; - outStubBitSize = info.subtSizeBits; + outStubBitSize = info.stubSizeBits; outCtable = _context.cfg->globalCfg->ctable; } else diff --git a/src/plotdisk/jobs/IOJob.cpp b/src/plotdisk/jobs/IOJob.cpp index 90b27da1..2f7f2e8f 100644 --- a/src/plotdisk/jobs/IOJob.cpp +++ b/src/plotdisk/jobs/IOJob.cpp @@ -146,21 +146,29 @@ bool IOJob::WriteToFile( const char* filePath, const void* writeBuffer, const si //----------------------------------------------------------- bool IOJob::WriteToFile( IStream& file, const void* writeBuffer, const size_t size, - void* fileBlockBuffer, const size_t blockSize, int& error ) + void* fileBlockBuffer, const size_t blockSize, int& error, size_t* outSizeWritten ) { error = 0; const byte* buffer = (byte*)writeBuffer; byte* blockBuffer = (byte*)fileBlockBuffer; - size_t sizeToWrite = size / blockSize * blockSize; - const size_t remainder = size - sizeToWrite; + const size_t totalSizeToWrite = size / blockSize * blockSize; ASSERT( totalSizeToWrite <= size ); + + size_t sizeToWrite = totalSizeToWrite; + const size_t remainder = size - sizeToWrite; ASSERT( remainder < blockSize ); + ASSERT( !remainder || blockBuffer ); while( sizeToWrite ) { - ssize_t sizeWritten = file.Write( buffer, sizeToWrite ); + const ssize_t sizeWritten = file.Write( buffer, sizeToWrite ); + if( sizeWritten < 1 ) { + // Output size written thus far + if( outSizeWritten ) + *outSizeWritten = totalSizeToWrite - sizeToWrite; + error = file.GetError(); return false; } @@ -171,23 +179,49 @@ bool IOJob::WriteToFile( IStream& file, const void* writeBuffer, const size_t si buffer += sizeWritten; } + // Write unaligned portion, if any if( remainder ) { - ASSERT( blockBuffer ); - + if( !blockBuffer ) + { + // All aligned data was written (if there was any) + if( outSizeWritten ) + *outSizeWritten = totalSizeToWrite; + + error = -1; + return false; + } + // Unnecessary zeroing of memory, but might be useful for debugging memset( blockBuffer, 0, blockSize ); memcpy( blockBuffer, buffer, remainder ); - ssize_t sizeWritten = file.Write( blockBuffer, blockSize ); + const ssize_t sizeWritten = file.Write( blockBuffer, blockSize ); if( sizeWritten < 1 ) { + // All aligned data was written (if there was any) + if( outSizeWritten ) + *outSizeWritten = totalSizeToWrite; + error = file.GetError(); return false; } + + // Expect to always write a full block. 
+ if( (size_t)sizeWritten != blockSize ) + { + if( outSizeWritten ) + *outSizeWritten = totalSizeToWrite + (size_t)sizeWritten; + + error = -2; + return false; + } } + if( outSizeWritten ) + *outSizeWritten = size; + return true; } @@ -332,6 +366,12 @@ bool IOJob::ReadFromFile( IStream& file, void* readBuffer, const size_t size, if( remainder ) { + if( blockBuffer == nullptr ) + { + error = -1; + return false; + } + ssize_t sizeRead = file.Read( blockBuffer, blockSize ); if( sizeRead < (ssize_t)remainder ) diff --git a/src/plotdisk/jobs/IOJob.h b/src/plotdisk/jobs/IOJob.h index ef09e807..4bfc67c0 100644 --- a/src/plotdisk/jobs/IOJob.h +++ b/src/plotdisk/jobs/IOJob.h @@ -34,8 +34,11 @@ struct IOJob : MTJob static bool WriteToFile( const char* filePath, const void* writeBuffer, const size_t size, int& error ); + // Aligned write. + // Guaranteed to write all data in the buffer, if not it returns false and sets the error. + // Negative error values are non-OS errors. static bool WriteToFile( IStream& file, const void* writeBuffer, const size_t size, - void* fileBlockBuffer, const size_t blockSize, int& error ); + void* fileBlockBuffer, const size_t blockSize, int& error, size_t* outSizeWritten = nullptr ); static bool WriteToFileUnaligned( const char* filePath, const void* writeBuffer, const size_t size, int& error ); static bool WriteToFileUnaligned( IStream& file, const void* writeBuffer, const size_t size, int& error ); diff --git a/src/plotmem/MemPhase3.cpp b/src/plotmem/MemPhase3.cpp index 9db5c54e..0c7260d0 100644 --- a/src/plotmem/MemPhase3.cpp +++ b/src/plotmem/MemPhase3.cpp @@ -185,7 +185,7 @@ uint64 MemPhase3::ProcessTable( uint32* lEntries, uint64* lpBuffer, Pair* rTable if( tableId == TableId::Table2 && cx.cfg.gCfg->compressionLevel > 0 ) { parkSize = cx.cfg.gCfg->compressionInfo.tableParkSize; - stubBitSize = cx.cfg.gCfg->compressionInfo.subtSizeBits; + stubBitSize = cx.cfg.gCfg->compressionInfo.stubSizeBits; cTable = cx.cfg.gCfg->ctable; } diff --git a/src/plotting/BufferChain.cpp b/src/plotting/BufferChain.cpp new file mode 100644 index 00000000..43a7e47b --- /dev/null +++ b/src/plotting/BufferChain.cpp @@ -0,0 +1,72 @@ +#include "BufferChain.h" +#include "util/IAllocator.h" + +BufferChain::BufferChain( uint32 bufferCount, size_t bufferSize ) + : _buffers ( new byte*[bufferCount], bufferCount ) + , _bufferSize ( bufferSize ) +{} + +BufferChain::~BufferChain() +{ + delete[] _buffers.Ptr(); +} + +BufferChain* BufferChain::Create( IAllocator& allocator, uint32 bufferCount, size_t bufferSize, size_t bufferAlignment, bool dryRun ) +{ + PanicIf( !bufferSize, "" ); + PanicIf( !bufferCount, "" ); + PanicIf( !bufferAlignment, "" ); + + BufferChain* self = nullptr; + if( !dryRun ) + self = new BufferChain( bufferCount, bufferSize ); + + for( uint32 i = 0; i < bufferCount; i++ ) + { + byte* buffer = allocator.AllocT( bufferSize, bufferAlignment ); + + if( !dryRun ) + self->_buffers[i] = buffer; + } + + return self; +} + +byte* BufferChain::PeekBuffer( const uint32 index ) +{ + return _buffers[index % (uint32)_buffers.Length()]; +} + +byte* BufferChain::GetNextBuffer() +{ + const uint32 bufferCount = (uint32)_buffers.Length(); + + PanicIf( _nextBufferToRelease > _nextBufferToLock, "" ); + PanicIf( _nextBufferToLock - _nextBufferToRelease > bufferCount, "" ); + + if( _nextBufferToLock >= bufferCount ) + { + _fence.Wait( _nextBufferToLock - bufferCount + 1 ); + } + + return PeekBuffer( _nextBufferToLock++ ); +} + +void BufferChain::ReleaseNextBuffer() +{ + PanicIf( _nextBufferToRelease >= 
_nextBufferToLock, "" ); + PanicIf(_nextBufferToLock - _nextBufferToRelease > (uint32)_buffers.Length(), "" ); + + _fence.Signal( ++_nextBufferToRelease ); +} + +void BufferChain::Reset() +{ + // Wait for the last buffer to be released + _fence.Wait( _nextBufferToLock ); + + // Reset state + _fence.Reset( 0 ); + _nextBufferToRelease = 0; + _nextBufferToLock = 0; +} diff --git a/src/plotting/BufferChain.h b/src/plotting/BufferChain.h new file mode 100644 index 00000000..edb934a7 --- /dev/null +++ b/src/plotting/BufferChain.h @@ -0,0 +1,44 @@ +#pragma once +#include "threading/Fence.h" +#include "util/Span.h" + +class IAllocator; + +/// Maintains a chain of buffers which is to be used (and re-used) with first-out, first-in semantics. +/// #NOTE: The caller is expected to free the buffers, as we don't own them, just use them. +class BufferChain +{ + BufferChain( uint32 bufferCount, size_t bufferSize ); + +public: + ~BufferChain(); + + static BufferChain* Create( IAllocator& allocator, uint32 bufferCount, size_t bufferSize, size_t bufferAlignment, bool dryRun ); + + /// Get the pointer to a buffer that will be used for a certain index + /// without actually waiting for it to be available. + byte* PeekBuffer( uint32 index ); + + /// Blocks calling thread until the next buffer in the chain + /// is ready for use, and returns it. + byte* GetNextBuffer(); + + /// Releases the earliest locked buffer + void ReleaseNextBuffer(); + + /// Blocks the calling thread until all outstanding buffers have been released, + /// and resets its state to the first buffer index again. + void Reset(); + + inline size_t BufferSize() const { return _bufferSize; } + + inline uint32 BufferCount() const { return (uint32)_buffers.Length(); } + +private: + Fence _fence; + Span _buffers; + IAllocator* _allocator = nullptr; + size_t _bufferSize = 0; // Size of each individual buffer + uint32 _nextBufferToLock = 0; + uint32 _nextBufferToRelease = 0; +}; diff --git a/src/plotting/Compression.cpp b/src/plotting/Compression.cpp index d1db2973..bde4313b 100644 --- a/src/plotting/Compression.cpp +++ b/src/plotting/Compression.cpp @@ -2,6 +2,7 @@ #include "plotting/FSETableGenerator.h" #include "util/Util.h" #include +#include // Caches for C and D tables static std::atomic _cTableCache[32] = {}; @@ -62,7 +63,7 @@ template void GetCompressionInfoForLevel( CompressionInfo& info ) { info.entrySizeBits = CompressionLevelInfo::ENTRY_SIZE; - info.subtSizeBits = CompressionLevelInfo::STUB_BIT_SIZE; + info.stubSizeBits = CompressionLevelInfo::STUB_BIT_SIZE; info.tableParkSize = CompressionLevelInfo::TABLE_PARK_SIZE; info.ansRValue = CompressionLevelInfo::ANS_R_VALUE; } @@ -140,4 +141,19 @@ uint32 GetCompressedLPBitCount( const uint32 compressionLevel ) // lpBitSize = lpBitSize * 2 - 1; return lpBitSize * 2 - 1; +} + +size_t GetLargestCompressedParkSize() +{ + return std::max( { + GetCompressionInfoForLevel( 1 ).tableParkSize, + GetCompressionInfoForLevel( 2 ).tableParkSize, + GetCompressionInfoForLevel( 3 ).tableParkSize, + GetCompressionInfoForLevel( 4 ).tableParkSize, + GetCompressionInfoForLevel( 5 ).tableParkSize, + GetCompressionInfoForLevel( 6 ).tableParkSize, + GetCompressionInfoForLevel( 7 ).tableParkSize, + GetCompressionInfoForLevel( 8 ).tableParkSize, + GetCompressionInfoForLevel( 9 ).tableParkSize } + ); } \ No newline at end of file diff --git a/src/plotting/Compression.h b/src/plotting/Compression.h index babb379f..dbb01228 100644 --- a/src/plotting/Compression.h +++ b/src/plotting/Compression.h @@ -4,7 +4,7 @@ struct 
CompressionInfo { uint32_t entrySizeBits; - uint32_t subtSizeBits; + uint32_t stubSizeBits; size_t tableParkSize; double ansRValue; }; @@ -16,6 +16,7 @@ FSE_CTable* CreateCompressionCTable( const uint32_t compressionLevel, size_t FSE_DTable* CreateCompressionDTable( const uint32_t compressionLevel, size_t* outTableSize = nullptr ); CompressionInfo GetCompressionInfoForLevel( const uint32_t compressionLevel ); uint32_t GetCompressedLPBitCount( const uint32_t compressionLevel ); +size_t GetLargestCompressedParkSize(); template struct CompressionLevelInfo diff --git a/src/plotting/DiskBucketBuffer.cpp b/src/plotting/DiskBucketBuffer.cpp new file mode 100644 index 00000000..39c15e70 --- /dev/null +++ b/src/plotting/DiskBucketBuffer.cpp @@ -0,0 +1,238 @@ +#include "DiskBucketBuffer.h" +#include "DiskQueue.h" +#include "plotdisk/jobs/IOJob.h" +#include "util/IAllocator.h" +#include "util/StackAllocator.h" +#include + +DiskBucketBuffer::DiskBucketBuffer( DiskQueue& queue, FileStream& stream, const char* name, + uint32 bucketCount, size_t sliceCapacity ) + : DiskBufferBase( queue, stream, name, bucketCount ) + , _sliceCapacity( RoundUpToNextBoundaryT( sliceCapacity, queue.BlockSize() ) ) + // , _writeSliceStride( _sliceCapacity ) // Start writing horizontally + // , _readSliceStride( _sliceCapacity * bucketCount ) +{ + ASSERT( bucketCount > 0 ); + + _writeSliceSizes.resize( bucketCount ); + _readSliceSizes .resize( bucketCount ); + for( size_t bucket = 0; bucket < bucketCount; bucket++ ) + { + _writeSliceSizes[bucket].resize( bucketCount ); + _readSliceSizes [bucket].resize( bucketCount ); + } +} + +DiskBucketBuffer::~DiskBucketBuffer() +{} + +DiskBucketBuffer* +DiskBucketBuffer::Create( DiskQueue& queue, const char* fileName, + uint32 bucketCount, size_t sliceCapacity, + FileMode mode, FileAccess access, FileFlags flags ) +{ + FileStream file; + if( !DiskBufferBase::MakeFile( queue, fileName, mode, access, flags, file ) ) + return nullptr; + + return new DiskBucketBuffer( queue, file, fileName, bucketCount, sliceCapacity ); +} + +size_t DiskBucketBuffer::GetSingleBucketBufferSize( DiskQueue& queue, uint32 bucketCount, size_t sliceCapacity ) +{ + return RoundUpToNextBoundaryT( sliceCapacity, queue.BlockSize() ) * bucketCount; +} + +size_t DiskBucketBuffer::GetReserveAllocSize( DiskQueue& queue, uint32 bucketCount, size_t sliceCapacity ) +{ + return DiskBufferBase::GetReserveAllocSize( + GetSingleBucketBufferSize( queue, bucketCount, sliceCapacity ), + queue.BlockSize() ); +} + +void DiskBucketBuffer::ReserveBuffers( IAllocator& allocator ) +{ + DiskBufferBase::ReserveBuffers( allocator, GetBucketRowStride(), _queue->BlockSize() ); +} + +void DiskBucketBuffer::Swap() +{ + DiskBufferBase::Swap(); + + // std::swap( _writeSliceStride, _readSliceStride ); + _verticalWrite = !_verticalWrite; + std::swap( _writeSliceSizes, _readSliceSizes ); +} + +void DiskBucketBuffer::Submit( const size_t sliceStride ) +{ + PanicIf( sliceStride > _sliceCapacity, "Invalid slice stride %llu is greater than capacity %llu for %s.", + (llu)sliceStride, (llu)_sliceCapacity, Name() ); + + const uint32 bucket = BeginWriteSubmission(); + + DiskQueueDispatchCommand dcmd = {}; + auto& cmd = dcmd.bucketBufferCmd; + + cmd.type = DiskBucketBufferCommand::Write; + auto& c = cmd.write; + + c.sliceStride = sliceStride; + c.bucket = bucket; + c.vertical = _verticalWrite; + + _queue->EnqueueDispatchCommand( this, dcmd ); + + // Record slice sizes (write 1 column cell per row) + // At the end of a table a bucket row will have + // all the 
slice sizes of a given bucket. + for( uint32 row = 0; row < _bucketCount; row++ ) + { + _writeSliceSizes[row][bucket] = sliceStride;//sliceSizes[row]; + } + + // Signal completion + EndWriteSubmission(); +} + +void DiskBucketBuffer::ReadNextBucket() +{ + const uint32 bucket = BeginReadSubmission(); + + DiskQueueDispatchCommand dcmd = {}; + auto& cmd = dcmd.bucketBufferCmd; + + cmd.type = DiskBucketBufferCommand::Read; + auto& c = cmd.read; + c.bucket = bucket; + c.vertical = _verticalWrite; // If the last write was NOT vertical, then the read is vertical. + + _queue->EnqueueDispatchCommand( this, dcmd ); + + EndReadSubmission(); +} + +Span DiskBucketBuffer::PeekReadBuffer( const uint32 bucket ) +{ + size_t totalSize = 0; + for( auto sz : _readSliceSizes[bucket] ) + totalSize += sz; + + return Span( _readBuffers[bucket % 2], totalSize ); +} + +void DiskBucketBuffer::OverrideReadSlices( const uint32 bucket, const size_t elementSize, const uint32* sliceSizes, const uint32 stride ) +{ + size_t totalSize = 0; + + auto& readSlices = _readSliceSizes[bucket]; + ASSERT( readSlices.size() == _bucketCount ); + + for( size_t i = 0; i < _bucketCount; i++ ) + { + readSlices[i] = *sliceSizes * elementSize; + sliceSizes += stride; + } +} + + +/// +/// These are executed from the DiskQueue thread +/// +void DiskBucketBuffer::HandleCommand( const DiskQueueDispatchCommand& cmd ) +{ + const auto& c = cmd.bucketBufferCmd; + + switch( c.type ) + { + default: + Panic( "Unexpected." ); + break; + + case DiskBucketBufferCommand::Write: + CmdWriteSlices( c ); + break; + + case DiskBucketBufferCommand::Read: + CmdReadSlices( c ); + break; + } +} + +void DiskBucketBuffer::CmdWriteSlices( const DiskBucketBufferCommand& cmd ) +{ + auto & c = cmd.write; + int err = 0; + + const byte* src = (byte*)_writeBuffers[c.bucket % 2]; + const size_t srcStride = c.sliceStride; + const size_t dstStride = c.vertical ? GetBucketRowStride() : GetSliceStride(); + + // Offset to the starting location + int64 offset = (int64)(c.vertical ? _sliceCapacity * c.bucket : GetBucketRowStride() * c.bucket ); + + // Seek to starting location + for( uint32 i = 0; i < _bucketCount; i++ ) + { + // Seek to next slice + FatalIf( !_file.Seek( offset, SeekOrigin::Begin ), + "Failed to seek to slice %u start on '%s/%s' with error %d.", + i, _queue->Path(), Name(), (int32)_file.GetError() ); + offset += (int64)dstStride; + + // Write slice + if( !IOJob::WriteToFileUnaligned( _file, src, srcStride, err ) ) + { + Fatal( "Failed to write slice on '%s/%s' with error %d.", _queue->Path(), Name(), err ); + } + + src += srcStride; + } +} + +void DiskBucketBuffer::CmdReadSlices( const DiskBucketBufferCommand& cmd ) +{ + const auto& c = cmd.read; + + int err = 0; + + byte* dst = _readBuffers[c.bucket % 2]; + + const size_t rowStride = GetBucketRowStride(); + const size_t sliceStride = GetSliceStride(); + + // Use the last slice as a temp buffer (to avoid the slower memmove on most copies) + byte* tmpBuffer = dst + sliceStride * (_bucketCount-1); + + for( size_t i = 0; i < _bucketCount; i++ ) + { + // Seek to starting location of the slice + const size_t colOffset = c.vertical ? sliceStride * c.bucket : sliceStride * i; + const size_t rowOffset = c.vertical ? 
rowStride * i : rowStride * c.bucket; + + if( !_file.Seek( (int64)(rowOffset + colOffset), SeekOrigin::Begin ) ) + { + Fatal( "Failed to seek to slice %u start on '%s/%s' with error %d.", + i, _queue->Path(), Name(), (int32)_file.GetError() ); + } + + // Read a full block-aligned slice + if( !IOJob::ReadFromFileUnaligned( _file, tmpBuffer, sliceStride, err ) ) + { + if( err != 0 || i + 1 < _bucketCount ) + { + Fatal( "Failed to read slice from '%s/%s' with error %d.", _queue->Path(), Name(), err ); + } + } + + // Copy read buffer to actual location + const size_t sliceSize = _readSliceSizes[c.bucket][i]; + + if( i + 1 < _bucketCount ) + memcpy( dst, tmpBuffer, sliceSize ); + else + memmove( dst, tmpBuffer, sliceSize ); // Last copy overlaps since it's the same as the temp buffer + + dst += sliceSize; + } +} diff --git a/src/plotting/DiskBucketBuffer.h b/src/plotting/DiskBucketBuffer.h new file mode 100644 index 00000000..ec50cc8b --- /dev/null +++ b/src/plotting/DiskBucketBuffer.h @@ -0,0 +1,97 @@ +#pragma once +#include "DiskBufferBase.h" + +/** + * A disk-backed buffer which read/writes in buckets and slices. Where a slice is a a portion + * of data that belongs to a bucket. The number of slices is equal to n_buckets * n_buckets. + * Where each bucket has n_buckets slices. + * The data layout can be visualized as a grid, where each cell of the grid represents a slice. + * And depending on the manner of writing, each row or column of the grid represents a bucket. + * The manner or writing and reading is swapped between tables. When horizontal (row-writes) are + * performed, then column reads must subsequently be performed. + * This is because each write consists of a row of slices, each for a different bucket. Therefore if we previously + * wrote as a row, (full sequential write), then when we've finished writing all of the rows, all of a bucket's + * data will be found in a column (vertically). If we write vertically, then + * the opposite is true, and the bucket's data is found in a single row (horizontally). + */ +class DiskBucketBuffer : public DiskBufferBase +{ + DiskBucketBuffer( DiskQueue& queue, FileStream& stream, const char* name, uint32 bucketCount, size_t sliceCapacity ); + +public: + static DiskBucketBuffer* Create( DiskQueue& queue, const char* fileName, + uint32 bucketCount, size_t sliceCapacity, + FileMode mode, FileAccess access, FileFlags flags ); + + static size_t GetSingleBucketBufferSize( DiskQueue& queue, uint32 bucketCount, size_t sliceCapacity ); + static size_t GetReserveAllocSize( DiskQueue& queue, uint32 bucketCount, size_t sliceCapacity ); + + virtual ~DiskBucketBuffer(); + + void ReserveBuffers( IAllocator& allocator ) override; + + void Swap() override; + + /** + * Submit next write buffer and track the actual + * size of each submitted slice. + */ + // void Submit( const Span sliceSizes ); + + /** + * Submit next write buffer w/ fixed source stride. + * sliceStride must be <= the slice capacity. + * It ought to be used when the slices are tracked by the + * user separately, and it will be read with a slice override. + */ + void Submit( size_t sliceStride ); + + /** + * Assumes the sliceStride is the same as the maximum slice capacity. 
+ */ + inline void Submit() { Submit( GetSliceStride() ); } + + /** + * Read next bucket + */ + void ReadNextBucket() override; + + inline size_t GetSliceStride() const { return _sliceCapacity; } + + inline size_t GetBucketRowStride() const { return _sliceCapacity * _bucketCount; } + + template + inline Span GetNextWriteBufferAs() + { + return Span( reinterpret_cast( GetNextWriteBuffer() ), GetBucketRowStride() ); + } + + template + inline Span GetNextReadBufferAs() + { + size_t totalSize = 0; + for( auto sz : _readSliceSizes[_nextReadLock] ) + totalSize += sz; + + return Span( reinterpret_cast( GetNextReadBuffer() ), totalSize / sizeof( T ) ); + } + + Span PeekReadBuffer( uint32 bucket ); + + void OverrideReadSlices( uint32 bucket, size_t elementSize, const uint32* sliceSizes, uint32 stride ); + +private: + void HandleCommand( const DiskQueueDispatchCommand& cmd ) override; + void CmdWriteSlices( const DiskBucketBufferCommand& cmd ); + void CmdReadSlices( const DiskBucketBufferCommand& cmd ); + +private: + size_t _sliceCapacity; // Maximum size of each slice + + bool _verticalWrite = false; + // size_t _writeSliceStride; // Offset to the start of the next slices when writing + // size_t _readSliceStride; // Offset to the start of the next slice when reading (these are swapped between tables). + + std::vector> _writeSliceSizes = {}; + std::vector> _readSliceSizes = {}; +}; diff --git a/src/plotting/DiskBuffer.cpp b/src/plotting/DiskBuffer.cpp new file mode 100644 index 00000000..0fb3f44a --- /dev/null +++ b/src/plotting/DiskBuffer.cpp @@ -0,0 +1,119 @@ +#include "DiskBuffer.h" +#include "DiskQueue.h" +#include "plotdisk/jobs/IOJob.h" + +DiskBuffer* DiskBuffer::Create( DiskQueue& queue, const char* fileName, uint32 bucketCount, + size_t bufferSize, FileMode mode, FileAccess access, FileFlags flags ) +{ + FileStream file; + if( !DiskBufferBase::MakeFile( queue, fileName, mode, access, flags, file ) ) + return nullptr; + + return new DiskBuffer( queue, file, fileName, bucketCount, bufferSize ); +} + +DiskBuffer::DiskBuffer( DiskQueue& queue, FileStream& stream, const char* name, + uint32 bucketCount, size_t bufferSize ) + : DiskBufferBase( queue, stream, name, bucketCount ) + , _bufferSize( bufferSize ) + , _alignedBufferSize( RoundUpToNextBoundaryT( bufferSize, _file.BlockSize() ) ) +{ + _bucketSizes.resize( bucketCount ); +} + +DiskBuffer::~DiskBuffer() {} + +void DiskBuffer::ReserveBuffers( IAllocator& allocator ) +{ + DiskBufferBase::ReserveBuffers( allocator, _alignedBufferSize, _file.BlockSize() ); +} + +size_t DiskBuffer::GetReserveAllocSize( DiskQueue& queue, size_t bufferSize ) +{ + const size_t alignment = queue.BlockSize(); + + return DiskBufferBase::GetReserveAllocSize( RoundUpToNextBoundaryT( bufferSize, alignment ), alignment ); +} + +void DiskBuffer::Swap() +{ + DiskBufferBase::Swap(); + + FatalIf( !_file.Seek( 0, SeekOrigin::Begin ), "Failed to seek to file start on '%s/%s' with error %d.", + _queue->Path(), Name(), (int32)_file.GetError() ); +} + +void DiskBuffer::ReadNextBucket() +{ + FatalIf( _nextReadBucket >= _bucketCount, "'%s' Read bucket overflow.", Name() ); + + // Read whole bucket + DiskQueueDispatchCommand dcmd = {}; + auto& cmd = dcmd.bufferCmd; + cmd.type = DiskBufferCommand::Read; + + auto& c = cmd.read; + c.bucket = _nextReadBucket; + + _queue->EnqueueDispatchCommand( this, dcmd ); + _queue->SignalFence( _readFence, ++_nextReadBucket ); +} + +void DiskBuffer::Submit( const size_t size ) +{ + FatalIf( (int64)_nextWriteLock - (int64)_nextWriteBucket > 2, "Invalid 
write lock state for '%s'.", _name.c_str() ); + FatalIf( size > _alignedBufferSize, "Write submission too large for '%s'.", _name.c_str() ); + + DiskQueueDispatchCommand dcmd = {}; + auto& cmd = dcmd.bufferCmd; + cmd.type = DiskBufferCommand::Write; + + auto& c = cmd.write; + c.bucket = _nextWriteBucket; + _queue->EnqueueDispatchCommand( this, dcmd ); + + // Signal completion + _queue->SignalFence( _writeFence, ++_nextWriteBucket ); +} + +void DiskBuffer::HandleCommand( const DiskQueueDispatchCommand& cmd ) +{ + const auto& c = cmd.bufferCmd; + + switch( c.type ) + { + case DiskBufferCommand::None: + ASSERT( 0 ); + break; + case DiskBufferCommand::Write: + CmdWrite( c ); + break; + case DiskBufferCommand::Read: + CmdRead( c ); + break; + } +} + +void DiskBuffer::CmdWrite( const DiskBufferCommand& cmd ) +{ + const auto& c = cmd.write; + + // Write a full block-aligned bucket + int err = 0; + if( !IOJob::WriteToFileUnaligned( _file, _writeBuffers[c.bucket % 2], _alignedBufferSize, err ) ) + { + Fatal( "Failed to write bucket to '%s/%s' with error %d.", _queue->Path(), Name(), err ); + } +} + +void DiskBuffer::CmdRead( const DiskBufferCommand& cmd ) +{ + const auto& c = cmd.read; + + // Read a full block-aligned bucket + int err = 0; + if( !IOJob::ReadFromFileUnaligned( _file, _readBuffers[c.bucket % 2], _alignedBufferSize, err ) ) + { + Fatal( "Failed to read bucket from '%s/%s' with error %d.", _queue->Path(), Name(), err ); + } +} diff --git a/src/plotting/DiskBuffer.h b/src/plotting/DiskBuffer.h new file mode 100644 index 00000000..2bcba94a --- /dev/null +++ b/src/plotting/DiskBuffer.h @@ -0,0 +1,58 @@ +#pragma once +#include "DiskBufferBase.h" + +/** + * Sequential disk buffer that whose actions are dispatched on a DiskQueue. + * This performs block-aligned reads and writes. + */ +class DiskBuffer : public DiskBufferBase +{ + DiskBuffer( DiskQueue& queue, FileStream& stream, const char* name, uint32 bucketCount, size_t bufferSize ); + +public: + static DiskBuffer* Create( DiskQueue& queue, const char* fileName, + uint32 bucketCount, size_t bufferSize, + FileMode mode, FileAccess access, FileFlags flags ); + + virtual ~DiskBuffer(); + + void ReserveBuffers( IAllocator& allocator ) override; + + static size_t GetReserveAllocSize( DiskQueue& queue, size_t bufferSize ); + + inline size_t GetAlignedBufferSize() const + { + return _alignedBufferSize; + } + + void ReadNextBucket() override; + void Swap() override; + + void Submit( size_t size ); + + template + inline Span GetNextWriteBufferAs() + { + return Span( reinterpret_cast( GetNextWriteBuffer() ), GetAlignedBufferSize() ); + } + + template + inline Span GetNextReadBufferAs() + { + return Span( reinterpret_cast( GetNextReadBuffer() ), GetAlignedBufferSize() / sizeof( T ) ); + } + +protected: + void HandleCommand( const DiskQueueDispatchCommand& cmd ) override; + +private: + /// Command handlers + void CmdWrite( const DiskBufferCommand& cmd ); + void CmdRead( const DiskBufferCommand& cmd ); + +private: + size_t _bufferSize; // Requested buffer size + size_t _alignedBufferSize; // Block-aligned requested buffer size + + std::vector _bucketSizes; // The actual (unaligned) size of each bucket. 
+}; diff --git a/src/plotting/DiskBufferBase.cpp b/src/plotting/DiskBufferBase.cpp new file mode 100644 index 00000000..38a26270 --- /dev/null +++ b/src/plotting/DiskBufferBase.cpp @@ -0,0 +1,203 @@ +#include "DiskBufferBase.h" +#include "DiskQueue.h" +#include "util/IAllocator.h" +#include "util/StackAllocator.h" +#include + +bool DiskBufferBase::MakeFile( DiskQueue& queue, const char* name, + FileMode mode, FileAccess access, FileFlags flags, FileStream& file ) +{ + ASSERT( !file.IsOpen() ); + + std::string path = std::filesystem::path( queue.Path() ).append( name ).string(); + + return file.Open( path.c_str(), mode, access, flags ); +} + +DiskBufferBase::DiskBufferBase( DiskQueue& queue, FileStream& stream, + const char* name, uint32 bucketCount ) + : _queue ( &queue ) + , _file ( std::move( stream ) ) + , _name ( name ) + , _bucketCount( bucketCount ) +{} + +DiskBufferBase::~DiskBufferBase() +{ + _file.Close(); + std::string path = std::filesystem::path( _queue->Path() ).append( _name ).string(); + ::remove( path.c_str() ); + + // #TODO: Track the allocator used, and only release if we have that reference. + // if( _writeBuffers[0] ) bbvirtfreebounded( _writeBuffers[0] ); + // if( _writeBuffers[1] ) bbvirtfreebounded( _writeBuffers[1] ); + // if( _readBuffers[0] ) bbvirtfreebounded( _readBuffers[0] ); + // if( _readBuffers[1] ) bbvirtfreebounded( _readBuffers[1] ); +} + +void DiskBufferBase::ReserveBufferForInstance( DiskBufferBase* self, IAllocator& allocator, const size_t size, const size_t alignment ) +{ + if( self ) + { + PanicIf( self->_writeBuffers[0], "Buffers already reserved for '%s'.", self->_name.c_str() ); + } + + byte* w0 = allocator.AllocT( size, alignment ); + byte* w1 = allocator.AllocT( size, alignment ); + byte* r0 = allocator.AllocT( size, alignment ); + byte* r1 = allocator.AllocT( size, alignment ); + + if( self ) + { + self->_writeBuffers[0] = w0; + self->_writeBuffers[1] = w1; + self->_readBuffers [0] = r0; + self->_readBuffers [1] = r1; + } +} + +size_t DiskBufferBase::GetReserveAllocSize( const size_t size, const size_t alignment ) +{ + DummyAllocator allocator; + ReserveBufferForInstance( nullptr, allocator, size, alignment ); + + return allocator.Size(); +} + +void DiskBufferBase::ReserveBuffers( IAllocator& allocator, const size_t size, const size_t alignment ) +{ + ReserveBufferForInstance( this, allocator, size, alignment ); +} + +void DiskBufferBase::AssignBuffers( void* readBuffers[2], void* writeBuffers[2] ) +{ + AssignReadBuffers( readBuffers ); + AssignWriteBuffers( writeBuffers ); +} + +void DiskBufferBase::AssignReadBuffers( void* readBuffers[2] ) +{ + // PanicIf( _readBuffers[0], "Read buffers already assigned for '%s'.", _name.c_str() ); + _readBuffers [0] = (byte*)readBuffers [0]; + _readBuffers [1] = (byte*)readBuffers [1]; +} + +void DiskBufferBase::AssignWriteBuffers( void* writeBuffers[2] ) +{ + // PanicIf( _writeBuffers[0], "Write buffers already assigned for '%s'.", _name.c_str() ); + _writeBuffers[0] = (byte*)writeBuffers[0]; + _writeBuffers[1] = (byte*)writeBuffers[1]; +} + + +void DiskBufferBase::ShareBuffers( const DiskBufferBase& other ) +{ + _writeBuffers[0] = other._writeBuffers[0]; + _writeBuffers[1] = other._writeBuffers[1]; + _readBuffers [0] = other._readBuffers [0]; + _readBuffers [1] = other._readBuffers [1]; +} + +void DiskBufferBase::Swap() +{ +// FatalIf( !_file.Seek( 0, SeekOrigin::Begin ), "Failed to seek '%s'.", _name.c_str() ); + WaitForLastWriteToComplete(); + + _nextWriteBucket = 0; + _nextReadBucket = 0; + 
_nextWriteLock = 0; + _nextReadLock = 0; + + _readFence .Reset(); + _writeFence.Reset(); +} + +void* DiskBufferBase::GetNextWriteBuffer() +{ + PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." ); + PanicIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() ); + + void* buf = _writeBuffers[_nextWriteLock % 2]; + PanicIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() ); + + if( _nextWriteLock++ >= 2 ) + WaitForWriteToComplete( _nextWriteLock-2 ); + + return buf; +} + +void* DiskBufferBase::PeekReadBufferForBucket( uint32 bucket ) +{ + PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." ); + return _readBuffers[bucket % 2]; +} + +void* DiskBufferBase::PeekWriteBufferForBucket( const uint32 bucket ) +{ + PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." ); + return _writeBuffers[bucket % 2]; +} + +void DiskBufferBase::WaitForWriteToComplete( const uint32 bucket ) +{ + _writeFence.Wait( bucket + 1 ); +} + +void DiskBufferBase::WaitForLastWriteToComplete() +{ + if( _nextWriteBucket < 1 ) + return; + + WaitForWriteToComplete( _nextWriteBucket-1 ); +} + +void* DiskBufferBase::GetNextReadBuffer() +{ + PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." ); + PanicIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() ); + + void* buf = _readBuffers[_nextReadLock % 2]; + PanicIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() ); + + WaitForReadToComplete( _nextReadLock++ ); + return buf; +} + +void DiskBufferBase::WaitForReadToComplete( const uint32 bucket ) +{ + _readFence.Wait( bucket + 1 ); +} + +void DiskBufferBase::WaitForNextReadToComplete() +{ + FatalIf( _nextReadBucket < 1, "Nothing yet read for '%s'.", _name.c_str() ); + FatalIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() ); + + Panic( "Unsupported. Nothing to see here." ); + + // # TODO: Don't use this as is, it is not intuitive and can causes errors. + // Use GetNextReadBuffer() or WaitForReadToComplete() instead. + WaitForReadToComplete( _nextReadBucket-1 ); +} + +uint32 DiskBufferBase::BeginWriteSubmission() +{ + FatalIf( (int64)_nextWriteLock - (int64)_nextWriteBucket > 2, "Invalid write lock state for '%s'.", _name.c_str() ); + return _nextWriteBucket; +} + +void DiskBufferBase::EndWriteSubmission() +{ + _queue->SignalFence( _writeFence, ++_nextWriteBucket ); +} + +uint32 DiskBufferBase::BeginReadSubmission() +{ + FatalIf( _nextReadBucket >= _bucketCount, "'%s' Read bucket overflow.", Name() ); + return _nextReadBucket; +} + +void DiskBufferBase::EndReadSubmission() +{ + _queue->SignalFence( _readFence, ++_nextReadBucket ); +} diff --git a/src/plotting/DiskBufferBase.h b/src/plotting/DiskBufferBase.h new file mode 100644 index 00000000..d361281f --- /dev/null +++ b/src/plotting/DiskBufferBase.h @@ -0,0 +1,117 @@ +#pragma once +#include "io/FileStream.h" +#include "threading/Fence.h" +#include "DiskQueue.h" +#include "util/Span.h" + + +class IAllocator; + +/** + * Dual-buffered base class for DiskQueue-based writing and reading. 
+ */ +class DiskBufferBase +{ + friend class DiskQueue; + +protected: + static bool MakeFile( DiskQueue& queue, const char* name, FileMode mode, FileAccess access, FileFlags flags, FileStream& file ); + + DiskBufferBase( DiskQueue& queue, FileStream& stream, const char* name, uint32 bucketCount ); + + virtual void HandleCommand( const DiskQueueDispatchCommand& cmd ) = 0; + + static void ReserveBufferForInstance( DiskBufferBase* self, IAllocator& allocator, size_t size, size_t alignment ); + void ReserveBuffers( IAllocator& allocator, size_t size, size_t alignment ); + + static size_t GetReserveAllocSize( size_t size, const size_t alignment ); +public: + + virtual void ReserveBuffers( IAllocator& allocator ) = 0; + + /// Assigns already existing buffers to be used as I/O buffers + void AssignBuffers( void* readBuffers[2], void* writeBuffers[2] ); + void AssignReadBuffers( void* readBuffers[2] ); + void AssignWriteBuffers( void* writeBuffers[2] ); + + /// Takes the same buffers that another DiskBufferBase uses and shares them. + void ShareBuffers( const DiskBufferBase& other ); + + /// Read next bucket + virtual void ReadNextBucket() = 0; + + /// Waits for the last write to finish + /// and marks completion of writing and reading a table. + virtual void Swap(); + + void* GetNextWriteBuffer(); + void* GetNextReadBuffer(); + + void* PeekReadBufferForBucket( uint32 bucket ); + + /// Gets the write buffer for a certain bucket without waiting for it (unsafe) + void* PeekWriteBufferForBucket( uint32 bucket ); + + void WaitForWriteToComplete( uint32 bucket ); + void WaitForLastWriteToComplete(); + + void WaitForReadToComplete( uint32 bucket ); + void WaitForNextReadToComplete(); + + + inline const char* Name() const + { + return _name.c_str(); + } + + inline FileStream& File() const + { + return const_cast( this )->_file; + } + + /// Helpers + inline bool TryReadNextBucket() + { + if( _nextReadBucket >= _bucketCount ) + return false; + + ReadNextBucket(); + return true; + } + + inline uint32 GetNextReadBucketId() const + { + ASSERT( _nextReadBucket < _bucketCount ); + return _nextReadBucket; + } + +public: + virtual ~DiskBufferBase(); + +protected: + /** + * Returns the bucket about to be written. 
+ */ + uint32 BeginWriteSubmission(); + void EndWriteSubmission(); + + uint32 BeginReadSubmission(); + void EndReadSubmission(); + +protected: + DiskQueue* _queue; + FileStream _file; + std::string _name; + + uint32 _bucketCount; + Fence _writeFence; + Fence _readFence; + + byte* _writeBuffers[2] = {}; + byte* _readBuffers [2] = {}; + + uint32 _nextWriteBucket = 0; // Next bucket that will be written to disk + uint32 _nextReadBucket = 0; // Next bucket that will be read from disk + uint32 _nextWriteLock = 0; // Next write bucket buffer index that will be locked (for user use) + uint32 _nextReadLock = 0; // Next read bucket buffer index that will be locked (for user use) +}; diff --git a/src/plotting/DiskQueue.cpp b/src/plotting/DiskQueue.cpp new file mode 100644 index 00000000..8906efd4 --- /dev/null +++ b/src/plotting/DiskQueue.cpp @@ -0,0 +1,64 @@ +#include "DiskQueue.h" +#include "DiskBucketBuffer.h" +#include "io/FileStream.h" +#include "threading/Fence.h" +#include "plotdisk/jobs/IOJob.h" + +DiskQueue::DiskQueue( const char* path ) + : Super() + , _path( path ) +{ + ASSERT( path ); + + _blockSize = FileStream::GetBlockSizeForPath( path ); + FatalIf( _blockSize < 1, "Failed to obtain file system block size for path '%s'", path ); + + StartConsumer(); +} + +DiskQueue::~DiskQueue() +{} + +void DiskQueue::ProcessCommands( const Span items ) +{ + for( uint32 item = 0; item < items.Length(); item++ ) + { + auto& cmd = items[item]; + + switch( cmd.type ) + { + case DiskQueueCommand::DispatchDiskBufferCommand: + cmd.dispatch.sender->HandleCommand( cmd.dispatch.cmd ); + break; + + case DiskQueueCommand::Signal: + cmd.signal.fence->Signal( (uint32)cmd.signal.value ); + break; + + default: + ASSERT(0); + break; + } + } +} + +void DiskQueue::EnqueueDispatchCommand( DiskBufferBase* sender, const DiskQueueDispatchCommand& cmd ) +{ + // #TODO: Don't copy and just have them send in a DiskQueueCommand? 
+ DiskQueueCommand c; + c.type = DiskQueueCommand::DispatchDiskBufferCommand; + c.dispatch.sender = sender; + c.dispatch.cmd = cmd; + + this->Submit( c ); +} + +void DiskQueue::SignalFence( Fence& fence, uint64 value ) +{ + DiskQueueCommand c; + c.type = DiskQueueCommand::Signal; + c.signal.fence = &fence; + c.signal.value = value; + + this->Submit( c ); +} diff --git a/src/plotting/DiskQueue.h b/src/plotting/DiskQueue.h new file mode 100644 index 00000000..b3a7a145 --- /dev/null +++ b/src/plotting/DiskQueue.h @@ -0,0 +1,121 @@ +#pragma once +#include "threading/Thread.h" +#include "threading/AutoResetSignal.h" +#include "util/MPMCQueue.h" +#include "util/CommandQueue.h" + +class IStream; +class Fence; +class DiskBufferBase; + +struct DiskBufferCommand +{ + enum Type + { + None = 0, + Write, + Read, + }; + + Type type; + + union + { + struct { + uint32 bucket; + } write; + + struct { + uint32 bucket; + } read; + }; +}; + +struct DiskBucketBufferCommand +{ + enum Type + { + None = 0, + Write, + Read, + // Seek, + // Close, + }; + + Type type; + + union + { + struct { + size_t sliceStride; + uint32 bucket; + bool vertical; + } write; + + struct { + uint32 bucket; + bool vertical; + } read; + }; +}; + +union DiskQueueDispatchCommand +{ + DiskBufferCommand bufferCmd; + DiskBucketBufferCommand bucketBufferCmd; +}; + +struct DiskQueueCommand +{ + static constexpr uint32 MAX_STACK_COMMANDS = 64; + + enum Type + { + None = 0, + DispatchDiskBufferCommand, + Signal, + }; + + Type type; + + union + { + struct { + DiskBufferBase* sender; + DiskQueueDispatchCommand cmd; + } dispatch; + + struct { + Fence* fence; + uint64 value; + } signal; + }; +}; + +class DiskQueue : public MPCommandQueue +{ + using Super = MPCommandQueue; + + friend class DiskBufferBase; + friend class DiskBuffer; + friend class DiskBucketBuffer; + +public: + DiskQueue( const char* path ); + ~DiskQueue(); + + inline const char* Path() const { return _path.c_str(); } + inline size_t BlockSize() const { return _blockSize; } + +protected: + void ProcessCommands( const Span items ) override; + +private: + void EnqueueDispatchCommand( DiskBufferBase* sender, const DiskQueueDispatchCommand& cmd ); + void SignalFence( Fence& fence, uint64 value ); + +private: + std::string _path; // Storage directory + size_t _blockSize = 0; // File system block size at path +}; + diff --git a/src/plotting/PlotWriter.cpp b/src/plotting/PlotWriter.cpp index 6e0785aa..3d3440c7 100644 --- a/src/plotting/PlotWriter.cpp +++ b/src/plotting/PlotWriter.cpp @@ -2,16 +2,19 @@ #include "ChiaConsts.h" #include "plotdisk/jobs/IOJob.h" #include "plotdisk/DiskBufferQueue.h" +#include "harvesting/GreenReaper.h" //----------------------------------------------------------- PlotWriter::PlotWriter() : PlotWriter( true ) {} //----------------------------------------------------------- PlotWriter::PlotWriter( bool useDirectIO ) - : _queue() - , _writerThread( new Thread( 4 MiB ) ) + : _writerThread( new Thread( 4 MiB ) ) , _directIO ( useDirectIO ) + , _queue() { + _readyToPlotSignal.Signal(); // Start ready to plot + // #MAYBE: Start the thread at first plot? 
_writerThread->Run( WriterThreadEntry, this ); } @@ -40,12 +43,25 @@ PlotWriter::~PlotWriter() bbvirtfree( _writeBuffer.Ptr() ); } +//----------------------------------------------------------- +void PlotWriter::EnablePlotChecking( PlotChecker& checker ) +{ + _plotChecker = &checker; +} + //----------------------------------------------------------- bool PlotWriter::BeginPlot( PlotVersion version, const char* plotFileDir, const char* plotFileName, const byte plotId[32], const byte* plotMemo, const uint16 plotMemoSize, const uint32 compressionLevel ) { - return BeginPlotInternal( version, plotFileDir, plotFileName, plotId, plotMemo, plotMemoSize, compressionLevel ); + _readyToPlotSignal.Wait(); + + const bool r = BeginPlotInternal( version, plotFileDir, plotFileName, plotId, plotMemo, plotMemoSize, compressionLevel ); + + if( !r ) + _readyToPlotSignal.Signal(); + + return r; } //----------------------------------------------------------- @@ -259,7 +275,6 @@ bool PlotWriter::BeginPlotInternal( PlotVersion version, return true; } - //----------------------------------------------------------- void PlotWriter::EndPlot( const bool rename ) { @@ -267,10 +282,32 @@ void PlotWriter::EndPlot( const bool rename ) ASSERT( _stream.IsOpen() ); - auto& cmd = GetCommand( CommandType::EndPlot ); - cmd.endPlot.fence = &_completedFence; - cmd.endPlot.rename = rename; - SubmitCommands(); + // auto& cmd = GetCommand( CommandType::EndPlot ); + // cmd.endPlot.fence = &_completedFence; + // cmd.endPlot.rename = rename; + // SubmitCommands(); + + SubmitCommand({ .type = CommandType::EndPlot, + .endPlot{ .fence = &_completedFence, + .rename = rename + } + }); +} + +//----------------------------------------------------------- +bool PlotWriter::CheckPlot() +{ + if( _dummyMode || !_plotChecker ) return false; + + const char* plotPath = _plotPathBuffer.Ptr(); + + PlotCheckResult checksResult{}; + _plotChecker->CheckPlot( plotPath, &checksResult ); + + if( !checksResult.error.empty() ) + return false; + + return !checksResult.deleted; } @@ -322,9 +359,14 @@ void PlotWriter::BeginTable( const PlotTable table ) { if( _dummyMode ) return; - auto& cmd = GetCommand( CommandType::BeginTable ); - cmd.beginTable.table = table; - SubmitCommands(); + SubmitCommand({ + .type = CommandType::BeginTable, + .beginTable{ .table = table } + }); + // auto& cmd = GetCommand( CommandType::BeginTable ); + // auto cmd = GetCommand( CommandType::BeginTable ); + // cmd.beginTable.table = table; + // SubmitCommands(); } //----------------------------------------------------------- @@ -332,10 +374,18 @@ void PlotWriter::ReserveTableSize( const PlotTable table, const size_t size ) { if( _dummyMode ) return; - auto& cmd = GetCommand( CommandType::ReserveTable ); - cmd.reserveTable.table = table; - cmd.reserveTable.size = size; - SubmitCommands(); + // auto& cmd = GetCommand( CommandType::ReserveTable ); + // cmd.reserveTable.table = table; + // cmd.reserveTable.size = size; + // SubmitCommands(); + + SubmitCommand({ + .type = CommandType::ReserveTable, + .reserveTable { + .table = table, + .size = size + } + }); } //----------------------------------------------------------- @@ -343,8 +393,9 @@ void PlotWriter::EndTable() { if( _dummyMode ) return; - auto& cmd = GetCommand( CommandType::EndTable ); - SubmitCommands(); + // auto& cmd = GetCommand( CommandType::EndTable ); + // SubmitCommands(); + SubmitCommand({ .type = CommandType::EndTable }); } //----------------------------------------------------------- @@ -352,10 +403,16 @@ void 
PlotWriter::WriteTableData( const void* data, const size_t size ) { if( _dummyMode ) return; - auto& cmd = GetCommand( CommandType::WriteTable ); - cmd.writeTable.buffer = (byte*)data; - cmd.writeTable.size = size; - SubmitCommands(); + // auto& cmd = GetCommand( CommandType::WriteTable ); + // cmd.writeTable.buffer = (byte*)data; + // cmd.writeTable.size = size; + // SubmitCommands(); + + SubmitCommand({ .type = CommandType::WriteTable, + .writeTable{ .buffer = (byte*)data, + .size = size, + } + }); } //----------------------------------------------------------- @@ -363,41 +420,90 @@ void PlotWriter::WriteReservedTable( const PlotTable table, const void* data ) { if( _dummyMode ) return; - auto& cmd = GetCommand( CommandType::WriteReservedTable ); - cmd.writeReservedTable.table = table; - cmd.writeReservedTable.buffer = (byte*)data; - SubmitCommands(); + // auto& cmd = GetCommand( CommandType::WriteReservedTable ); + // cmd.writeReservedTable.table = table; + // cmd.writeReservedTable.buffer = (byte*)data; + // SubmitCommands(); + + SubmitCommand({ .type = CommandType::WriteReservedTable, + .writeReservedTable{ + .table = table, + .buffer = (byte*)data + } + }); } //----------------------------------------------------------- void PlotWriter::SignalFence( Fence& fence ) { - if( _dummyMode ) fence.Signal(); + if( _dummyMode ) + { + fence.Signal(); + return; + } + + // auto& cmd = GetCommand( CommandType::SignalFence ); + // cmd.signalFence.fence = &fence; + // cmd.signalFence.sequence = -1; + // SubmitCommands(); - auto& cmd = GetCommand( CommandType::SignalFence ); - cmd.signalFence.fence = &fence; - cmd.signalFence.sequence = -1; - SubmitCommands(); + SubmitCommand({ .type = CommandType::SignalFence, + .signalFence{ .fence = &fence, + .sequence = -1 + } + }); } //----------------------------------------------------------- void PlotWriter::SignalFence( Fence& fence, uint32 sequence ) { - if( _dummyMode ) fence.Signal( sequence ); + if( _dummyMode ) + { + fence.Signal( sequence ); + return; + } + + // auto& cmd = GetCommand( CommandType::SignalFence ); + // cmd.signalFence.fence = &fence; + // cmd.signalFence.sequence = (int64)sequence; + // SubmitCommands(); + + SubmitCommand({ .type = CommandType::SignalFence, + .signalFence{ .fence = &fence, + .sequence = (int64)sequence + } + }); +} + +//----------------------------------------------------------- +void PlotWriter::CallBack( std::function func ) +{ + if( _dummyMode ) + { + func(); + return; + } + + // auto& cmd = GetCommand( CommandType::CallBack ); + // cmd.callback.func = new std::function( std::move( func ) ); + // SubmitCommands(); - auto& cmd = GetCommand( CommandType::SignalFence ); - cmd.signalFence.fence = &fence; - cmd.signalFence.sequence = (int64)sequence; - SubmitCommands(); + SubmitCommand({ .type = CommandType::CallBack, + .callback{ .func = new std::function( std::move( func ) ) } + }); } //----------------------------------------------------------- void PlotWriter::ExitWriterThread() { // Signal writer thread to exit after it finishes its commands - auto& cmd = GetCommand( CommandType::Exit ); - cmd.signalFence.fence = &_completedFence; - SubmitCommands(); + // auto& cmd = GetCommand( CommandType::Exit ); + // cmd.signalFence.fence = &_completedFence; + // SubmitCommands(); + + SubmitCommand({ .type = CommandType::Exit, + .signalFence{ .fence = &_completedFence } + }); // Wait for writer thread to exit _completedFence.Wait(); @@ -407,50 +513,60 @@ void PlotWriter::ExitWriterThread() 
//----------------------------------------------------------- PlotWriter::Command& PlotWriter::GetCommand( CommandType type ) { - if( _owner != nullptr ) - { - auto* cmd = _owner->GetCommandObject( DiskBufferQueue::Command::CommandType::PlotWriterCommand ); - ASSERT( cmd ); - - ZeroMem( &cmd->plotWriterCmd ); - cmd->plotWriterCmd.writer = this; - cmd->plotWriterCmd.cmd.type = type; - return cmd->plotWriterCmd.cmd; - } - else - { - Command* cmd = nullptr; - while( !_queue.Write( cmd ) ) - { - Log::Line( "[PlotWriter] Command buffer full. Waiting for commands." ); - auto waitTimer = TimerBegin(); - - // Block and wait until we have commands free in the buffer - _cmdConsumedSignal.Wait(); + Panic( "Don't use me!" ); + + // if( _owner != nullptr ) + // { + // auto* cmd = _owner->GetCommandObject( DiskBufferQueue::Command::CommandType::PlotWriterCommand ); + // ASSERT( cmd ); + + // ZeroMem( &cmd->plotWriterCmd ); + // cmd->plotWriterCmd.writer = this; + // cmd->plotWriterCmd.cmd.type = type; + // return cmd->plotWriterCmd.cmd; + // } + // else + // { + // Command* cmd = nullptr; + // while( !_queue.Write( cmd ) ) + // { + // Log::Line( "[PlotWriter] Command buffer full. Waiting for commands." ); + // auto waitTimer = TimerBegin(); + + // // Block and wait until we have commands free in the buffer + // _cmdConsumedSignal.Wait(); - Log::Line( "[PlotWriter] Waited %.6lf seconds for a Command to be available.", TimerEnd( waitTimer ) ); - } + // Log::Line( "[PlotWriter] Waited %.6lf seconds for a Command to be available.", TimerEnd( waitTimer ) ); + // } - ASSERT( cmd ); - ZeroMem( cmd ); - cmd->type = type; + // ASSERT( cmd ); + // ZeroMem( cmd ); + // cmd->type = type; - return *cmd; - } + // return *cmd; + // } } //----------------------------------------------------------- -void PlotWriter::SubmitCommands() +void PlotWriter::SubmitCommand( const Command cmd ) { - if( _owner != nullptr ) - { - _owner->CommitCommands(); - } - else - { - _queue.Commit(); - _cmdReadySignal.Signal(); - } + std::unique_lock lock( _queueLock ); + _queue.push( cmd ); + _cmdReadySignal.Signal(); +} + +//----------------------------------------------------------- +void PlotWriter::SubmitCommands() +{Panic( "" ); + // if( _owner != nullptr ) + // { + // _owner->CommitCommands(); + // } + // else + // { + // _queue.Commit(); + // _cmdReadySignal.Signal(); + // } } @@ -475,12 +591,48 @@ void PlotWriter::WriterThreadMain() _cmdReadySignal.Wait(); // Load commands from the queue - int32 cmdCount; - while( ( ( cmdCount = _queue.Dequeue( commands, MAX_COMMANDS ) ) ) ) + // int32 cmdCount; + // while( ( ( cmdCount = _queue.Dequeue( commands, MAX_COMMANDS ) ) ) ) + // { + // // Notify we consumed commands + // _cmdConsumedSignal.Signal(); + + // for( int32 i = 0; i < cmdCount; i++ ) + // { + // if( commands[i].type == CommandType::Exit ) + // { + // commands[i].signalFence.fence->Signal(); + // return; + // } + + // ExecuteCommand( commands[i] ); + // } + // } + + // Consume commands from the queue and execute them + // until there are none more found in the queue + size_t cmdCount = 0; + for( ;; ) { + // Consume commands from queue + { + std::unique_lock lock( _queueLock ); + cmdCount = std::min( _queue.size(), MAX_COMMANDS ); + + for( size_t i = 0; i < cmdCount; i++ ) + { + commands[i] = _queue.front(); + _queue.pop(); + } + } + // Notify we consumed commands _cmdConsumedSignal.Signal(); + if( cmdCount < 1 ) + break; + + // Execute commands for( int32 i = 0; i < cmdCount; i++ ) { if( commands[i].type == CommandType::Exit ) @@ 
-508,6 +660,7 @@ void PlotWriter::ExecuteCommand( const Command& cmd ) case CommandType::ReserveTable : CmdReserveTable( cmd ); break; case CommandType::WriteReservedTable : CmdWriteReservedTable( cmd ); break; case CommandType::EndPlot : CmdEndPlot( cmd ); break; + case CommandType::CallBack : CmdCallBack( cmd ); break; case CommandType::SignalFence: if( cmd.signalFence.sequence >= 0 ) @@ -527,7 +680,7 @@ void PlotWriter::SeekToLocation( const size_t location ) // - The seeked-to block already existed const size_t blockSize = _stream.BlockSize(); - const size_t currentAlignedLocation = _position / blockSize * blockSize; + // const size_t currentAlignedLocation = _position / blockSize * blockSize; const size_t alignedLocation = location / blockSize * blockSize; if( _bufferBytes ) @@ -618,16 +771,51 @@ void PlotWriter::WriteData( const byte* src, const size_t size ) ASSERT( (copySize + _bufferBytes) / blockSize * blockSize == (copySize + _bufferBytes) ); memcpy( writeBuffer + _bufferBytes, src, copySize ); - - const size_t writeSize = _bufferBytes + copySize; + + size_t writeSize = _bufferBytes + copySize; sizeToWrite -= writeSize; src += copySize; _bufferBytes = 0; ASSERT( writeSize / blockSize * blockSize == writeSize ); - PanicIf( !IOJob::WriteToFile( _stream, writeBuffer, writeSize, nullptr, blockSize, err ), - "Failed to write to plot with error %d:", err ); + + size_t totalSizeWritten = 0; + size_t sizeWritten = 0; + while( !IOJob::WriteToFile( _stream, writeBuffer, writeSize, nullptr, blockSize, err, &sizeWritten ) ) + { + ASSERT( writeSize / blockSize * blockSize == writeSize ); + + bool isOutOfSpace = false; + + #if !defined( _WIN32 ) + isOutOfSpace = err == ENOSPC; + #else + // #TODO: Add out of space error check for windows + #endif + + // Wait indefinitely until there's more space + if( isOutOfSpace ) + { + const long SLEEP_TIME = 10 * (long)1000; + + Log::Line( "No space left in plot output directory for plot '%s'. 
Waiting %.1lf seconds before trying again...",
+                    this->_plotPathBuffer.Ptr(), (double)SLEEP_TIME/1000.0 );
+                Thread::Sleep( SLEEP_TIME );
+            }
+            else
+                Log::Line( "Error %d encountered when writing to plot '%s'.", err, this->_plotPathBuffer.Ptr() );
+
+            totalSizeWritten += sizeWritten;
+            if( totalSizeWritten >= writeSize )
+                break;
+
+            ASSERT( sizeWritten <= writeSize );
+
+            writeBuffer += sizeWritten;
+            writeSize   -= sizeWritten;
+            sizeWritten  = 0;
+        }
 }
 
@@ -783,8 +971,14 @@ void PlotWriter::CmdEndPlot( const Command& cmd )
     FlushRetainedBytes();
     _stream.Close();
 
+    bool renamePlot = cmd.endPlot.rename;
+    if( _plotChecker )
+    {
+        renamePlot = CheckPlot();
+    }
+
     // Now rename to its final non-temp name
-    if( cmd.endPlot.rename )
+    if( renamePlot )
     {
         const uint32 RETRY_COUNT  = 10;
         const long   MS_WAIT_TIME = 1000;
@@ -820,6 +1014,15 @@ void PlotWriter::CmdEndPlot( const Command& cmd )
         }
     }
 
+    _readyToPlotSignal.Signal();
     cmd.endPlot.fence->Signal();
 }
 
+//-----------------------------------------------------------
+void PlotWriter::CmdCallBack( const Command& cmd )
+{
+    ASSERT( cmd.type == CommandType::CallBack );
+
+    (*cmd.callback.func)();
+    delete cmd.callback.func;
+}
\ No newline at end of file
diff --git a/src/plotting/PlotWriter.h b/src/plotting/PlotWriter.h
index 28084687..47113b06 100644
--- a/src/plotting/PlotWriter.h
+++ b/src/plotting/PlotWriter.h
@@ -3,10 +3,14 @@
 #include "util/SPCQueue.h"
 #include "plotting/PlotTypes.h"
 #include "plotting/PlotHeader.h"
+#include "tools/PlotChecker.h"
 #include "io/FileStream.h"
 #include "threading/Thread.h"
 #include "threading/AutoResetSignal.h"
 #include "threading/Fence.h"
+#include <functional>
+#include <queue>
+#include <mutex>
 
 /**
  * Handles writing the final plot data to disk asynchronously.
@@ -83,6 +87,7 @@ class PlotWriter
     PlotWriter( DiskBufferQueue& ownerQueue );
     virtual ~PlotWriter();
 
+    void EnablePlotChecking( PlotChecker& checker );
 
     // Begins writing a new plot. Any previous plot must have finished before calling this
     bool BeginPlot( PlotVersion version,
@@ -118,6 +123,9 @@ class PlotWriter
 
     void SignalFence( Fence& fence );
     void SignalFence( Fence& fence, uint32 sequence );
+
+    // Dispatch a callback from the writer thread
+    void CallBack( std::function<void()> func );
 
     void CompleteTable();
@@ -154,8 +162,11 @@ class PlotWriter
                             const byte* plotMemo, const uint16 plotMemoSize,
                             int32 compressionLevel );
 
+    bool CheckPlot();
+
     Command& GetCommand( CommandType type );
     void SubmitCommands();
+    void SubmitCommand( const Command cmd );
 
     void SeekToLocation( size_t location );
@@ -168,6 +179,7 @@ class PlotWriter
 
     void WriteData( const byte* data, size_t size );
 
+
 private:
     void CmdBeginTable( const Command& cmd );
     void CmdEndTable( const Command& cmd );
@@ -176,6 +188,7 @@ class PlotWriter
     void CmdWriteReservedTable( const Command& cmd );
     void CmdSignalFence( const Command& cmd );
     void CmdEndPlot( const Command& cmd );
+    void CmdCallBack( const Command& cmd );
 
 private:
     enum class CommandType : uint32
@@ -188,7 +201,8 @@ class PlotWriter
         ReserveTable,
         WriteReservedTable,
         SignalFence,
-        EndPlot
+        EndPlot,
+        CallBack,
     };
 
     struct Command
@@ -237,6 +251,11 @@ class PlotWriter
             Fence* fence;
             bool   rename;
         } endPlot;
+
+        struct
+        {
+            std::function<void()>* func;
+        } callback;
     };
 };
@@ -244,7 +263,7 @@ class PlotWriter
 
 private:
     class DiskBufferQueue* _owner = nullptr;   // This instance might be owned by an IOQueue, which will
                                                // dispatch our commands in its own threads.
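    // Editor's note (illustrative, not part of this patch): CallBack() above wraps the
    // functor in a CommandType::CallBack command, so it runs on the writer thread only
    // after every previously submitted command has been executed, e.g.:
    //
    //     writer.CallBack( []() { Log::Line( "All prior writes have been issued." ); } );
    //
    // The writer thread invokes and then deletes the heap-allocated std::function in
    // CmdCallBack(), so the callable only needs to remain valid until it runs.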
-
+
     FileStream      _stream;
     bool            _directIO;
     bool            _dummyMode = false;    // In this mode we don't actually write anything
@@ -255,6 +274,7 @@ class PlotWriter
     Fence           _completedFence;       // Signal plot completed
     AutoResetSignal _cmdReadySignal;
     AutoResetSignal _cmdConsumedSignal;
+    AutoResetSignal _readyToPlotSignal;    // Set when the writer is ready to start the next plot.
     Span<byte>      _writeBuffer     = {};
     size_t          _bufferBytes     = 0;  // Current number of bytes in the buffer
     size_t          _headerSize      = 0;
@@ -271,6 +291,12 @@ class PlotWriter
     size_t          _tableStart      = 0;  // Current table start location
     uint64          _tablePointers[10] = {};
     uint64          _tableSizes   [10] = {};
-    SPCQueue _queue;
+    // SPCQueue _queue;
+
+    std::queue<Command> _queue;
+    std::mutex          _queueLock;
+    // std::mutex _pushLock;
+
+    PlotChecker* _plotChecker = nullptr;   // User responsible for ownership of checker. Must live until this PlotWriter's lifetime ends.
 };
diff --git a/src/threading/MTJob.h b/src/threading/MTJob.h
index a7e0badd..fe00de25 100644
--- a/src/threading/MTJob.h
+++ b/src/threading/MTJob.h
@@ -315,7 +315,7 @@ inline void MTJobSyncT::WaitForRelease()
     // Trace( "- locked: %d", count );
 
     // Wait for the control thread (id == 0 ) to signal us
-    while( finishedCount.load( std::memory_order_relaxed ) != 0 );
+    while( finishedCount.load( std::memory_order_relaxed ) != 0 ){}
 
     // Ensure all threads have been released (prevent re-locking before another thread has been released)
     // count = releaseLock.load( std::memory_order_acquire );
diff --git a/src/threading/Semaphore.cpp b/src/threading/Semaphore.cpp
index 52ffc7f2..62671710 100644
--- a/src/threading/Semaphore.cpp
+++ b/src/threading/Semaphore.cpp
@@ -133,7 +133,7 @@ int Semaphore::GetCount()
     return value;
 
 #elif PLATFORM_IS_WINDOWS || PLATFORM_IS_APPLE
-    return _count.load( std::memory_order::memory_order_release );
+    return _count.load( std::memory_order_release );
 #else
     #error Unimplemented
 #endif
diff --git a/src/threading/ThreadPool.cpp b/src/threading/ThreadPool.cpp
index fc6dee58..31963e05 100644
--- a/src/threading/ThreadPool.cpp
+++ b/src/threading/ThreadPool.cpp
@@ -157,7 +157,7 @@ void ThreadPool::FixedThreadRunner( void* tParam )
 
     for( ;; )
     {
-        if( exitSignal.load( std::memory_order::memory_order_acquire ) )
+        if( exitSignal.load( std::memory_order_acquire ) )
            break;
 
        // Wait until we are signalled to go
@@ -190,7 +190,7 @@ void ThreadPool::GreedyThreadRunner( void* tParam )
 
     for( ;; )
     {
-        if( pool._exitSignal.load( std::memory_order::memory_order_acquire ) )
+        if( pool._exitSignal.load( std::memory_order_acquire ) )
            return;
 
        // Wait until we are signalled to go
diff --git a/src/tools/PlotChecker.cpp b/src/tools/PlotChecker.cpp
new file mode 100644
index 00000000..6578a90e
--- /dev/null
+++ b/src/tools/PlotChecker.cpp
@@ -0,0 +1,215 @@
+#include "PlotChecker.h"
+#include "tools/PlotReader.h"
+#include "plotting/PlotValidation.h"
+#include "harvesting/GreenReaper.h"
+#include "plotting/f1/F1Gen.h"
+
+class PlotCheckerImpl : public PlotChecker
+{
+    PlotCheckerConfig _cfg;
+    bool              _lastPlotDeleted = false;
+public:
+
+    //-----------------------------------------------------------
+    PlotCheckerImpl( PlotCheckerConfig& cfg )
+        : _cfg( cfg )
+    {}
+
+    //-----------------------------------------------------------
+    ~PlotCheckerImpl() override = default;
+
+    //-----------------------------------------------------------
+    void CheckPlot( const char* plotPath, PlotCheckResult* outResult ) override
+    {
+        _lastPlotDeleted = false;
+
+        PlotCheckResult result{};
+        PerformPlotCheck( plotPath, result );
+
+        if(
!result.error.empty() ) + { + if( !_cfg.silent ) + { + Log::Line( "An error occured checking the plot: %s.", result.error.c_str() ); + + if( _cfg.deletePlots ) + Log::Line( "Any actions against plot '%s' will be ignored.", plotPath ); + } + } + + // Check threshold for plot deletion + const double passRate = result.proofCount / (double)result.checkCount; + + // Print stats + if( !_cfg.silent ) + { + std::string seedHex = BytesToHexStdString( result.seedUsed, sizeof( result.seedUsed ) ); + Log::Line( "Seed used: 0x%s", seedHex.c_str() ); + Log::Line( "Proofs requested/fetched: %llu / %llu ( %.3lf%% )", result.proofCount, result.checkCount, passRate * 100.0 ); + + if( result.proofFetchFailCount > 0 ) + Log::Line( "Proof fetches failed : %llu ( %.3lf%% )", result.proofFetchFailCount, result.proofFetchFailCount / (double)result.checkCount * 100.0 ); + if( result.proofValidationFailCount > 0 ) + Log::Line( "Proof validation failed : %llu ( %.3lf%% )", result.proofValidationFailCount, result.proofValidationFailCount / (double)result.checkCount * 100.0 ); + Log::NewLine(); + } + + // Delete the plot if it's below the set threshold + if( _cfg.deletePlots ) + { + if( result.proofFetchFailCount > 0 || passRate < _cfg.deleteThreshold ) + { + if( !_cfg.silent ) + { + if( result.proofFetchFailCount > 0 ) + { + Log::Line( "WARNING: Deleting plot '%s' as it failed to fetch some proofs. This might indicate corrupt plot file.", plotPath ); + } + else + { + Log::Line( "WARNING: Deleting plot '%s' as it is below the proof threshold: %.3lf / %.3lf.", + plotPath, passRate, _cfg.deleteThreshold ); + } + Log::NewLine(); + } + + remove( plotPath ); + result.deleted = true; + _lastPlotDeleted = true; + } + else + { + Log::Line( "Plot is OK. It passed the proof threshold of %.3lf%%", _cfg.deleteThreshold * 100.0 ); + Log::NewLine(); + } + } + + if( outResult ) + *outResult = result; + } + + //----------------------------------------------------------- + void PerformPlotCheck( const char* plotPath, PlotCheckResult& result ) + { + FilePlot plot; + if( !plot.Open( plotPath ) ) + { + std::stringstream err; err << "Failed to open plot file at '" << plotPath << "' with error " << plot.GetError() << "."; + result.error = err.str(); + return; + } + + const uint32 threadCount = _cfg.threadCount == 0 ? SysHost::GetLogicalCPUCount() : + std::min( (uint32)MAX_THREADS, std::min( _cfg.threadCount, SysHost::GetLogicalCPUCount() ) ); + + const bool useGpu = plot.CompressionLevel() > 0 && !_cfg.noGpu; + + PlotReader reader( plot ); + + if( _cfg.grContext ) + reader.AssignDecompressionContext( _cfg.grContext ); + else + reader.ConfigDecompressor( threadCount, _cfg.disableCpuAffinity, 0, useGpu, (int)_cfg.gpuIndex ); + + const uint32 k = plot.K(); + + byte AlignAs(8) seed[BB_PLOT_ID_LEN] = {}; + + if( !_cfg.hasSeed ) + SysHost::Random( seed, sizeof( seed ) ); + else + memcpy( seed, _cfg.seed, sizeof( _cfg.seed ) ); + + { + std::string seedHex = BytesToHexStdString( seed, sizeof( seed ) ); + if( !_cfg.silent ) + Log::Line( "Checking %llu random proofs with seed 0x%s...", (llu)_cfg.proofCount, seedHex.c_str() ); + } + + if( !_cfg.silent ) + Log::Line( "Plot compression level: %u", plot.CompressionLevel() ); + + if( !_cfg.grContext && plot.CompressionLevel() > 0 && useGpu ) + { + const bool hasGPU = grHasGpuDecompressor( reader.GetDecompressorContext() ); + if( hasGPU && !_cfg.silent ) + Log::Line( "Using GPU for decompression." ); + else if( !_cfg.silent ) + Log::Line( "No GPU was selected for decompression." 
); + } + + const uint64 f7Mask = (1ull << k) - 1; + + uint64 prevF7 = 0; + uint64 proofCount = 0; + + uint64 proofXs[BB_PLOT_PROOF_X_COUNT]; + + uint64 nextPercentage = 10; + + for( uint64 i = 0; i < _cfg.proofCount; i++ ) + { + const uint64 f7 = F1GenSingleForK( k, seed, prevF7 ) & f7Mask; + prevF7 = f7; + + uint64 startP7Idx = 0; + const uint64 nF7Proofs = reader.GetP7IndicesForF7( f7, startP7Idx ); + + for( uint64 j = 0; j < nF7Proofs; j++ ) + { + uint64 p7Entry; + if( !reader.ReadP7Entry( startP7Idx + j, p7Entry ) ) + { + result.proofFetchFailCount ++; + continue; + } + + const auto r = reader.FetchProof( p7Entry, proofXs ); + if( r == ProofFetchResult::OK ) + { + // Convert to + uint64 outF7 = 0; + if( PlotValidation::ValidateFullProof( k, plot.PlotId(), proofXs, outF7 ) && outF7 == f7 ) + { + proofCount++; + } + else + { + result.proofValidationFailCount++; + } + } + else + { + if( r != ProofFetchResult::NoProof ) + result.proofFetchFailCount ++; + } + } + + const double percent = i / (double)_cfg.proofCount * 100.0; + if( (uint64)percent == nextPercentage ) + { + if( !_cfg.silent ) + Log::Line( " %llu%%...", (llu)nextPercentage ); + nextPercentage += 10; + } + } + + result.checkCount = _cfg.proofCount; + result.proofCount = proofCount; + result.error.clear(); + static_assert( sizeof(PlotCheckResult::seedUsed) == sizeof(seed) ); + memcpy( result.seedUsed, seed, sizeof( result.seedUsed ) ); + } + + //----------------------------------------------------------- + bool LastPlotDeleted() override + { + return _lastPlotDeleted; + } +}; + +//----------------------------------------------------------- +PlotChecker* PlotChecker::Create( PlotCheckerConfig& cfg ) +{ + return new PlotCheckerImpl( cfg ); +} diff --git a/src/tools/PlotChecker.h b/src/tools/PlotChecker.h new file mode 100644 index 00000000..e63adc23 --- /dev/null +++ b/src/tools/PlotChecker.h @@ -0,0 +1,50 @@ +#pragma once +#include "ChiaConsts.h" +#include + +struct PlotCheckerConfig +{ + uint64 proofCount = 100; + bool noGpu = false; + int32 gpuIndex = -1; + uint32 threadCount = 0; + bool disableCpuAffinity = false; + bool silent = false; + bool hasSeed = false; + byte seed[BB_PLOT_ID_LEN]{}; + + bool deletePlots = false; // If true, plots that fail to fetch proofs, or are below a threshold, will be deleted + double deleteThreshold = 0.0; // If proofs received to proof request ratio is below this, the plot will be deleted + + struct GreenReaperContext* grContext = nullptr; +}; + +struct PlotCheckResult +{ + uint64 checkCount; + uint64 proofCount; + uint64 proofFetchFailCount; + uint64 proofValidationFailCount; + byte seedUsed[BB_PLOT_ID_LEN]; + std::string error; + bool deleted; +}; + +class PlotChecker +{ +public: + +protected: + PlotChecker() = default; + +public: + static PlotChecker* Create( PlotCheckerConfig& cfg ); + virtual ~PlotChecker() = default; + + // Add a plot ot the queue to be checked + /// Returns true if the plot passed the threshold check + virtual void CheckPlot( const char* plotPath, PlotCheckResult* outResult ) = 0; + + // Returns true if the last plot checked was deleted + virtual bool LastPlotDeleted() = 0; +}; diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp index 9562e2a0..f275d980 100644 --- a/src/tools/PlotComparer.cpp +++ b/src/tools/PlotComparer.cpp @@ -1,5 +1,6 @@ #include "io/FileStream.h" #include "ChiaConsts.h" +#include "tools/PlotReader.h" #include "util/Util.h" #include "util/Log.h" #include "util/CliParser.h" @@ -11,279 +12,17 @@ #include #include -class PlotInfo; -void 
TestTable( TableId table, PlotInfo& ref, PlotInfo& tgt ); -void TestC3Table( PlotInfo& ref, PlotInfo& tgt ); -void TestTable( PlotInfo& ref, PlotInfo& tgt, TableId table ); +void DumpPlotHeader( FilePlot& plot ); +void TestTable( TableId table, FilePlot& ref, FilePlot& tgt ); +void TestC3Table( FilePlot& ref, FilePlot& tgt ); +void TestTable( FilePlot& ref, FilePlot& tgt, TableId table ); -void UnpackPark7( const byte* srcBits, uint64* dstEntries ); - -void DumpP7( PlotInfo& plot, const char* path ); - -Span ReadC1Table( PlotInfo& plot ); - -class PlotInfo -{ -public: - PlotInfo() {} - - ~PlotInfo() - { - - } - - void Open( const char* path ) - { - _path = path; - FatalIf( IsOpen(), "Plot is already open." ); - - // FileFlags::NoBuffering | FileFlags::NoBuffering ), // #TODO: allow unbuffered reading with our own buffer... For now just use like this - FatalIf( !_plotFile.Open( path, FileMode::Open, FileAccess::Read, FileFlags::None ), - "Failed to open plot '%s' with error %d.", path, _plotFile.GetError() ); - - const size_t blockSize = _plotFile.BlockSize(); - _blockBuffer = bbvirtalloc( blockSize ); - - /// - /// Read header - /// - - // Magic - { - char magic[sizeof( kPOSMagic )-1] = { 0 }; - Read( sizeof( magic ), magic ); - FatalIf( !MemCmp( magic, kPOSMagic, sizeof( magic ) ), "Invalid plot magic." ); - } - - // Plot Id - { - Read( sizeof( _id ), _id ); - - char str[65] = { 0 }; - size_t numEncoded = 0; - BytesToHexStr( _id, sizeof( _id ), str, sizeof( str ), numEncoded ); - ASSERT( numEncoded == sizeof( _id ) ); - _idString = str; - } - - // K - { - byte k = 0; - Read( 1, &k ); - _k = k; - } - - // Format Descritption - { - const uint formatDescSize = ReadUInt16(); - FatalIf( formatDescSize != sizeof( kFormatDescription ) - 1, "Invalid format description size." ); - - char desc[sizeof( kFormatDescription )-1] = { 0 }; - Read( sizeof( desc ), desc ); - FatalIf( !MemCmp( desc, kFormatDescription, sizeof( desc ) ), "Invalid format description." ); - } - - // Memo - { - uint memoSize = ReadUInt16(); - FatalIf( memoSize > sizeof( _memo ), "Invalid memo." 
); - _memoLength = memoSize; - - Read( memoSize, _memo ); - - char str[BB_PLOT_MEMO_MAX_SIZE*2+1] = { 0 }; - size_t numEncoded = 0; - BytesToHexStr( _memo, memoSize, str, sizeof( str ), numEncoded ); - - _memoString = str; - } - - // Table pointers - Read( sizeof( _tablePtrs ), _tablePtrs ); - for( int i = 0; i < 10; i++ ) - _tablePtrs[i] = Swap64( _tablePtrs[i] ); - - // What follows is table data - } - -public: - const bool IsOpen() const { return _plotFile.IsOpen(); } - - const byte* PlotId() const { return _id; } - - const std::string& PlotIdStr() const { return _idString; } - - uint PlotMemoSize() const { return _memoLength; } - - const byte* PlotMemo() const { return _memo; } - - const std::string& PlotMemoStr() const { return _memoString; } - - uint K() const { return _k; } - - FileStream& PlotFile() { return _plotFile; } - - uint64 TableAddress( TableId table ) const - { - ASSERT( table >= TableId::Table1 && table <= TableId::Table7 ); - return _tablePtrs[(int)table]; - } - - uint64 CTableAddress( int c ) - { - ASSERT( c >= 1 && c <= 3 ); - - return _tablePtrs[c+6]; - } - - size_t TableSize( int tableIndex ) - { - ASSERT( tableIndex >= 0 && tableIndex < 10 ); - - const uint64 address = _tablePtrs[tableIndex]; - uint64 endAddress = _plotFile.Size(); - - // Check all table entris where we find and address that is - // greater than ours and less than the current end address - for( int i = 0; i < 10; i++ ) - { - const uint64 a = _tablePtrs[i]; - if( a > address && a < endAddress ) - endAddress = a; - } - - return (size_t)( endAddress - address ); - } - - ssize_t Read( size_t size, void* buffer ) - { - ASSERT( buffer ); - if( size == 0 ) - return 0; - - const size_t blockSize = _plotFile.BlockSize(); - - // Read-in any data already left-over in the block buffer - // if( _blockRemainder ) - // { - // const size_t copySize = std::min( _blockRemainder, size ); - // memcpy( buffer, _blockBuffer + _blockOffset, copySize ); - - // _blockOffset += copySize; - // _blockRemainder -= copySize; - - // buffer = (void*)((byte*)buffer + copySize); - // size -= copySize; - - // if( size == 0 ) - // return copySize; - // } - - const size_t blockCount = size / blockSize; - - size_t blockSizeToRead = blockCount * blockSize; - const size_t remainder = size - blockSizeToRead; - - byte* reader = (byte*)buffer; - ssize_t sizeRead = 0; - - while( blockSizeToRead ) - { - ssize_t read = _plotFile.Read( reader, blockSizeToRead ); - FatalIf( read < 0 , "Plot %s failed to read with error: %d.", _path.c_str(), _plotFile.GetError() ); - - reader += read; - sizeRead += read; - blockSizeToRead -= (size_t)read; - } - - if( remainder ) - { - ssize_t read = _plotFile.Read( reader, remainder ); - - // ssize_t read = _plotFile.Read( _blockBuffer, blockSize ); - ASSERT( read == (ssize_t)remainder || read == (ssize_t)blockSize ); - - // FatalIf( read < (ssize_t)remainder, "Failed to read a full block on plot %s.", _path.c_str() ); - - // memcpy( reader, _blockBuffer, remainder ); - sizeRead += read; - - // // Save any left over data in the block buffer - // _blockOffset = remainder; - // _blockRemainder = blockSize - remainder; - } - - return sizeRead; - } - - uint16 ReadUInt16() - { - uint16 value = 0; - Read( sizeof( value ), &value ); - return Swap16( value ); - } - - void ReadTable( int tableIndex, void* buffer ) - { - const size_t size = TableSize( tableIndex ); - - _blockRemainder = 0; - FatalIf( !_plotFile.Seek( (int64)_tablePtrs[tableIndex], SeekOrigin::Begin ), - "Failed to seek to table %u.", tableIndex+1 ); - - 
Read( size, buffer ); - } - - void DumpHeader() - { - Log::Line( "Plot %s", _path.c_str() ); - Log::Line( "-----------------------------------------" ); - Log::Line( "Id : %s", _idString.c_str() ); - Log::Line( "Memo : %s", _memoString.c_str() ); - Log::Line( "K : %u", _k ); - - for( int i = 0; i <= (int)TableId::Table7; i++ ) - { - const size_t size = TableSize( i ); - - Log::Line( "Table %u : %16lu ( 0x%016lx ) : %8llu MiB ( %.2lf GiB )", - i+1, _tablePtrs[i], _tablePtrs[i], - size BtoMB, (double)size BtoGB ); - - } - - for( int i = (int)TableId::Table7+1; i < 10; i++ ) - { - const size_t size = TableSize( i ); - - Log::Line( "C%u : %16lu ( 0x%016lx ) : %8llu MiB ( %.2lf GiB )", - i-6, _tablePtrs[i], _tablePtrs[i], - size BtoMB, (double)size BtoGB ); - } - } - -private: - FileStream _plotFile; - byte _id[BB_PLOT_ID_LEN] = { 0 }; - byte _memo[BB_PLOT_MEMO_MAX_SIZE] = { 0 }; - uint _memoLength = 0; - std::string _idString = ""; - std::string _memoString = ""; - uint _k = 0; - std::string _path = ""; - uint64 _tablePtrs[10] = { 0 }; - byte* _blockBuffer = nullptr; - size_t _blockRemainder = 0; - size_t _blockOffset = 0; - - // size_t _readBufferSize = 32 MB; - // byte* _readBuffer = nullptr; - -}; +void UnpackPark7( uint32 k, const byte* srcBits, uint64* dstEntries ); +void DumpP7( FilePlot& plot, const char* path ); +Span ReadC1Table( FilePlot& plot ); //----------------------------------------------------------- const char USAGE[] = R"(plotcmp @@ -330,8 +69,8 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli ) opts.plotAPath = cli.ArgConsume(); opts.plotBPath = cli.ArgConsume(); - PlotInfo refPlot; // Reference - PlotInfo tgtPlot; // Target + FilePlot refPlot; // Reference + FilePlot tgtPlot; // Target { const char* refPath = opts.plotAPath; @@ -340,9 +79,12 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli ) refPlot.Open( refPath ); tgtPlot.Open( tgtPath ); - refPlot.DumpHeader(); - Log::Line( "" ); - tgtPlot.DumpHeader(); + Log::Line( "[Reference Plot]" ); + DumpPlotHeader( refPlot ); + Log::NewLine(); + Log::Line( "[Target Plot]" ); + DumpPlotHeader( tgtPlot ); + Log::NewLine(); } FatalIf( refPlot.K() != 32, "Plot A is k%u. Only k32 plots are currently supported.", refPlot.K() ); @@ -352,10 +94,14 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli ) // FatalIf( !MemCmp( refPlot.PlotMemo(), tgtPlot.PlotMemo(), std::min( refPlot.PlotMemoSize(), tgtPlot.PlotMemoSize() ) ), "Plot memo mismatch." ); FatalIf( refPlot.K() != tgtPlot.K(), "K value mismatch." ); + FatalIf( refPlot.CompressionLevel() != tgtPlot.CompressionLevel(), + "Compression mismatch. 
%u != %u.", refPlot.CompressionLevel(), tgtPlot.CompressionLevel() ); + // Test P7, dump it // DumpP7( refPlot, "/mnt/p5510a/reference/p7.tmp" ); - // TestC3Table( refPlot, tgtPlot ); + // TestC3Table( refPlot, tgtPlot ); Exit( 0 ); + // TestTable( refPlot, tgtPlot, TableId::Table7 ); // TestTable( refPlot, tgtPlot, TableId::Table3 ); @@ -368,13 +114,36 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli ) } //----------------------------------------------------------- -Span ReadC1Table( PlotInfo& plot ) +void DumpPlotHeader( FilePlot& p ) { - const size_t tableSize = plot.TableSize( 7 ); + // Log::Line( "Id: %") + Log::Line( "K: %u", p.K() ); + Log::Line( "Compression Level: %u", p.CompressionLevel() ); + + Log::Line( "Table Addresses:" ); + for( uint32 i = 0; i < 10; i++ ) + Log::Line( " [%2u] : 0x%016llx", i+1, (llu)p.TableAddress( (PlotTable)i ) ); + + if( p.Version() >= PlotVersion::v2_0 ) + { + const auto sizes = p.TableSizes(); + + Log::Line( "Table Sizes:" ); + for( uint32 i = 0; i < 10; i++ ) + Log::Line( " [%2u] : %-12llu B | %llu MiB", i+1, (llu)sizes[i], (llu)(sizes[i] BtoMB) ); + } +} + +//----------------------------------------------------------- +Span ReadC1Table( FilePlot& plot ) +{ + const size_t tableSize = plot.TableSize( PlotTable::C1 ); const uint32 entryCount = (uint)( tableSize / sizeof( uint32 ) ); - + uint32* c1 = bbvirtalloc( tableSize ); - plot.ReadTable( 7, c1 ); + + FatalIf( !plot.SeekToTable( PlotTable::C1 ), "Failed to seek to table C1." ); + plot.Read( tableSize, c1 ); for( uint i = 0; i < entryCount; i++ ) c1[i] = Swap32( c1[i] ); @@ -383,13 +152,15 @@ Span ReadC1Table( PlotInfo& plot ) } //----------------------------------------------------------- -Span ReadC2Table( PlotInfo& plot ) +Span ReadC2Table( FilePlot& plot ) { - const size_t tableSize = plot.TableSize( 8 ); + const size_t tableSize = plot.TableSize( PlotTable::C2 ); const uint32 entryCount = (uint)( tableSize / sizeof( uint32 ) ); uint32* c2 = bbvirtalloc( tableSize ); - plot.ReadTable( 8, c2 ); + + FatalIf( !plot.SeekToTable( PlotTable::C2 ), "Failed to seek to table C1." ); + plot.Read( tableSize, c2 ); for( uint i = 0; i < entryCount; i++ ) c2[i] = Swap32( c2[i] ); @@ -398,13 +169,9 @@ Span ReadC2Table( PlotInfo& plot ) } //----------------------------------------------------------- -void TestC3Table( PlotInfo& ref, PlotInfo& tgt ) +void TestC3Table( FilePlot& ref, FilePlot& tgt ) { Log::Line( "Reading C tables..." ); - // const size_t refSize = ref.TableSize( 9 ); - // const size_t tgtSize = tgt.TableSize( 9 ); - - // const size_t c3Size = std::min( refSize, tgtSize ); // Read C1 so that we know how many parks we got Span refC1 = ReadC1Table( ref ); @@ -463,14 +230,17 @@ void TestC3Table( PlotInfo& ref, PlotInfo& tgt ) Log::Line( "Validating C3 table..." ); { - const size_t refC3Size = ref.TableSize( 9 ); - const size_t tgtC3Size = tgt.TableSize( 9 ); + const size_t refC3Size = ref.TableSize( PlotTable::C3 ); + const size_t tgtC3Size = tgt.TableSize( PlotTable::C3 ); byte* refC3 = bbvirtalloc( refC3Size ); byte* tgtC3 = bbvirtalloc( tgtC3Size ); - ref.ReadTable( 9, refC3 ); - tgt.ReadTable( 9, tgtC3 ); + FatalIf( !ref.SeekToTable( PlotTable::C3 ), "Failed to seek ref plot to C3 table." ); + FatalIf( !tgt.SeekToTable( PlotTable::C3 ), "Failed to seek tgt plot to C3 table." ); + + FatalIf( (ssize_t)refC3Size != ref.Read( refC3Size, refC3 ), "Failed to read ref C3 table." ); + FatalIf( (ssize_t)tgtC3Size != tgt.Read( tgtC3Size, tgtC3 ), "Failed to read tgt C3 table." 
); // const size_t c3Size = std::min( refC3Size, tgtC3Size ); @@ -515,7 +285,7 @@ void TestC3Table( PlotInfo& ref, PlotInfo& tgt ) } //----------------------------------------------------------- -uint64 CompareP7( PlotInfo& ref, PlotInfo& tgt, const byte* p7RefBytes, const byte* p7TgtBytes, const int64 parkCount ) +uint64 CompareP7( FilePlot& ref, FilePlot& tgt, const byte* p7RefBytes, const byte* p7TgtBytes, const int64 parkCount ) { // Double-buffer parks at a time so that we can compare entries across parks uint64 refParks[2][kEntriesPerPark]; @@ -529,8 +299,8 @@ uint64 CompareP7( PlotInfo& ref, PlotInfo& tgt, const byte* p7RefBytes, const by const size_t parkSize = CalculatePark7Size( ref.K() ); - UnpackPark7( p7RefBytes, refParks[0] ); - UnpackPark7( p7TgtBytes, tgtParks[0] ); + UnpackPark7( ref.K(), p7RefBytes, refParks[0] ); + UnpackPark7( tgt.K(), p7TgtBytes, tgtParks[0] ); p7RefBytes += parkSize; p7TgtBytes += parkSize; @@ -543,8 +313,8 @@ uint64 CompareP7( PlotInfo& ref, PlotInfo& tgt, const byte* p7RefBytes, const by // Load the next park, if we can if( !isLastPark ) { - UnpackPark7( p7RefBytes, refParks[1] ); - UnpackPark7( p7TgtBytes, tgtParks[1] ); + UnpackPark7( ref.K(), p7RefBytes, refParks[1] ); + UnpackPark7( tgt.K(), p7TgtBytes, tgtParks[1] ); p7RefBytes += parkSize; p7TgtBytes += parkSize; } @@ -601,32 +371,61 @@ uint64 CompareP7( PlotInfo& ref, PlotInfo& tgt, const byte* p7RefBytes, const by } //----------------------------------------------------------- -void TestTable( PlotInfo& ref, PlotInfo& tgt, TableId table ) +void TestTable( FilePlot& ref, FilePlot& tgt, TableId table ) { + if( table == TableId::Table1 && tgt.CompressionLevel() > 0 ) + return; + + if( table == TableId::Table2 && tgt.CompressionLevel() >= 9 ) + return; + + // if( table == TableId::Table7 ) return; + Log::Line( "Reading Table %u...", table+1 ); - const size_t parkSize = table < TableId::Table7 ? CalculateParkSize( table ) : CalculatePark7Size( ref.K() ); + const uint32 numTablesDropped = tgt.CompressionLevel() >= 9 ? 2 : + tgt.CompressionLevel() >= 1 ? 1 : 0; - const size_t sizeRef = ref.TableSize( (int)table ); - const size_t sizeTgt = tgt.TableSize( (int)table ); + const size_t parkSize = table < TableId::Table7 ? + (uint)table == numTablesDropped ? 
+ GetCompressionInfoForLevel( tgt.CompressionLevel() ).tableParkSize : CalculateParkSize( table ) : + CalculatePark7Size( ref.K() ); + + const size_t sizeRef = ref.TableSize( (PlotTable)table ); + const size_t sizeTgt = tgt.TableSize( (PlotTable)table ); byte* tableParksRef = bbvirtalloc( sizeRef ); byte* tableParksTgt = bbvirtalloc( sizeTgt ); - ref.ReadTable( (int)table, tableParksRef ); - tgt.ReadTable( (int)table, tableParksTgt ); - const size_t tableSize = std::min( sizeRef, sizeTgt ); const int64 parkCount = (int64)( tableSize / parkSize ); + FatalIf( !ref.SeekToTable( (PlotTable)table ), "Failed to seek to table %u on reference plot.", (uint32)table+1 ); + FatalIf( !tgt.SeekToTable( (PlotTable)table ), "Failed to seek to table %u on target plot.", (uint32)table+1 ); + + { + const ssize_t refRead = ref.Read( tableSize, tableParksRef ); + FatalIf( (ssize_t)tableSize != refRead, "Failed to read reference table %u.", (uint32)table+1 ); + + const ssize_t tgtRead = tgt.Read( tableSize, tableParksTgt ); + FatalIf( (ssize_t)tableSize != tgtRead, "Failed to read target table %u.", (uint32)table+1 ); + + } + const byte* parkRef = tableParksRef; const byte* parkTgt = tableParksTgt; - Log::Line( "Validating Table %u...", table+1 ); - const uint64 stubBitSize = (_K - kStubMinusBits); + uint64 stubBitSize = (ref.K() - kStubMinusBits); + if( ref.CompressionLevel() > 0 ) + { + auto cInfo = GetCompressionInfoForLevel( ref.CompressionLevel() ); + stubBitSize = cInfo.stubSizeBits; + } + const size_t stubSectionBytes = CDiv( (kEntriesPerPark - 1) * stubBitSize, 8 ); + uint64 failureCount = 0; if( table == TableId::Table7 ) { @@ -643,7 +442,6 @@ void TestTable( PlotInfo& ref, PlotInfo& tgt, TableId table ) for( int64 i = 0; i < parkCount; i++ ) { // Ignore buffer zone - const uint16 pRefCSize = *(uint16*)(parkRef + stubSectionBytes + sizeof( uint64 ) ); const uint16 pTgtCSize = *(uint16*)(parkTgt + stubSectionBytes + sizeof( uint64 ) ); @@ -652,7 +450,7 @@ void TestTable( PlotInfo& ref, PlotInfo& tgt, TableId table ) if( !failed ) { const size_t realParkSize = sizeof( uint64 ) + stubSectionBytes + pRefCSize; - failed =!MemCmp( parkRef, parkTgt, realParkSize ); + failed = !MemCmp( parkRef, parkTgt, realParkSize ); } // if( pRefCSize != pTgtCSize || !MemCmp( parkRef, parkTgt, parkSize ) ) @@ -661,7 +459,7 @@ void TestTable( PlotInfo& ref, PlotInfo& tgt, TableId table ) if( failed ) { - bool stubsEqual = MemCmp( parkRef, parkTgt, stubSectionBytes + sizeof( uint64 ) ); + // bool stubsEqual = MemCmp( parkRef, parkTgt, stubSectionBytes + sizeof( uint64 ) ); Log::Line( " T%u park %lld failed.", table+1, i ); failureCount++; } @@ -705,20 +503,19 @@ void TestTable( PlotInfo& ref, PlotInfo& tgt, TableId table ) // Unpack a single park 7, // ensure srcBits is algined to uint64 //----------------------------------------------------------- -void UnpackPark7( const byte* srcBits, uint64* dstEntries ) +void UnpackPark7( const uint32 k, const byte* srcBits, uint64* dstEntries ) { ASSERT( ((uintptr_t)srcBits & 7 ) == 0 ); - const uint32 _k = _K; - const uint32 bitsPerEntry = _k + 1; - CPBitReader reader( srcBits, CalculatePark7Size( _k ) * 8, 0 ); + const uint32 bitsPerEntry = k + 1; + CPBitReader reader( srcBits, CalculatePark7Size( k ) * 8, 0 ); for( int32 i = 0; i < kEntriesPerPark; i++ ) dstEntries[i] = reader.Read64Aligned( bitsPerEntry ); } //----------------------------------------------------------- -void DumpP7( PlotInfo& plot, const char* path ) +void DumpP7( FilePlot& plot, const char* path ) { FileStream 
file; FatalIf( !file.Open( path, FileMode::Create, FileAccess::Write, FileFlags::LargeFile | FileFlags::NoBuffering ), @@ -726,14 +523,14 @@ void DumpP7( PlotInfo& plot, const char* path ) const size_t parkSize = CalculatePark7Size( plot.K() ); - const size_t tableSize = plot.TableSize( (int)TableId::Table7 ); + const size_t tableSize = plot.TableSize( PlotTable::Table7 ); const int64 parkCount = (int64)( tableSize / parkSize ); const uint64 numEntries = (uint64)parkCount * kEntriesPerPark; byte* p7Bytes = bbvirtalloc( tableSize ); Log::Line( "Reading Table7..." ); - plot.ReadTable( (int)TableId::Table7, p7Bytes ); + // plot.ReadTable( (int)TableId::Table7, p7Bytes ); Log::Line( "Unpacking Table 7..." ); uint64* entries = bbvirtalloc( RoundUpToNextBoundaryT( (size_t)numEntries* sizeof( uint64 ), file.BlockSize() ) ); @@ -742,7 +539,7 @@ void DumpP7( PlotInfo& plot, const char* path ) uint64* entryWriter = entries; for( int64 i = 0; i < parkCount; i++ ) { - UnpackPark7( p7Bytes, entryWriter ); + UnpackPark7( plot.K(), p7Bytes, entryWriter ); parkReader += parkSize; entryWriter += kEntriesPerPark; diff --git a/src/tools/PlotReader.cpp b/src/tools/PlotReader.cpp index 6e925947..0059f186 100644 --- a/src/tools/PlotReader.cpp +++ b/src/tools/PlotReader.cpp @@ -8,6 +8,7 @@ #include "plotting/Compression.h" #include "harvesting/GreenReaper.h" #include "BLS.h" +#include "plotdisk/jobs/IOJob.h" /// /// Plot Reader @@ -38,7 +39,7 @@ PlotReader::~PlotReader() bbvirtfreebounded_span( _c3Buffer ); - if( _grContext ) + if( _grContext && _ownsGrContext ) grDestroyContext( _grContext ); _grContext = nullptr; } @@ -347,22 +348,25 @@ bool PlotReader::ReadLPParkComponents( TableId table, uint64 parkIndex, if( _plot.Read( 2, &compressedDeltasSize ) != 2 ) return false; - if( !( compressedDeltasSize & 0x8000 ) && compressedDeltasSize > maxDeltasSizeBytes ) + // Don't support uncompressed deltas + if( compressedDeltasSize & 0x8000 ) return false; size_t deltaCount = 0; - if( compressedDeltasSize & 0x8000 ) - { - // Uncompressed - compressedDeltasSize &= 0x7fff; - if( _plot.Read( compressedDeltasSize, compressedDeltaBuffer ) != compressedDeltasSize ) - return false; - deltaCount = compressedDeltasSize; - } - else + // #TODO: Investigate this, but we should not support uncompressed deltas + // if( compressedDeltasSize & 0x8000 ) + // { + // // Uncompressed + // compressedDeltasSize &= 0x7fff; + // if( _plot.Read( compressedDeltasSize, compressedDeltaBuffer ) != compressedDeltasSize ) + // return false; + + // deltaCount = compressedDeltasSize; + // } + // else { - // Compressed + // Compressed deltas if( _plot.Read( compressedDeltasSize, compressedDeltaBuffer ) != compressedDeltasSize ) return false; @@ -470,7 +474,7 @@ uint32 PlotReader::GetLPStubBitSize( TableId table ) const return _plot.K() - kStubMinusBits; auto info = GetCompressionInfoForLevel( _plot.CompressionLevel() ); - return info.subtSizeBits; + return info.stubSizeBits; } //----------------------------------------------------------- @@ -932,24 +936,30 @@ void PlotReader::AssignDecompressionContext( struct GreenReaperContext* context if( !context) return; - if( _grContext ) + if( _grContext && _ownsGrContext ) grDestroyContext( _grContext ); - _grContext = context; + _grContext = context; + _ownsGrContext = false; } //----------------------------------------------------------- -void PlotReader::ConfigDecompressor( const uint32 threadCount, const bool disableCPUAffinity, const uint32 cpuOffset ) +void PlotReader::ConfigDecompressor( const uint32 
threadCount, const bool disableCPUAffinity, const uint32 cpuOffset, bool useGpu, int gpuIndex ) { - if( _grContext ) + if( _grContext && _ownsGrContext ) grDestroyContext( _grContext ); - _grContext = nullptr; + + _grContext = nullptr; + _ownsGrContext = true; GreenReaperConfig cfg = {}; cfg.apiVersion = GR_API_VERSION; cfg.threadCount = bbclamp( threadCount, 1u, SysHost::GetLogicalCPUCount() ); cfg.cpuOffset = cpuOffset; cfg.disableCpuAffinity = disableCPUAffinity ? GR_TRUE : GR_FALSE; + cfg.gpuRequest = !useGpu ? GRGpuRequestKind_None : + gpuIndex >= 0 ? GRGpuRequestKind_ExactDevice : GRGpuRequestKind_FirstAvailable; + cfg.gpuDeviceIndex = gpuIndex < 0 ? 0 : gpuIndex; auto result = grCreateContext( &_grContext, &cfg, sizeof( GreenReaperConfig ) ); ASSERT( result == GRResult_OK ); @@ -966,6 +976,8 @@ GreenReaperContext* PlotReader::GetGRContext() auto result = grCreateContext( &_grContext, &cfg, sizeof( GreenReaperConfig ) ); ASSERT( result == GRResult_OK ); + + _ownsGrContext = true; } return _grContext; @@ -1262,7 +1274,18 @@ size_t FilePlot::PlotSize() const //----------------------------------------------------------- ssize_t FilePlot::Read( size_t size, void* buffer ) { - return _file.Read( buffer, size ); + if( size > (size_t)std::numeric_limits::max() ) + size = (size_t)std::numeric_limits::max(); + + int error = 0; + if( !IOJob::ReadFromFileUnaligned( _file, buffer, size, error ) ) + { + size = 0; + (void)error; + Log::Error( "Failed to read from plot with error %d", error ); + } + + return (ssize_t)size; } //----------------------------------------------------------- diff --git a/src/tools/PlotReader.h b/src/tools/PlotReader.h index 9517c78a..a8d8bdbb 100644 --- a/src/tools/PlotReader.h +++ b/src/tools/PlotReader.h @@ -88,6 +88,15 @@ class IPlotFile return Swap16( value ); } + inline bool SeekToTable( PlotTable table ) + { + return Seek( SeekOrigin::Begin, (int64)TableAddress( table ) ); + } + + inline PlotVersion Version() { return _version; } + + inline Span TableSizes() { return Span( _header.tableSizes, 10 ); } + // Abstract Interface public: virtual bool Open( const char* path ) = 0; @@ -233,7 +242,13 @@ class PlotReader // Takes ownership of a decompression context void AssignDecompressionContext( struct GreenReaperContext* context ); - void ConfigDecompressor( uint32 threadCount, bool disableCPUAffinity, uint32 cpuOffset = 0 ); + void ConfigDecompressor( uint32 threadCount, bool disableCPUAffinity, uint32 cpuOffset = 0, bool useGpu = false, int gpuIndex = -1 ); + + inline void ConfigGpuDecompressor( uint32 threadCount, bool disableCPUAffinity, uint32 cpuOffset = 0 ) + { + ConfigDecompressor( threadCount, disableCPUAffinity, cpuOffset, true ); + } + inline struct GreenReaperContext* GetDecompressorContext() const { return _grContext; } private: @@ -261,8 +276,9 @@ class PlotReader Span _c2Entries; Span _c3Buffer; - struct GreenReaperContext* _grContext = nullptr; // Used for decompressing - + struct GreenReaperContext* _grContext = nullptr; // Used for decompressing + bool _ownsGrContext = true; + int64 _park7Index = -1; uint64 _park7Entries[kEntriesPerPark]; }; diff --git a/src/tools/PlotValidator.cpp b/src/tools/PlotValidator.cpp index d0ec5e8d..16a7f14a 100644 --- a/src/tools/PlotValidator.cpp +++ b/src/tools/PlotValidator.cpp @@ -271,7 +271,7 @@ bool ValidatePlot( const ValidatePlotOptions& options ) { auto* memPlot = new MemoryPlot(); plotFile = memPlot; - + Log::Line( "Reading plot file into memory..." 
); if( memPlot->Open( options.plotPath.c_str() ) ) { @@ -305,7 +305,7 @@ bool ValidatePlot( const ValidatePlotOptions& options ) // Duplicate the plot file, ThreadPool pool( threadCount ); - + UnpackedK32Plot unpackedPlot; if( options.unpacked ) { @@ -332,7 +332,7 @@ bool ValidatePlot( const ValidatePlotOptions& options ) } MTJobRunner jobs( pool ); - + for( uint32 i = 0; i < threadCount; i++ ) { auto& job = jobs[i]; diff --git a/src/util/BitView.h b/src/util/BitView.h index da74ea4b..fd918abe 100644 --- a/src/util/BitView.h +++ b/src/util/BitView.h @@ -153,7 +153,7 @@ class CPBitReader if constexpr ( CheckAlignment ) { isPtrAligned = ((uintptr_t)pField & 7) == 0; // % 8 - isLastField = fieldIndex == ( sizeBits >> 6 ) - 1; + isLastField = fieldIndex == ( sizeBits >> 6 ); if( isPtrAligned && !isLastField ) field = *((uint64*)pField); diff --git a/src/util/CliParser.cpp b/src/util/CliParser.cpp index 77ba7c47..d78f5b57 100644 --- a/src/util/CliParser.cpp +++ b/src/util/CliParser.cpp @@ -44,6 +44,21 @@ bool CliParser::ReadStr( const char*& value, const char* paramA, const char* par return true; } +//----------------------------------------------------------- +bool CliParser::ReadStr( std::string& value, const char* paramA, const char* paramB ) +{ + if( !ArgMatch( paramA, paramB ) ) + return false; + + NextArg(); + FatalIf( !HasArgs(), "Expected a value for argument '%s'.", _argv[_i-1] ); + + value = _argv[_i]; + NextArg(); + + return true; +} + //----------------------------------------------------------- uint64 CliParser::ReadU64() { @@ -317,7 +332,7 @@ bool CliParser::ReadHexStr( const char*& hexStr, const size_t maxStrLength, cons return false; size_t len = strlen( hexStr ); - if( len >= 2 && hexStr[0] == '0' && hexStr[0] == 'x' ) + if( len >= 2 && hexStr[0] == '0' && hexStr[1] == 'x' ) { hexStr += 2; len -= 2; diff --git a/src/util/CliParser.h b/src/util/CliParser.h index dcf5b490..d8c3b0db 100644 --- a/src/util/CliParser.h +++ b/src/util/CliParser.h @@ -97,6 +97,8 @@ class CliParser bool ReadUnswitch( bool& value, const char* paramA, const char* paramB = nullptr ); bool ReadStr( const char*& value, const char* paramA, const char* paramB = nullptr ); + + bool ReadStr( std::string& value, const char* paramA, const char* paramB = nullptr ); uint64 ReadU64(); diff --git a/src/util/CommandQueue.h b/src/util/CommandQueue.h new file mode 100644 index 00000000..4da58a68 --- /dev/null +++ b/src/util/CommandQueue.h @@ -0,0 +1,95 @@ +#pragma once +#include "MPMCQueue.h" +#include "threading/Thread.h" +#include "threading/AutoResetSignal.h" +#include "util/Span.h" +#include "util/Util.h" + +/// Multi-producer command queue base class +template +class MPCommandQueue +{ + using TSelf = MPCommandQueue; + + enum State : uint32 + { + Default = 0, + Running, + Exiting, + }; + +public: + MPCommandQueue() {} + + virtual inline ~MPCommandQueue() + { + _state.store( Exiting, std::memory_order_release ); + _consumerSignal.Signal(); + _consumerThread.WaitForExit(); + } + + void StartConsumer() + { + PanicIf( _state.load( std::memory_order_relaxed ) != Default, "Unexpected state" ); + + State expectedState = Default; + if( !_state.compare_exchange_weak( expectedState, Running, + std::memory_order_release, + std::memory_order_relaxed ) ) + { + Panic( "Unexpected state %u.", expectedState ); + } + + _consumerThread.Run( ConsumerThreadMain , this ); + } + + /// Thread-safe + void Submit( const TCommand& cmd ) + { + Submit( &cmd, 1 ); + } + + void Submit( const TCommand* commands, const i32 count ) + { + ASSERT( 
commands ); + ASSERT( count > 0 ); + + _queue.Enqueue( commands, (size_t)count ); + _consumerSignal.Signal(); + } + +protected: + /// Implementors must implement this + virtual void ProcessCommands( const Span items ) = 0; + + /// Command thread + static void ConsumerThreadMain( TSelf* self ) + { + self->ConsumerThread(); + } + + void ConsumerThread() + { + TCommand items[_MaxDequeue] = {}; + + for( ;; ) + { + _consumerSignal.Wait(); + + if( _state.load( std::memory_order_relaxed ) == Exiting ) + break; + + const size_t itemCount = _queue.Dequeue( items, _MaxDequeue ); + + if( itemCount > 0 ) + this->ProcessCommands( Span( items, itemCount ) ); + } + } + +private: + MPMCQueue _queue; + Thread _consumerThread; + AutoResetSignal _consumerSignal; + std::atomic _state = Default; +}; + diff --git a/src/util/IAllocator.h b/src/util/IAllocator.h new file mode 100644 index 00000000..fe9127cc --- /dev/null +++ b/src/util/IAllocator.h @@ -0,0 +1,93 @@ +#pragma once +#include "util/Util.h" + +class IAllocator +{ +public: + virtual ~IAllocator() {} + + virtual void* Alloc( const size_t size, const size_t alignment ) = 0; + + inline virtual void Free( void* ptr ) { (void)ptr; } + + //----------------------------------------------------------- + template + inline T* AllocT( const size_t size, size_t alignment = alignof( T ) ) + { + return reinterpret_cast( Alloc( size, alignment ) ); + } + + //----------------------------------------------------------- + template + inline T* CAlloc( const size_t count, size_t alignment = alignof( T ) ) + { + const size_t allocSize = sizeof( T ) * count; + ASSERT( allocSize >= count ); + + return AllocT( allocSize, alignment ); + } + + //----------------------------------------------------------- + template + inline Span CAllocSpan( const size_t count, size_t alignment = alignof( T ) ) + { + return Span( this->CAlloc( count, alignment ), count ); + } + + //----------------------------------------------------------- + inline void* CAlloc( const size_t count, const size_t size, const size_t alignment ) + { + const size_t paddedSize = RoundUpToNextBoundaryT( size, alignment ); + + return Alloc( paddedSize * count, alignment ); + } + + //----------------------------------------------------------- + inline void TryFree( void* ptr ) + { + if( ptr ) + Free( ptr ); + } + + //----------------------------------------------------------- + template + inline void SafeFree( T*& ptr ) + { + if( ptr ) + { + Free( ptr ); + ptr = nullptr; + } + } +}; + +class GlobalAllocator : public IAllocator +{ +public: + inline void* Alloc( const size_t size, const size_t alignment ) override + { + // Ignore alignment + (void)alignment; + return malloc( size ); + } +}; + +// class ProxyAllocator : public IAllocator +// { +// IAllocator& _allocator; + +// public: +// ProxyAllocator() = delete; +// inline ProxyAllocator( IAllocator& allocator ) +// , _allocator( allocator ) +// {} + +// inline ProxyAllocator( const ProxyAllocator& other ) +// : _allocator( other._allocator ) +// {} + +// inline void* Alloc( const size_t size, const size_t alignment ) override +// { +// return _allocator.Alloc( size, alignment ); +// } +// }; diff --git a/src/util/MPMCQueue.h b/src/util/MPMCQueue.h new file mode 100644 index 00000000..ba77719a --- /dev/null +++ b/src/util/MPMCQueue.h @@ -0,0 +1,68 @@ +#pragma once +#include +#include + +/// Lock-based multi-producer, multi-consumer queue +/// Simple and good enough for most uses +template +class MPMCQueue +{ +public: + inline MPMCQueue() {} + + void Enqueue( const T& 
item ) + { + _mutex.lock(); + _queue.push( item ); + _mutex.unlock(); + } + + void Enqueue( const T* items, const size_t count ) + { + if( count < 1 ) + return; + + _mutex.lock(); + + for( size_t i = 0; i < count; i++ ) + _queue.push( items[i] ); + + _mutex.unlock(); + } + + size_t Dequeue( T* outItem, const size_t maxDequeue ) + { + _mutex.lock(); + + const size_t max = std::min( maxDequeue, _queue.size() ); + + for( size_t i = 0; i < max; i++ ) + { + outItem[i] = _queue.front(); + _queue.pop(); + } + _mutex.unlock(); + + return max; + } + + bool Dequeue( T* outItem ) + { + _mutex.lock(); + const bool hasItem = !_queue.empty(); + + if( hasItem ) + { + *outItem = _queue.front(); + _queue.pop(); + } + _mutex.unlock(); + + return hasItem; + } + +private: + std::mutex _mutex; + std::queue _queue; +}; + diff --git a/src/util/Span.h b/src/util/Span.h index 3624048b..df3e82c2 100644 --- a/src/util/Span.h +++ b/src/util/Span.h @@ -157,6 +157,8 @@ struct Span return Span( reinterpret_cast( values ), targetLength ); } + + inline bool IsEmpty() const { return length == 0; } }; typedef Span ByteSpan; diff --git a/src/util/StackAllocator.h b/src/util/StackAllocator.h index a8ac2573..6fb710b3 100644 --- a/src/util/StackAllocator.h +++ b/src/util/StackAllocator.h @@ -1,44 +1,5 @@ #pragma once - -class IAllocator -{ -public: - virtual ~IAllocator() {} - - virtual void* Alloc( const size_t size, const size_t alignment ) = 0; - - //----------------------------------------------------------- - template - inline T* AllocT( const size_t size, size_t alignment = alignof( T ) ) - { - return reinterpret_cast( Alloc( size, alignment ) ); - } - - //----------------------------------------------------------- - template - inline T* CAlloc( const size_t count, size_t alignment = alignof( T ) ) - { - const size_t allocSize = sizeof( T ) * count; - ASSERT( allocSize >= count ); - - return AllocT( allocSize, alignment ); - } - - //----------------------------------------------------------- - template - inline Span CAllocSpan( const size_t count, size_t alignment = alignof( T ) ) - { - return Span( this->CAlloc( count, alignment ), count ); - } - - //----------------------------------------------------------- - inline void* CAlloc( const size_t count, const size_t size, const size_t alignment ) - { - const size_t paddedSize = RoundUpToNextBoundaryT( size, alignment ); - - return Alloc( paddedSize * count, alignment ); - } -}; +#include "IAllocator.h" class IStackAllocator : public IAllocator { @@ -98,7 +59,7 @@ class StackAllocator : public IStackAllocator { // Start address must be aligned to the specified alignment const size_t paddedSize = RoundUpToNextBoundaryT( _size, alignment ); - + ASSERT( size > 0 ); ASSERT( _size < _capacity ); ASSERT( paddedSize <= _capacity ); diff --git a/src/util/Util.h b/src/util/Util.h index 7d38cdde..e4477e84 100644 --- a/src/util/Util.h +++ b/src/util/Util.h @@ -68,9 +68,9 @@ /// /// Assorted utility functions /// -void Exit( int code ); -void FatalExit(); -void PanicExit(); +[[noreturn]] void Exit( int code ); +[[noreturn]] void FatalExit(); +[[noreturn]] void PanicExit(); void FatalErrorMsg( const char* message, ... ); void PanicErrorMsg( const char* message, ... 
diff --git a/src/util/Span.h b/src/util/Span.h
index 3624048b..df3e82c2 100644
--- a/src/util/Span.h
+++ b/src/util/Span.h
@@ -157,6 +157,8 @@ struct Span
         return Span<TR>( reinterpret_cast<TR*>( values ), targetLength );
     }
+
+    inline bool IsEmpty() const { return length == 0; }
 };
 
 typedef Span<uint8> ByteSpan;
diff --git a/src/util/StackAllocator.h b/src/util/StackAllocator.h
index a8ac2573..6fb710b3 100644
--- a/src/util/StackAllocator.h
+++ b/src/util/StackAllocator.h
@@ -1,44 +1,5 @@
 #pragma once
-
-class IAllocator
-{
-public:
-    virtual ~IAllocator() {}
-
-    virtual void* Alloc( const size_t size, const size_t alignment ) = 0;
-
-    //-----------------------------------------------------------
-    template<typename T>
-    inline T* AllocT( const size_t size, size_t alignment = alignof( T ) )
-    {
-        return reinterpret_cast<T*>( Alloc( size, alignment ) );
-    }
-
-    //-----------------------------------------------------------
-    template<typename T>
-    inline T* CAlloc( const size_t count, size_t alignment = alignof( T ) )
-    {
-        const size_t allocSize = sizeof( T ) * count;
-        ASSERT( allocSize >= count );
-
-        return AllocT<T>( allocSize, alignment );
-    }
-
-    //-----------------------------------------------------------
-    template<typename T>
-    inline Span<T> CAllocSpan( const size_t count, size_t alignment = alignof( T ) )
-    {
-        return Span<T>( this->CAlloc<T>( count, alignment ), count );
-    }
-
-    //-----------------------------------------------------------
-    inline void* CAlloc( const size_t count, const size_t size, const size_t alignment )
-    {
-        const size_t paddedSize = RoundUpToNextBoundaryT( size, alignment );
-
-        return Alloc( paddedSize * count, alignment );
-    }
-};
+#include "IAllocator.h"
 
 class IStackAllocator : public IAllocator
 {
@@ -98,7 +59,7 @@ class StackAllocator : public IStackAllocator
     {
         // Start address must be aligned to the specified alignment
        const size_t paddedSize = RoundUpToNextBoundaryT( _size, alignment );
-        
+
        ASSERT( size > 0 );
        ASSERT( _size < _capacity );
        ASSERT( paddedSize <= _capacity );
diff --git a/src/util/Util.h b/src/util/Util.h
index 7d38cdde..e4477e84 100644
--- a/src/util/Util.h
+++ b/src/util/Util.h
@@ -68,9 +68,9 @@
 ///
 /// Assorted utility functions
 ///
-void Exit( int code );
-void FatalExit();
-void PanicExit();
+[[noreturn]] void Exit( int code );
+[[noreturn]] void FatalExit();
+[[noreturn]] void PanicExit();
 void FatalErrorMsg( const char* message, ... );
 void PanicErrorMsg( const char* message, ... );
diff --git a/src/util/VirtualAllocator.h b/src/util/VirtualAllocator.h
index f3a2ee33..2b79755b 100644
--- a/src/util/VirtualAllocator.h
+++ b/src/util/VirtualAllocator.h
@@ -14,7 +14,7 @@ class VirtualAllocator : public IAllocator
         const size_t allocSize = PageAlign( size );
         _size += allocSize;
 
-        return bbvirtalloc( allocSize );
+        return bbvirtallocbounded( allocSize );
     }
 
     inline void* TryAlloc( const size_t size )
diff --git a/tests/TestDiskQueue.cpp b/tests/TestDiskQueue.cpp
new file mode 100644
index 00000000..7d8f5bfc
--- /dev/null
+++ b/tests/TestDiskQueue.cpp
@@ -0,0 +1,172 @@
+#include "TestUtil.h"
+#include "plotting/DiskQueue.h"
+#include "plotting/DiskBucketBuffer.h"
+#include "plotting/DiskBuffer.h"
+#include "util/VirtualAllocator.h"
+
+constexpr uint32 bucketCount      = 64;
+constexpr uint32 entriesPerBucket = 1 << 16;
+constexpr uint32 entriesPerSlice  = entriesPerBucket / bucketCount;
+
+static void WriteBucketSlices( DiskBucketBuffer* buf, uint32 bucket, uint32 mask, Span<size_t> sliceSizes );
+
+//-----------------------------------------------------------
+TEST_CASE( "disk-slices", "[disk-queue]" )
+{
+    const char* tempPath = GetEnv( "bb_queue_path", "/Users/harito/.sandbox/plot" );
+
+    DiskQueue queue( tempPath );
+
+    auto buf = std::unique_ptr<DiskBucketBuffer>( DiskBucketBuffer::Create(
+        queue, "slices.tmp", bucketCount, sizeof( uint32 ) * entriesPerSlice,
+        FileMode::Create, FileAccess::ReadWrite ) );
+
+    ENSURE( buf.get() );
+
+    {
+        VirtualAllocator allocator{};
+        buf->ReserveBuffers( allocator );
+    }
+
+    size_t _sliceSizes[bucketCount] = {};
+    for( uint32 i = 0; i < bucketCount; i++ )
+        _sliceSizes[i] = entriesPerSlice * sizeof( uint32 );
+
+    Span<size_t> sliceSizes( _sliceSizes, bucketCount );
+
+    // Write a whole "table"'s worth of data
+    for( uint32 b = 0; b < bucketCount; b++ )
+    {
+        WriteBucketSlices( buf.get(), b, 0, sliceSizes );
+    }
+
+    // Read back
+    buf->Swap();
+    const uint32 secondMask = 0xF0000000;
+
+    {
+        buf->ReadNextBucket();
+
+        for( uint32 b = 0; b < bucketCount; b++ )
+        {
+            buf->TryReadNextBucket();
+
+            auto input = buf->GetNextReadBufferAs<uint32>();
+
+            const uint32 readMask = b << 16;
+
+            ENSURE( input.Length() == entriesPerBucket );
+
+            // Validate
+            for( uint32 i = 0; i < input.Length(); i++ )
+            {
+                ENSURE( input[i] == (readMask | i) );
+            }
+
+            // Write new bucket
+            WriteBucketSlices( buf.get(), b, secondMask, sliceSizes );
+        }
+    }
+
+    // Read again and validate the second pass
+    buf->Swap();
+    {
+        buf->ReadNextBucket();
+
+        for( uint32 b = 0; b < bucketCount; b++ )
+        {
+            buf->TryReadNextBucket();
+
+            auto input = buf->GetNextReadBufferAs<uint32>();
+
+            const uint32 readMask = secondMask | (b << 16);
+
+            ENSURE( input.Length() == entriesPerBucket );
+
+            // Validate
+            for( uint32 i = 0; i < input.Length(); i++ )
+            {
+                ENSURE( input[i] == (readMask | i) );
+            }
+        }
+    }
+
+    buf->Swap();
+    Log::Line( "Ok" );
+}
+
+//-----------------------------------------------------------
+TEST_CASE( "disk-buckets", "[disk-queue]" )
+{
+    const char* tempPath = GetEnv( "bb_queue_path", "/Users/harito/.sandbox/plot" );
+    DiskQueue queue( tempPath );
+
+    auto buffer = std::unique_ptr<DiskBuffer>( DiskBuffer::Create(
+        queue, "bucket.tmp",
+        bucketCount, sizeof( uint32 ) * entriesPerBucket,
+        FileMode::Create, FileAccess::ReadWrite ) );
+
+    ENSURE( buffer );
+    {
+        VirtualAllocator allocator{};
+        buffer->ReserveBuffers( allocator );
+    }
+
+    // Write buckets
+    {
+        for( uint32 b = 0; b < bucketCount; b++ )
+        {
+            auto bucket = buffer->GetNextWriteBufferAs<uint32>();
+
+            for( uint32 i = 0; i < entriesPerBucket; i++ )
+                bucket[i] = b * entriesPerBucket + i;
+
+            buffer->Submit( entriesPerBucket * sizeof( uint32 ) );
+        }
+    }
+
+    // Read back buckets
+    buffer->Swap();
+
+    {
+        buffer->ReadNextBucket();
+        for( uint32 b = 0; b < bucketCount; b++ )
+        {
+            buffer->TryReadNextBucket();
+
+            auto bucket = buffer->GetNextReadBufferAs<uint32>();
+
+            // Validate
+            ENSURE( bucket.Length() == entriesPerBucket );
+            for( uint32 i = 0; i < entriesPerBucket; i++ )
+            {
+                ENSURE( bucket[i] == b * entriesPerBucket + i );
+            }
+        }
+    }
+
+    Log::Line( "Ok" );
+}
+
+//-----------------------------------------------------------
+void WriteBucketSlices( DiskBucketBuffer* buf, uint32 bucket, uint32 writeMask, Span<size_t> sliceSizes )
+{
+    const uint32 base = entriesPerSlice * bucket;
+
+    auto slices = buf->GetNextWriteBufferAs<uint32>();
+
+    for( uint32 slice = 0; slice < bucketCount; slice++ )
+    {
+        const uint32 mask = writeMask | (slice << 16);
+
+        for( uint32 i = 0; i < entriesPerSlice; i++ )
+            slices[i] = mask | (base + i);
+
+        slices = slices.Slice( buf->GetSliceStride() / sizeof( uint32 ) );
+    }
+
+    // Submit next buffer
+    buf->Submit();
+}
+
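For reference when reading the disk-slices expectations: WriteBucketSlices packs the optional writeMask into the top nibble, the destination slice index into bits 16..21, and the writer's entry index (entriesPerSlice * bucket + i) into the low 16 bits, which is what the read loops rebuild as readMask | i. A small decode sketch under those assumptions (DecodeTestEntry is illustrative, not part of the tests):

struct DecodedTestEntry
{
    uint32 writeMask;   // 0 on the first pass, 0xF0000000 on the second
    uint32 sliceIndex;  // destination slice, equal to the bucket it is read back from
    uint32 entryIndex;  // entriesPerSlice * writerBucket + i
};

static DecodedTestEntry DecodeTestEntry( const uint32 value )
{
    DecodedTestEntry e;
    e.writeMask  = value & 0xF0000000u;
    e.sliceIndex = (value >> 16) & 0x3F;   // bucketCount == 64, so 6 bits
    e.entryIndex = value & 0xFFFFu;
    return e;
}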