diff --git a/.github/actions/build-asset-unix.sh b/.github/actions/build-asset-unix.sh
index b7fc8920..d96f234f 100755
--- a/.github/actions/build-asset-unix.sh
+++ b/.github/actions/build-asset-unix.sh
@@ -1,12 +1,28 @@
#! /usr/bin/env bash
# NOTE: This is meant to be run from the repo root dir
-#
-# Expects env variables:
-# - BB_ARTIFACT_NAME
-# - BB_VERSION
-#
+
set -eo pipefail
+compile_cuda=0
+artifact_name=bladebit
+version=v1.0
+
+while true; do
+ case $1 in
+ --cuda)
+ compile_cuda=1
+ ;;
+ --artifact)
+ shift && artifact_name=$1 || exit 1
+ ;;
+ --version)
+ shift && version=$1 || exit 1
+ ;;
+ esac
+ shift || break
+done
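+# Usage (as invoked by the workflows):
+#   build-asset-unix.sh [--cuda] --artifact <artifact-name> --version <version>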
+
+
thread_count=2
if [[ $OSTYPE == 'darwin'* ]]; then
@@ -19,11 +35,19 @@ fi
echo "System: $(uname -s)"
gcc --version
-mkdir build && cd build
-cmake ..
+exe_name=bladebit
+target=bladebit
+if [[ $compile_cuda -eq 1 ]]; then
+ target=bladebit_cuda
+ exe_name=bladebit_cuda
+fi
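+# Build in a per-target directory (build-bladebit / build-bladebit_cuda) so the CPU and CUDA
+# builds can run back-to-back within the same job without clobbering each other.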
+
+set -x
+mkdir build-${target} && cd build-${target}
+cmake .. -DCMAKE_BUILD_TYPE=Release
bash -eo pipefail ../embed-version.sh
-cmake --build . --target bladebit --config Release -j $thread_count
-chmod +x ./bladebit
+cmake --build . --config Release --target $target -j $thread_count
+chmod +x ./${exe_name}
if [[ $OSTYPE == 'msys'* ]] || [[ $OSTYPE == 'cygwin'* ]]; then
ls -la Release
@@ -32,16 +56,16 @@ else
fi
# Ensure bladebit version matches expected version
-bb_version="$(./bladebit --version | xargs)"
+bb_version="$(./${exe_name} --version | xargs)"
-if [[ "$bb_version" != "$BB_VERSION" ]]; then
- >&2 echo "Incorrect bladebit version. Got '$bb_version' but expected '$BB_VERSION'."
+if [[ "$bb_version" != "$version" ]]; then
+ >&2 echo "Incorrect bladebit version. Got '$bb_version' but expected '$version'."
exit 1
fi
tar --version
-tar -czvf $BB_ARTIFACT_NAME bladebit
-mkdir ../bin
-mv $BB_ARTIFACT_NAME ../bin/
+tar -czvf $artifact_name $exe_name
+mkdir -p ../bin
+mv $artifact_name ../bin/
ls -la ../bin
diff --git a/.github/actions/build-harvester.sh b/.github/actions/build-harvester.sh
new file mode 100644
index 00000000..2460a279
--- /dev/null
+++ b/.github/actions/build-harvester.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+set -eo pipefail
+if [[ $RUNNER_DEBUG = 1 ]]; then
+ set -x
+fi
+
+host_os=$(uname -a)
+case "${host_os}" in
+ Linux*) host_os="linux";;
+ Darwin*) host_os="macos";;
+ CYGWIN*) host_os="windows";;
+ MINGW*) host_os="windows";;
+ *Msys) host_os="windows";;
+esac
+
+if [[ "$host_os" == "windows" ]]; then
+ ext="zip"
+else
+ ext="tar.gz"
+fi
+
+if [[ "$host_os" == "macos" ]]; then
+ procs=$(sysctl -n hw.logicalcpu)
+ sha_sum="shasum -a 256"
+else
+ procs=$(nproc --all)
+ sha_sum="sha256sum"
+fi
+
+artifact_name=green_reaper.$ext
+
+while true; do
+ case $1 in
+ --artifact)
+ shift && artifact_name=$1 || exit 1
+ ;;
+ esac
+ shift || break
+done
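+# Example invocation (as done by the build-harvester-* workflow jobs):
+#   bash .github/actions/build-harvester.sh --artifact "green_reaper-v<version>-<os>-<arch>.tar.gz"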
+
+echo "Harvester artifact: ${artifact_name}"
+echo 'cmake --version'
+cmake --version
+
+mkdir -p build-harvester
+pushd build-harvester
+cmake .. -DCMAKE_BUILD_TYPE=Release -DBB_HARVESTER_ONLY=ON
+
+cmake --build . --config Release --target bladebit_harvester
+
+if [[ "$host_os" == "windows" ]]; then
+ OBJDUMP=$("${CUDA_PATH}"\\bin\\cuobjdump Release\\bladebit_harvester.dll)
+elif [[ "$host_os" == "linux" ]]; then
+ OBJDUMP=$(/usr/local/cuda/bin/cuobjdump libbladebit_harvester.so)
+fi
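+# $OBJDUMP is parsed further below into the GitHub step summary table (Arch / Code Version / Host / Compile Size).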
+
+cmake --install . --prefix harvester_dist
+pushd harvester_dist/green_reaper
+
+if [[ "$host_os" == "windows" ]]; then
+ mkdir -p lib
+ cp -vn ../../*/*.dll lib/
+ cp -vn ../../*/*.lib lib/
+fi
+
+artifact_files=($(find . -type f -name '*.*' | cut -c3-))
+
+# shellcheck disable=SC2068
+$sha_sum ${artifact_files[@]} > sha256checksum
+
+artifact_files+=("sha256checksum")
+
+if [[ "$host_os" == "windows" ]]; then
+ 7z.exe a -tzip "${artifact_name}" "${artifact_files[@]}"
+else
+ # shellcheck disable=SC2068
+ tar -czvf "${artifact_name}" ${artifact_files[@]}
+fi
+
+popd
+mv "harvester_dist/green_reaper/${artifact_name}" ./
+$sha_sum "${artifact_name}" > "${artifact_name}.sha256.txt"
+ls -la
+cat "${artifact_name}.sha256.txt"
+
+if [[ "$CI" == "true" ]]; then
+ if [[ "$host_os" == "windows" ]] || [[ "$host_os" == "linux" ]]; then
+ while IFS= read -r line; do
+ echo -e "$(echo ${line#* } | tr -d '*')\n###### ${line%% *}\n"
+ done <"${artifact_name}.sha256.txt" >> "$GITHUB_STEP_SUMMARY"
+ echo "| Arch | Code Version | Host | Compile Size |" >> "$GITHUB_STEP_SUMMARY"
+ echo "| --- | --- | --- | --- |" >> "$GITHUB_STEP_SUMMARY"
+ echo "$OBJDUMP" | awk -v RS= -v FS='\n' -v OFS=' | ' '{
+ for (i=1; i<=NF; i++) {
+ if (index($i, "=")) {
+ gsub(/.* = /, "", $i);
+ }
+ }
+ print $3, $4, $5, $6;
+ }' | sed 's/^/| /; s/$/ |/; s/ | | / | /g' >> "$GITHUB_STEP_SUMMARY"
+ fi
+
+ if [[ "$host_os" == "windows" ]]; then
+ harvester_artifact_path="$(cygpath -m "$(pwd)/${artifact_name}")*"
+ else
+ harvester_artifact_path="$(pwd)/${artifact_name}*"
+ fi
+ echo "harvester_artifact_path=$harvester_artifact_path"
+ echo "harvester_artifact_path=$harvester_artifact_path" >> "$GITHUB_ENV"
+fi
+
+popd
+ls -la
diff --git a/.github/actions/get-version.sh b/.github/actions/get-version.sh
index 81dea115..16c51dda 100755
--- a/.github/actions/get-version.sh
+++ b/.github/actions/get-version.sh
@@ -29,6 +29,8 @@ if [[ "$os" == "windows" ]]; then
ext="zip"
fi
-echo "::set-output name=BB_VERSION::$version"
-echo "::set-output name=BB_ARTIFACT_NAME::bladebit-v${version}-${os}-${arch}.${ext}"
+echo "BB_VERSION=$version" >> $GITHUB_ENV
+echo "BB_ARTIFACT_NAME=bladebit-v${version}-${os}-${arch}.${ext}" >> $GITHUB_ENV
+echo "BB_ARTIFACT_NAME_CUDA=bladebit-cuda-v${version}-${os}-${arch}.${ext}" >> $GITHUB_ENV
+
diff --git a/.github/actions/install-cmake-linux.sh b/.github/actions/install-cmake-linux.sh
new file mode 100644
index 00000000..ec75913a
--- /dev/null
+++ b/.github/actions/install-cmake-linux.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -eo pipefail
+ref_cmake_sha256='39e1c2eccda989b0d000dc5f4ee2cb031bdda799163780d855acc0bd9eda9d92'
+cmake_name='cmake-3.23.3-linux-x86_64'
+
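+# Download the pinned CMake release and verify it against ref_cmake_sha256 before installing into /usr/local.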
+curl -L https://github.com/Kitware/CMake/releases/download/v3.23.3/cmake-3.23.3-linux-x86_64.tar.gz > cmake.tar.gz
+
+cmake_sh_sha256=$(sha256sum cmake.tar.gz | cut -f1 -d' ')
+if [[ "${ref_cmake_sha256}" != "${cmake_sh_sha256}" ]]; then
+    >&2 echo "sha256 mismatch!"
+    >&2 echo "Got     : '${cmake_sh_sha256}'"
+    >&2 echo "Expected: '${ref_cmake_sha256}'"
+ exit 1
+fi
+
+rm -f /usr/bin/cmake && rm -f /usr/local/bin/cmake
+mkdir -p /usr/local/bin
+mkdir -p /usr/local/share
+
+cmake_prefix=$(pwd)/${cmake_name}
+tar -xzvf cmake.tar.gz
+ls -la
+ls -la ${cmake_prefix}
+
+cp -r ${cmake_prefix}/bin/* /usr/local/bin/
+cp -r ${cmake_prefix}/share/* /usr/local/share/
+
+echo 'Cmake Info:'
+which cmake
+cmake --version
+
+echo 'Done.'
+exit 0
diff --git a/.github/workflows/attach-release-assets.yml b/.github/workflows/attach-release-assets.yml
index 50818edd..d0df509d 100644
--- a/.github/workflows/attach-release-assets.yml
+++ b/.github/workflows/attach-release-assets.yml
@@ -35,8 +35,15 @@ jobs:
bladebit-v${BB_VERSION}-ubuntu-arm64.tar.gz
bladebit-v${BB_VERSION}-centos-arm64.tar.gz
bladebit-v${BB_VERSION}-windows-x86-64.zip
- bladebit-v${BB_VERSION}-macos-arm64.tar.gz
- bladebit-v${BB_VERSION}-macos-x86-64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-ubuntu-x86-64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-centos-x86-64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-ubuntu-arm64.tar.gz
+ bladebit-cuda-v${BB_VERSION}-windows-x86-64.zip
+ green_reaper-v${BB_VERSION}-linux-x86-64.tar.gz
+ green_reaper-v${BB_VERSION}-linux-ARM64.tar.gz
+ green_reaper-v${BB_VERSION}-macos-x86-64.tar.gz
+ green_reaper-v${BB_VERSION}-macos-arm64.tar.gz
+ green_reaper-v${BB_VERSION}-windows-x86-64.zip
)
mkdir -p bin
@@ -59,4 +66,4 @@ jobs:
echo "Uploading release asset '${artifact_name}'"
node .github/actions/artifacts.mjs upload-release-asset $BB_VERSION $artifact_name bin/$artifact_name
done
-
+
diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index a70fbacb..0cfc70cc 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -4,100 +4,339 @@ on:
branches: ['*']
workflow_dispatch:
+env:
+ CI_BLADEBIT: 1 # Our own CI, that is, not being built as a dependency
+
jobs:
- build-ubuntu-x86-64:
+ build-harvester-linux-x86-64:
runs-on: ubuntu-20.04
+ container:
+ image: ghcr.io/chia-network/build-images/manylinux2014_cuda_x86_64:sha-1caf046d5ff19b7c743de2a106dd86928794032b
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
id: version_number
- run: .github/actions/get-version.sh ubuntu x86-64
+ shell: bash
+ run: ./.github/actions/get-version.sh ubuntu x86-64
- name: Install Prerequisites
- run: sudo apt install -y libgmp-dev libnuma-dev
+ shell: bash
+ run: |
+ set -eo pipefail
+ yum group install -y "Development Tools"
+ yum install -y sudo make git wget subscription-manager
- - name: Build
+ - name: Build Harvester
+ shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-linux-x86-64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+
+ - name: Upload Harvester Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
+ if-no-files-found: error
+
+ build-harvester-windows-x86-64:
+ runs-on: windows-2022
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ shell: bash
+ run: ./.github/actions/get-version.sh windows x86-64
+
+ - name: Install Prerequisites
+ shell: powershell
+ run: |
+ choco install -y make
+ choco install -y wget
+ choco install -y sed
+
+ - name: Setup CUDA
+ uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
+ with:
+ cuda: '12.1.0'
+ method: network
- - name: Upload Artifact Ubuntu x86-64
+ - name: Verify CUDA
+ shell: bash
+ run: |
+ echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+ echo "Cuda install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}"
+ nvcc -V
+
+ - name: Build Harvester
+ shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-windows-x86-64.zip"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+ env:
+ CUDA_PATH: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+
+ - name: Upload Harvester Artifact
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
if-no-files-found: error
- build-centos-x86-64:
- runs-on: ubuntu-20.04
+ build-harvester-linux-arm64:
+ runs-on: [ARM64, Linux]
container:
- image: quay.io/centos/centos:stream8
+ image: ghcr.io/chia-network/build-images/manylinux2014_cuda_aarch64:sha-1caf046d5ff19b7c743de2a106dd86928794032b
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Cache DNF packages
+ uses: actions/cache@v3
+ with:
+ path: /var/cache/dnf
+ key: ${{ runner.os }}-dnf-${{ hashFiles('**/your-build-file') }}
+ restore-keys: |
+ ${{ runner.os }}-dnf-
+
- name: Get Version Number
id: version_number
- run: .github/actions/get-version.sh centos x86-64
+ shell: bash
+ run: ./.github/actions/get-version.sh centos arm64
- name: Install Prerequisites
+ shell: bash
run: |
- dnf install -y gcc-toolset-9-gcc gcc-toolset-9-gcc-c++ \
- cmake gmp-devel numactl-devel make git
+ set -eo pipefail
+ export module_platform_id=platform:el9
+ export MODULE_PLATFORM_ID=platform:el9
+ export PLATFORM_ID=platform:el9
+ uname -a
+ cat /etc/os-release
+ yum install -y dnf
+ dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+ dnf install -y dnf-plugins-core
+ dnf makecache
+ dnf install -y kernel-headers.aarch64 kernel-devel.aarch64 tar bzip2 make automake gcc gcc-c++ pciutils elfutils-libelf-devel libglvnd-opengl libglvnd-glx libglvnd-devel acpid pkgconfig dkms
+ dnf install -y cmake
+ dnf group install -y "Development Tools"
+ dnf install -y gmp-devel numactl-devel make git wget sed
+
+ - name: Build Harvester
+ shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-linux-ARM64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
- - name: Build
+ - name: Upload Harvester Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
+ if-no-files-found: error
+
+ build-harvester-macos-arm64:
+ runs-on: [macos, arm64]
+ steps:
+ - name: Cleanup Environment
+ uses: Chia-Network/actions/clean-workspace@main
+
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ run: bash -e .github/actions/get-version.sh macos arm64
+
+ - name: Build Harvester
+ shell: bash
run: |
- source /opt/rh/gcc-toolset-9/enable
- .github/actions/build-asset-unix.sh
-
- - name: Upload Artifact CentOS x86-64
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-macos-arm64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+
+ - name: Upload Harvester Artifact
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
if-no-files-found: error
- build-ubuntu-arm64:
- runs-on: [ARM64, Linux]
- container:
- image: chianetwork/ubuntu-20.04-builder:latest
- defaults:
- run:
+ build-harvester-macos-x86-64:
+ runs-on: macOS-11
+ steps:
+ - name: Cleanup Environment
+ uses: Chia-Network/actions/clean-workspace@main
+
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ run: bash -e .github/actions/get-version.sh macos x86-64
+
+ - name: Build Harvester
shell: bash
+ run: |
+ export artifact_name="green_reaper-v${{ env.BB_VERSION }}-macos-x86-64.tar.gz"
+ echo "harvester_artifact_name=${artifact_name}" >> "$GITHUB_ENV"
+ # emits env.harvester_artifact_path
+ bash .github/actions/build-harvester.sh --artifact "${artifact_name}"
+
+ - name: Upload Harvester Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.harvester_artifact_name }}
+ path: ${{ env.harvester_artifact_path }}
+ if-no-files-found: error
+
+
+ build-bladebit-ubuntu-x86-64:
+ runs-on: ubuntu-20.04
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
id: version_number
- run: .github/actions/get-version.sh ubuntu arm64
+ run: .github/actions/get-version.sh ubuntu x86-64
- name: Install Prerequisites
run: |
- export DEBIAN_FRONTEND=noninteractive
- apt update
- apt install -y build-essential git libgmp-dev libnuma-dev
+ sudo apt install -y libgmp-dev libnuma-dev
+ sudo bash .github/actions/install-cmake-linux.sh
- - name: Build
+ - name: Build Bladebit
+ run: .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{env.BB_VERSION}}
+
+ - name: Setup CUDA
+ uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
+ with:
+ cuda: '12.1.0'
+ method: network
+ linux-local-args: '["--toolkit"]'
+
+ - name: Build Bladebit CUDA
+ run: .github/actions/build-asset-unix.sh --cuda --artifact ${{ env.BB_ARTIFACT_NAME_CUDA }} --version ${{env.BB_VERSION}}
+
+ - name: Upload Bladebit Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
+ if-no-files-found: error
+
+ - name: Upload Bladebit CUDA Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
+ if-no-files-found: error
+
+ build-bladebit-centos-x86-64:
+ runs-on: ubuntu-20.04
+ container:
+ image: quay.io/centos/centos:stream8
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- - name: Upload Artifact Ubuntu ARM64
+ - name: Get Version Number
+ id: version_number
+ run: .github/actions/get-version.sh centos x86-64
+
+ - name: Install Prerequisites
+ run: |
+ set -eo pipefail
+ dnf install -y gcc-toolset-9-gcc gcc-toolset-9-gcc-c++ \
+ gmp-devel numactl-devel make git wget subscription-manager
+ bash .github/actions/install-cmake-linux.sh
+
+ - name: Build Bladebit
+ run: |
+ source /opt/rh/gcc-toolset-9/enable
+ .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{ env.BB_VERSION }}
+
+ - name: Install CUDA Prerequisites
+ run: |
+ distro=rhel8
+ arch=x86_64
+ dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+ dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-$distro.repo
+ dnf clean expire-cache
+ dnf module install -y nvidia-driver:latest-dkms
+ dnf install -y cuda
+ ls -la /usr/local/
+ ls -la /usr/local/cuda/
+
+ - name: Build Bladebit CUDA
+ run: |
+ source /opt/rh/gcc-toolset-9/enable
+ .github/actions/build-asset-unix.sh --cuda --artifact ${{ env.BB_ARTIFACT_NAME_CUDA }} --version ${{ env.BB_VERSION }}
+
+ - name: Upload Bladebit Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
+ if-no-files-found: error
+
+ - name: Upload Bladebit CUDA Artifact
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
if-no-files-found: error
- build-centos-arm64:
+ build-bladebit-centos-arm64:
runs-on: [ARM64, Linux]
container:
image: quay.io/centos/centos:stream8
@@ -105,6 +344,11 @@ jobs:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
id: version_number
run: .github/actions/get-version.sh centos arm64
@@ -112,44 +356,100 @@ jobs:
- name: Install Prerequisites
run: |
dnf install -y gcc-toolset-9-gcc gcc-toolset-9-gcc-c++ \
- cmake gmp-devel numactl-devel make git
+ cmake gmp-devel numactl-devel make git
- name: Build
- env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
run: |
source /opt/rh/gcc-toolset-9/enable
- .github/actions/build-asset-unix.sh
+ .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{ env.BB_VERSION }}
- name: Upload Artifact CentOS ARM64
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
if-no-files-found: error
- build-windows-x86-64:
+ build-bladebit-cuda-linux-arm64:
+ runs-on: [ARM64, Linux]
+ container:
+ image: chianetwork/ubuntu-20.04-builder:latest
+ defaults:
+ run:
+ shell: bash
+ steps:
+ - name: Checkout Repo
+ uses: actions/checkout@v3
+
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Get Version Number
+ id: version_number
+ run: .github/actions/get-version.sh ubuntu arm64
+
+ - name: Install Prerequisites
+ run: |
+ export DEBIAN_FRONTEND=noninteractive
+ apt update
+ apt install -y build-essential git libgmp-dev libnuma-dev
+ - name: Setup CUDA
+ run: |
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin
+ mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda-repo-ubuntu2004-12-1-local_12.1.1-530.30.02-1_arm64.deb
+ dpkg -i cuda-repo-ubuntu2004-12-1-local_12.1.1-530.30.02-1_arm64.deb
+ cp /var/cuda-repo-ubuntu2004-12-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
+ apt-get update
+ DEBIAN_FRONTEND=noninteractive apt-get -y install cuda
+
+ - name: Build
+ run: .github/actions/build-asset-unix.sh --artifact ${{ env.BB_ARTIFACT_NAME }} --version ${{env.BB_VERSION}}
+
+ - name: Upload Artifact Ubuntu ARM64
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
+ if-no-files-found: error
+
+ - name: Build Bladebit CUDA
+ run: |
+ .github/actions/build-asset-unix.sh --cuda --artifact ${{ env.BB_ARTIFACT_NAME_CUDA }} --version ${{ env.BB_VERSION }}
+ - name: Upload Bladebit CUDA Artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
+ if-no-files-found: error
+
+ build-bladebit-windows-x86-64:
runs-on: windows-2019
steps:
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
shell: bash
id: version_number
run: .github/actions/get-version.sh windows x86-64
- - name: Build
+ - name: Build Bladebit
shell: bash
env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
+ BB_ARTIFACT_NAME: ${{ env.BB_ARTIFACT_NAME }}
+ BB_VERSION: ${{env.BB_VERSION}}
run: |
-
mkdir build && cd build
cmake ..
- bash -e -o pipefail ../embed-version.sh
+ bash -eo pipefail ../embed-version.sh
cat ../src/Version.h
cmake --build . --target bladebit --config Release
@@ -160,74 +460,86 @@ jobs:
>&2 echo "Incorrect bladebit version. Got but '$bb_version' expected '$BB_VERSION'."
exit 1
fi
-
+
mkdir ../bin
cd Release
ls -la
7z.exe a -tzip ../../bin/${BB_ARTIFACT_NAME} bladebit.exe
ls -la ../../bin
- - name: Upload Artifact Windows x86-64
+ - name: Upload Bladebit Artifact Windows x86-64
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME }}
if-no-files-found: error
- build-macos-arm64:
- runs-on: [macOS, ARM64]
+ build-bladebit-cuda-windows-x86-64:
+ runs-on: windows-2019
steps:
- - name: Cleanup Environment
- uses: Chia-Network/actions/clean-workspace@main
-
- name: Checkout Repo
uses: actions/checkout@v3
+ - name: Set Env
+ uses: Chia-Network/actions/setjobenv@main
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Get Version Number
+ shell: bash
id: version_number
- run: bash -e .github/actions/get-version.sh macos arm64
+ run: .github/actions/get-version.sh windows x86-64
- name: Install Prerequisites
- run: brew install cmake
-
- - name: Build
- env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ shell: powershell
+ run: |
+ choco install -y make
+ choco install -y wget
+ choco install -y sed
- - name: Upload Artifact macOS arm64
- uses: actions/upload-artifact@v3
+ - name: Setup CUDA
+ uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- if-no-files-found: error
+ cuda: '12.1.0'
+ method: network
- build-macos-x86-64:
- runs-on: macOS-11
- steps:
- - name: Cleanup Environment
- uses: Chia-Network/actions/clean-workspace@main
+ - name: Verify CUDA
+ shell: bash
+ run: |
+ echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+ echo "Cuda install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}"
+ nvcc -V
- - name: Checkout Repo
- uses: actions/checkout@v3
+ - name: Build Bladebit CUDA
+ shell: bash
+ env:
+ BB_ARTIFACT_NAME_CUDA: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ BB_VERSION: ${{env.BB_VERSION}}
+ run: |
+ mkdir build_cuda && cd build_cuda
+ cmake ..
+ bash -eo pipefail ../embed-version.sh
+ cat ../src/Version.h
+ cmake --build . --target bladebit_cuda --config Release
- - name: Get Version Number
- id: version_number
- run: .github/actions/get-version.sh macos x86-64
+ # Ensure bladebit version matches expected version
+ bb_version="$(./Release/bladebit_cuda.exe --version | xargs)"
- - name: Install Prerequisites
- run: brew install cmake
+ if [[ "$bb_version" != "$BB_VERSION" ]]; then
+ >&2 echo "Incorrect bladebit version. Got but '$bb_version' expected '$BB_VERSION'."
+ exit 1
+ fi
- - name: Build
- env:
- BB_ARTIFACT_NAME: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- BB_VERSION: ${{steps.version_number.outputs.BB_VERSION}}
- run: .github/actions/build-asset-unix.sh
+ mkdir ../bin
+ cd Release
+ ls -la
+ 7z.exe a -tzip ../../bin/${BB_ARTIFACT_NAME_CUDA} bladebit_cuda.exe
+ ls -la ../../bin
- - name: Upload Artifact macOS x86-64
+ - name: Upload Bladebit CUDA Artifact Windows x86-64
uses: actions/upload-artifact@v3
with:
- name: ${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
- path: ${{ github.workspace }}/bin/${{ steps.version_number.outputs.BB_ARTIFACT_NAME }}
+ name: ${{ env.BB_ARTIFACT_NAME_CUDA }}
+ path: ${{ github.workspace }}/bin/${{ env.BB_ARTIFACT_NAME_CUDA }}
if-no-files-found: error
diff --git a/.idea/.name b/.idea/.name
new file mode 100644
index 00000000..1e51b03a
--- /dev/null
+++ b/.idea/.name
@@ -0,0 +1 @@
+bladebit
\ No newline at end of file
diff --git a/.idea/qmlSettings.xml b/.idea/qmlSettings.xml
new file mode 100644
index 00000000..b08a4e58
--- /dev/null
+++ b/.idea/qmlSettings.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index e88cc20c..54a1aefd 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -3,39 +3,12 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 246a5532..6957af27 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -33,10 +33,12 @@
"args": [
"-f", "ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef",
"-p", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8",
- "-i", "c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835",
- // "-n", "1",
+ // "-i", "c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835",
+ "-i", "5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863",
+ // "-n", "2",
"-w",
"-v",
+ "--compress", "4",
"ramplot",
"~/plot/tmp"
]
@@ -86,11 +88,13 @@
"--show-memo",
+ // "--compress", "6",
+
"diskplot",
"-t1", "~/plot/tmp",
"--f1-threads", "24",
- // "--fp-threads", "62",
+ "--fp-threads", "62",
"--c-threads", "28",
"--p2-threads", "24",
@@ -101,14 +105,14 @@
// "--cache", "64G",
// "-s",
// "--k32-bounded",
- "-b", "64",
+ // "-b", "64",
// "--sizes",
- // "-b", "128",
+ "-b", "128",
// "-b", "256",
- "--c-threads", "26",
- "--p2-threads", "24",
- "--p3-threads", "48",
+ // "--c-threads", "26",
+ // "--p2-threads", "24",
+ // "--p3-threads", "48",
"~/plot/tmp"
],
@@ -117,6 +121,78 @@
"environment": []
},
+ {
+ "name" : "Bladebit CUDA",
+
+ "type" : "cuda-gdb",
+ "request" : "launch",
+ "stopAtEntry" : false,
+ "cwd" : "${workspaceFolder}",
+ "preLaunchTask" : "build_cuda_debug",
+
+ "program": "${workspaceFolder}/build/bladebit_cuda",
+
+ // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
+ // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
+ // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
+
+ "args":
+ // "-w --compress 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
+ "-w --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot",
+
+ "windows": {
+ "type": "cppvsdbg",
+ "program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
+ "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ }
+ },
+
+ {
+ "name" : "CUDA Harvest Test",
+
+ "type" : "cuda-gdb",
+ "request" : "launch",
+ "stopAtEntry" : false,
+ "cwd" : "${workspaceFolder}",
+ "preLaunchTask" : "build_cuda_debug",
+ "program": "${workspaceFolder}/build/bladebit_cuda",
+
+ // "preLaunchTask" : "build_debug",
+ // "program": "${workspaceFolder}/build/bladebit",
+
+ // "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
+ // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
+ // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
+
+ // "args": "-t 1 validate --cuda --f7 0 ~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "validate --cuda --f7 0 ~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot",
+ // "args": "validate --cuda --f7 0 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ // "args": "-t 1 simulate -n 5000 -p 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+ // "args": "-t 16 simulate -n 10 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "-t 16 simulate -n 10 ~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "validate --cuda --quality 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+ // "args": "validate --cuda --f7 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "args": "validate --quality 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+ // "args": "validate --cuda --quality 4 ~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "args": "validate --quality 4 ~/plot/tmp/plot-k32-2023-04-26-20-24-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "args": "validate --quality 98 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ // "args": "validate --cuda --quality 6 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ "args": "validate --cuda --f7 6 ~/plot/tmp/plot-k32-c09-2023-04-19-16-12-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "args": "validate --quality 6 ~/plot/tmp/plot-k32-2023-04-26-20-20-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+
+ "windows": {
+ "type": "cppvsdbg",
+ "program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
+ // "args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
+ "args": "validate --cuda --f7 0 D:/chia_test_plots/plot-k32-c01-2023-05-10-18-56-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ }
+ },
+
{
"name" : "IOTest",
@@ -162,6 +238,9 @@
"name" : "Tests",
"type" : "cppdbg",
+ "osx": {
+ "MIMode": "lldb",
+ },
"request" : "launch",
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
@@ -171,7 +250,16 @@
"program": "${workspaceRoot}/build/tests",
"environment": [
- // { "name": "bbtest_thread_count", "value": "2" }
+ // { "name": "bb_thread_count", "value": "60" }
+ { "name": "bb_iterations" , "value": "1" },
+ { "name": "bb_thread_count", "value": "4" },
+ { "name": "bb_f7" , "value": "0" },
+ { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
+ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c07-2023-02-08-17-35-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" }
+ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c04-2023-02-08-01-33-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" }
+ // { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c06-2023-02-14-21-43-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
+ { "name": "bb_clevel" , "value": "1" },
+ { "name": "bb_end_clevel" , "value": "1" },
],
"args": [
@@ -183,10 +271,12 @@
// "PairsAndMap"
// "bucket-slice-write"
// "line-point-deltas"
+ // "compressed-plot-proof"
+ // "compressed-plot-qualities"
+ "macos-threads"
]
}
-
,{
"name" : "Plot Tool",
@@ -208,23 +298,48 @@
"args": [
/// Validate
- // "-t", "32",
+ // "-t", "48",
// "-t", "1",
- "validate",
+
+ // "-t", "1", "validate", "--f7", "324", "~/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "validate", "--f7", "7", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "validate", "--cuda", "--f7", "4", "~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
+
+ // "--verify", "0x7d7ceb24ca25bac5f4c59b4400b23585bff254efa5b78f3085192e399fc74fdaab630f2cd74ea733eb9b82a5bc5582e8fd075c0591b2eef12adae264159a8eeeae5808202d1a10cffd1a0fcb64b1f43cd3941987cf606ba01434d43715cbe1773f01fe74288110606b2cd90063f01f0eca3ba515a2fb2a011ea73d7da3148895e046b09c3d393cad44411fe57671290e4f34ed7d2aafe6788effde2c965b814158a1fe1109b67cf2f9849dfa55568d68e3e5fa24605269499f30b61cb889b6256256e467de963c25d7fb47e6a4119f2f8719ec9acbd82f7d95b8196660fe43165490255730ddf870a4e48da1ea2050fef4608d7321d6a3eede07744d8847858d",
+ // "0x00000037ff04b8ee9355068689bd558eafe07cc7af47ad1574b074fc34d6913a", "c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835",
+
+ // // "--f7", "2534554965", "~/plot/tmp/plot-k32-2022-10-18-22-25-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "-t", "16", "validate", "--f7", "11", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"
- "--f7", "2534554965",
- "~/plot/tmp/plot-k32-2022-10-18-22-25-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harold/plot/tmp/plot-k32-c07-2023-02-08-17-35-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "~/plot/tmp/plot-k32-2023-02-08-17-39-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // // "~/plot/tmp/plot-k32-c04-2023-01-29-03-29-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/ramplot-k32-2023-01-30-09-04-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/disk/plot-k32-c04-2023-01-30-23-07-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/plot-k32-c04-2023-01-30-23-55-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "~/plot/tmp/plot-k32-c04-2023-01-31-01-00-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-22-57-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+ // // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-23-15-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"
+
+ // Simulation
+ "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
+ "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "-t", "30", "simulate", "-p", "2", "-n", "600", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"
// "-m",
- // "-u",
- // "~/plot/tmp/plot-k32-2022-10-17-15-05-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "-u", "~/plot/tmp/plot-k32-2022-10-26-23-58-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
// "/mnt/p5510a/disk_tmp/plot.dat"
+
+ // "--f7", "3983284117", "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+ // "--f7", "3983284117", "/home/harito/plot/tmp/gpu_1.plot",
/// Compare
// "plotcmp",
- // "/mnt/p5510a/disk_tmp/plot-k32-2022-04-12-13-53-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
- // "/mnt/p5510a/disk_tmp/plot-k32-2022-04-12-13-03-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
+ // "/home/harito/plot/tmp/gpu_1.plot.old",
+ // "/home/harold/plot-tmpfs/gpu_1.plot",
+ // "/home/harito/plot/tmp/gpu_1.plot",
+ // "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
]
},
diff --git a/.vscode/settings.json b/.vscode/settings.json
index be694e05..c6c5274d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -13,6 +13,7 @@
"*.ac": "shellscript",
"player": "json",
"*.userprefs": "xml",
+ "*.make": "makefile",
"memory": "cpp",
"cstddef": "cpp",
"string": "cpp",
@@ -101,11 +102,30 @@
"locale": "cpp",
"stack": "cpp",
"*.include": "cpp",
- "relic_core.h": "c"
+ "relic_core.h": "c",
+ "compare": "cpp",
+ "concepts": "cpp",
+ "numbers": "cpp",
+ "semaphore": "cpp",
+ "stop_token": "cpp",
+ "queue": "cpp",
+ "__memory": "cpp",
+ "filesystem": "cpp",
+ "__bits": "cpp",
+ "csignal": "cpp",
+ "cfenv": "cpp"
},
"cSpell.words": [
"Ryzen"
],
"C_Cpp.errorSquiggles": "Enabled",
- "cmake.configureOnOpen": true
+ "cmake.configureOnOpen": true,
+ "cmake.configureOnEdit": false,
+ "cmake.preferredGenerators": [
+ "Unix Makefiles",
+ "Visual Studio 17 2022"
+ ]
+ // "cmake.generator": "Unix Makefiles"
+ // "cmake.generator": "Visual Studio 17 2022"
+
}
\ No newline at end of file
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
index be6ce096..e98520f2 100644
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -35,6 +35,56 @@
}
},
+ {
+ "type" : "shell",
+ "label" : "build_cuda_debug",
+ "detail" : "Build CUDA Bladebit",
+ "command": "cmake",
+
+ "args": [
+ "--build", ".",
+ "--target", "bladebit_cuda",
+ "--config", "Debug",
+ "-j", "24"
+ ],
+
+ "problemMatcher": [ "$nvcc" ],
+
+ "options": {
+ "cwd": "${workspaceFolder}/build"
+ },
+
+ "group": {
+ "kind": "build",
+ "isDefault": true
+ }
+ },
+
+ {
+ "type" : "shell",
+ "label" : "build_harvester",
+ "detail" : "Build Bladebit Harvester",
+ "command": "cmake",
+
+ "args": [
+ "--build", ".",
+ "--target", "lib_bladebit_harvester",
+ "--config", "Debug",
+ "-j", "24"
+ ],
+
+ "problemMatcher": [ "$nvcc" ],
+
+ "options": {
+ "cwd": "${workspaceFolder}/build"
+ },
+
+ "group": {
+ "kind": "build",
+ "isDefault": false
+ }
+ },
+
{
"type" : "shell",
"label" : "rebuild_debug",
diff --git a/Bladebit.cmake b/Bladebit.cmake
new file mode 100644
index 00000000..6ce0ad97
--- /dev/null
+++ b/Bladebit.cmake
@@ -0,0 +1,305 @@
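+# bladebit_core: common core library linked by the bladebit and bladebit_cuda executables.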
+add_library(bladebit_core)
+target_link_libraries(bladebit_core PUBLIC bladebit_config)
+
+target_include_directories(bladebit_core PUBLIC
+ ${INCLUDE_DIRECTORIES}
+ ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_compile_definitions(bladebit_core PUBLIC
+ GR_NO_IMPORT=1
+ BB_NUMA_ENABLED=1
+)
+
+target_compile_options(bladebit_core PUBLIC ${preinclude_pch})
+
+target_link_libraries(bladebit_core PUBLIC
+ Threads::Threads
+ bls
+
+    $<$<PLATFORM_ID:Linux>:
+ ${NUMA_LIBRARY}
+ >
+)
+
+add_executable(bladebit
+ src/main.cpp
+ cuda/harvesting/CudaThresherDummy.cpp)
+
+target_link_libraries(bladebit PRIVATE bladebit_core)
+
+
+# Sources
+set(src_uint128
+ src/uint128_t/endianness.h
+ src/uint128_t/uint128_t.cpp
+ src/uint128_t/uint128_t.h
+)
+
+set(src_chacha8
+ src/pos/chacha8.cpp
+ src/pos/chacha8.h
+)
+
+set(src_fse
+ src/fse/bitstream.h
+ src/fse/compiler.h
+ src/fse/debug.c
+ src/fse/debug.h
+ src/fse/entropy_common.c
+ src/fse/error_private.h
+ src/fse/error_public.h
+ src/fse/fse_compress.c
+ src/fse/fse_decompress.c
+ src/fse/fse.h
+ src/fse/hist.c
+ src/fse/hist.h
+ src/fse/huf.h
+ src/fse/mem.h
+)
+
+set(src_blake3
+ src/b3/blake3.c
+ src/b3/blake3_dispatch.c
+ src/b3/blake3.h
+ src/b3/blake3_impl.h
+ src/b3/blake3_portable.c
+
+ $<${is_x86}:
+
+        $<$<CXX_COMPILER_ID:MSVC>:
+ src/b3/blake3_sse41.c
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ >
+        $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:
+ src/b3/blake3_avx2_x86-64_unix.S
+ src/b3/blake3_avx512_x86-64_unix.S
+ src/b3/blake3_sse41_x86-64_unix.S
+ >
+ >
+)
+
+set(src_bech32
+ src/bech32/segwit_addr.c
+ src/bech32/segwit_addr.h
+)
+
+set(src_bladebit
+
+ # third party
+    $<$<CXX_COMPILER_ID:MSVC>:
+ ${src_uint128}
+ >
+
+ ${src_chacha8}
+ ${src_fse}
+ ${src_blake3}
+ ${src_bech32}
+
+ # bladebit
+    $<$<PLATFORM_ID:Linux>:
+ src/platform/linux
+ src/platform/linux/SysHost_Linux.cpp
+ >
+
+    $<$<PLATFORM_ID:Darwin>:
+ src/platform/macos/SysHost_Macos.cpp
+ >
+
+    $<$<PLATFORM_ID:Linux,Darwin>:
+ src/platform/unix/FileStream_Unix.cpp
+ src/platform/unix/Thread_Unix.cpp
+ >
+
+    $<$<PLATFORM_ID:Windows>:
+ src/platform/win32/FileStream_Win32.cpp
+ src/platform/win32/SysHost_Win32.cpp
+ src/platform/win32/Thread_Win32.cpp
+ >
+
+ src/BLS.h
+ src/Config.h
+ src/ChiaConsts.h
+ src/Globals.h
+ src/Types.h
+ src/Platform.h
+ src/PlotContext.h
+ src/PlotContext.cpp
+ src/PlotWriter.h
+ src/PlotWriter.cpp
+ src/SysHost.cpp
+ src/SysHost.h
+ src/View.h
+ src/pch.cpp
+ src/pch.h
+ src/Version.h
+
+ src/algorithm/YSort.cpp
+ src/algorithm/YSort.h
+ src/algorithm/RadixSort.h
+
+ src/io/BucketStream.cpp
+ src/io/BucketStream.h
+ src/io/FileStream.cpp
+ src/io/FileStream.h
+ src/io/HybridStream.cpp
+ src/io/HybridStream.h
+ src/io/IOUtil.cpp
+ src/io/IOUtil.h
+ src/io/IStream.h
+ src/io/MemoryStream.h
+
+ src/plotdisk/BlockWriter.h
+ src/plotdisk/DiskFp.h
+ src/plotdisk/DiskPairReader.h
+ src/plotdisk/DiskPlotDebug.cpp
+ src/plotdisk/DiskPlotDebug.h
+ src/plotdisk/DiskPlotInfo.h
+ src/plotdisk/DiskPlotPhase2.h
+ src/plotdisk/DiskPlotPhase3.cpp.disabled
+ src/plotdisk/DiskPlotPhase3.h
+ src/plotdisk/FileId.h
+ src/plotdisk/FpFxGen.h
+ src/plotdisk/FpGroupMatcher.h
+ src/plotdisk/MapWriter.h
+
+ src/plotdisk/jobs/JobShared.h
+ src/plotdisk/jobs/LookupMapJob.h
+ src/plotdisk/jobs/UnpackMapJob.cpp
+ src/plotdisk/jobs/UnpackMapJob.h
+ src/plotdisk/jobs/IOJob.cpp
+ src/plotdisk/jobs/IOJob.h
+
+ src/plotdisk/k32/DiskPlotBounded.h
+ src/plotdisk/k32/FpMatchBounded.inl
+ src/plotdisk/k32/CTableWriterBounded.h
+ src/plotdisk/k32/DiskPlotBounded.cpp
+ src/plotdisk/k32/F1Bounded.inl
+ src/plotdisk/k32/FxBounded.inl
+
+ src/plotdisk/DiskF1.h
+ src/plotdisk/DiskPlotConfig.h
+ src/plotdisk/DiskPlotContext.h
+ src/plotdisk/DiskPlotPhase2.cpp
+ src/plotdisk/DiskPlotPhase3.cpp
+ src/plotdisk/DiskPlotter.h
+ src/plotdisk/DiskPlotter.cpp
+ src/plotdisk/DiskBufferQueue.cpp
+ src/plotdisk/DiskBufferQueue.h
+ src/plotdisk/BitBucketWriter.h
+
+
+ src/plotmem/DbgHelper.cpp
+ src/plotmem/FxSort.h
+ src/plotmem/MemPhase1.h
+ src/plotmem/MemPhase2.h
+ src/plotmem/MemPhase3.h
+ src/plotmem/MemPlotter.h
+ src/plotmem/ParkWriter.h
+ src/plotmem/DbgHelper.h
+ src/plotmem/LPGen.h
+ src/plotmem/MemPhase1.cpp
+ src/plotmem/MemPhase2.cpp
+ src/plotmem/MemPhase3.cpp
+ src/plotmem/MemPhase4.cpp
+ src/plotmem/MemPhase4.h
+ src/plotmem/MemPlotter.cpp
+
+
+ src/plotting/DTables.h
+ src/plotting/GenSortKey.h
+ src/plotting/PlotValidation.cpp
+ src/plotting/TableWriter.cpp
+ src/plotting/PlotTypes.h
+ src/plotting/TableWriter.h
+ src/plotting/WorkHeap.cpp
+ src/plotting/CTables.h
+ src/plotting/Compression.cpp
+ src/plotting/Compression.h
+ src/plotting/FSETableGenerator.cpp
+ src/plotting/GlobalPlotConfig.h
+ src/plotting/IPlotter.h
+ src/plotting/PlotHeader.h
+ src/plotting/PlotTools.cpp
+ src/plotting/PlotTools.h
+ src/plotting/PlotValidation.h
+ src/plotting/PlotWriter.cpp
+ src/plotting/PlotWriter.h
+ src/plotting/Tables.h
+
+ src/plotting/f1/F1Gen.h
+ src/plotting/f1/F1Gen.cpp
+
+ src/plotting/fx/PlotFx.inl
+
+ src/plotting/matching/GroupScan.cpp
+ src/plotting/matching/GroupScan.h
+ src/plotting/WorkHeap.h
+
+ src/threading/AutoResetSignal.h
+ src/threading/Semaphore.cpp
+ src/threading/Semaphore.h
+ src/threading/Fence.cpp
+ src/threading/Fence.h
+ src/threading/GenJob.h
+ src/threading/MTJob.h
+ src/threading/MonoJob.h
+ src/threading/Thread.h
+ src/threading/ThreadPool.cpp
+ src/threading/ThreadPool.h
+ src/threading/AutoResetSignal.cpp
+
+ # src/tools/FSETableGenerator.cpp
+ src/tools/MemTester.cpp
+ src/tools/IOTester.cpp
+ src/tools/PlotComparer.cpp
+ src/tools/PlotFile.cpp
+ src/tools/PlotReader.cpp
+ src/tools/PlotReader.h
+ src/tools/PlotValidator.cpp
+
+ src/util/Array.h
+ src/util/Array.inl
+ src/util/BitField.h
+ src/util/SPCQueue.h
+ src/util/SPCQueue.inl
+
+ src/util/jobs/MemJobs.h
+ src/util/jobs/SortKeyJob.h
+ src/util/BitView.h
+ src/util/CliParser.cpp
+ src/util/KeyTools.cpp
+ src/util/KeyTools.h
+ src/util/Log.h
+ src/util/CliParser.h
+ src/util/Log.cpp
+ src/util/Span.h
+ src/util/StackAllocator.h
+ src/util/Util.cpp
+ src/util/Util.h
+ src/util/VirtualAllocator.h
+
+ src/commands/Commands.h
+ src/commands/CmdPlotCheck.cpp
+ src/commands/CmdSimulator.cpp
+ src/commands/CmdCheckCUDA.cpp
+
+ src/harvesting/GreenReaper.cpp
+ src/harvesting/GreenReaper.h
+ src/harvesting/GreenReaperInternal.h
+ src/harvesting/Thresher.h
+)
+
+target_sources(bladebit_core PUBLIC ${src_bladebit})
+
+ # Disable blake3 conversion loss of data warnings
+ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
+ set_source_files_properties(
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ src/b3/blake3_sse41.c
+ PROPERTIES COMPILE_FLAGS
+ /wd4244
+ )
+ endif()
diff --git a/BladebitCUDA.cmake b/BladebitCUDA.cmake
new file mode 100644
index 00000000..1fc668fa
--- /dev/null
+++ b/BladebitCUDA.cmake
@@ -0,0 +1,59 @@
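+# bladebit_cuda: the CUDA plotter executable. It links against bladebit_core and adds the CUDA
+# kernels plus the CUDA harvester thresher sources.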
+add_executable(bladebit_cuda
+ src/main.cpp
+
+ cuda/CudaPlotter.cu
+ cuda/CudaPlotter.h
+ cuda/CudaPlotContext.h
+ cuda/CudaPlotPhase2.cu
+ cuda/CudaPlotPhase3.cu
+ cuda/CudaPlotPhase3Step2.cu
+ cuda/CudaPlotPhase3Step3.cu
+ cuda/CudaPlotPhase3Internal.h
+ cuda/CudaParkSerializer.h
+ cuda/CudaParkSerializer.cu
+ cuda/chacha8.cu
+ cuda/CudaF1.h
+ cuda/CudaF1.cu
+ cuda/CudaMatch.h
+ cuda/CudaMatch.cu
+ cuda/CudaFx.h
+ cuda/FxCuda.cu
+ cuda/CudaUtil.h
+ cuda/CudaPlotUtil.cu
+ cuda/GpuStreams.h
+ cuda/GpuStreams.cu
+
+ # Harvester
+ cuda/harvesting/CudaThresher.cu
+ cuda/harvesting/CudaThresherFactory.cu
+)
+
+target_include_directories(bladebit_cuda PRIVATE src cuda SYSTEM cuda)
+
+target_compile_definitions(bladebit_cuda PUBLIC
+ BB_CUDA_ENABLED=1
+ THRUST_IGNORE_CUB_VERSION_CHECK=1
+)
+
+target_compile_options(bladebit_cuda PRIVATE
+ ${cuda_archs}
+
+ $<${is_cuda_release}:
+ >
+
+ $<${is_cuda_debug}:
+ -G
+ >
+ )
+
+target_link_options(bladebit_cuda PRIVATE $)
+
+target_link_libraries(bladebit_cuda PRIVATE bladebit_core CUDA::cudart_static)# CUDA::cuda_driver)
+
+set_target_properties(bladebit_cuda PROPERTIES
+    MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>
+ CUDA_RUNTIME_LIBRARY Static
+ CUDA_SEPARABLE_COMPILATION ON
+ CUDA_RESOLVE_DEVICE_SYMBOLS ON
+ CUDA_ARCHITECTURES OFF
+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8022376..56595d7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -12,10 +12,39 @@ if(NOT CMAKE_BUILD_TYPE)
)
endif()
+# Allows for CMAKE_MSVC_RUNTIME_LIBRARY
+if(POLICY CMP0091)
+ cmake_policy(SET CMP0091 NEW)
+endif()
+
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>" CACHE STRING "MSVC Runtime Library")
-project(bladebit C CXX ASM)
+project(bladebit LANGUAGES C CXX ASM)
+
+# Ensure supported OS and Architecture
+if(NOT( (${CMAKE_SYSTEM_NAME} MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows") ))
+ message( FATAL_ERROR "Unsupported operating system '${CMAKE_SYSTEM_NAME}'" )
+endif()
+
+if(NOT (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "arm64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "AMD64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64"))
+ message( FATAL_ERROR "Unsupported architecture '${CMAKE_HOST_SYSTEM_PROCESSOR}'" )
+endif()
+
+if(NOT CMAKE_CUDA_COMPILER)
+ include(FindCUDAToolkit)
+
+ if(CUDAToolkit_FOUND)
+ message("Found CUDA: true")
+ message("NVCC : ${CUDAToolkit_NVCC_EXECUTABLE}")
+ set(CMAKE_CUDA_COMPILER ${CUDAToolkit_NVCC_EXECUTABLE})
+ endif()
+endif()
+
+if(CMAKE_CUDA_COMPILER)
+ enable_language(CUDA)
+endif()
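+# If no CUDA toolkit is detected, the CUDA language is simply not enabled and configuration continues.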
+
message("Config : ${CMAKE_BUILD_TYPE}")
message("Compiler : ${CMAKE_CXX_COMPILER_ID}")
@@ -30,53 +59,17 @@ set(CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules
)
-
-#
-# Grab Dependencies
-#
-set(platform_libs)
-
-# BLS
-include(FetchContent)
-
-FetchContent_Declare(
- bls
- GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
- GIT_TAG 1.0.10
-)
-
-set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
-set(BUILD_BLS_TESTS "0" CACHE STRING "")
-set(BUILD_BLS_BENCHMARKS "0" CACHE STRING "")
-FetchContent_MakeAvailable(bls)
-
-# Threads
-find_package(Threads REQUIRED)
-
-# NUMA
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
- find_package(NUMA REQUIRED)
- set(platform_libs ${NUMA_LIBRARY})
+# Is this project included as a dependency/FetchContent?
+if(NOT(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR))
+ set(BB_IS_DEPENDENCY ON)
+ set(BB_ENABLE_TESTS OFF)
+ set(BB_ENABLE_EXE OFF)
endif()
-# Catch
-# TODO: Add configuration var to disable this
-include(cmake_modules/FindCatch2.cmake)
-set_target_properties(Catch2 PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-set_target_properties(Catch2WithMain PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-
-
-# Config
-set(c_opts)
-set(link_opts)
-
-set(release_c_opts)
-set(debug_c_opts)
-set(dev_c_opts)
-
-set(release_link_opts)
-set(debug_link_opts)
+#
+# Options
+#
option(BENCHMARK_MODE "Enable benchmark mode for memplot. No final plot is written." OFF)
if(BENCHMARK_MODE)
add_compile_definitions("BB_BENCHMARK_MODE=1")
@@ -87,342 +80,85 @@ if(ENABLE_DISK_METRICS)
add_compile_definitions("BB_IO_METRICS_ON=1")
endif()
-# Embed version inline when in dev mode
-if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
- message("Embedding local build version")
-
- set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
-
- set(cmd_ver bash)
- if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
- set(cmd_ver bash.exe)
- endif()
-
- execute_process(COMMAND ${cmd_ver} extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
- execute_process(COMMAND ${cmd_ver} extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-
- # Remove trailing whitespace incurred in windows gitbash
- string(STRIP "${bb_ver_maj}" bb_ver_maj)
- string(STRIP "${bb_ver_min}" bb_ver_min)
- string(STRIP "${bb_ver_rev}" bb_ver_rev)
- string(STRIP "${bb_ver_suffix}" bb_ver_suffix)
- string(STRIP "${bb_ver_commit}" bb_ver_commit)
-
- set(bb_ver_suffix ${bb_ver_suffix}-dev)
-
- # This is slow on windows, so let's cache them
- set(bb_ver_maj ${bb_ver_maj} CACHE STRING "")
- set(bb_ver_min ${bb_ver_min} CACHE STRING "")
- set(bb_ver_rev ${bb_ver_rev} CACHE STRING "")
- set(bb_ver_suffix ${bb_ver_suffix} CACHE STRING "")
- set(bb_ver_commit ${bb_ver_commit} CACHE STRING "")
-endif()
-
-if(NOT DEFINED ENV{CI})
- add_compile_definitions(BLADEBIT_VERSION_MAJ=${bb_ver_maj})
- add_compile_definitions(BLADEBIT_VERSION_MIN=${bb_ver_min})
- add_compile_definitions(BLADEBIT_VERSION_REV=${bb_ver_rev})
- add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
- add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
-endif()
-
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-
- # MSVC
- set(c_opts
- /std:c++17
- /Zc:__cplusplus
- /MP
- /Zi
- # /EHsc-
- # /Wall
- /W3
- /WX
- /FIpch.h
- /wd4068
- /wd4464
- /wd4668
- /wd4820
- /wd4514
- /wd4626
- /wd5027
- /DUNICODE=1
- /DWIN32_LEAN_AND_MEAN=1
- /DNOMINMAX=1
- /D_CRT_SECURE_NO_WARNINGS=1
- /D_HAS_EXCEPTIONS=0
- ${c_opts})
-
- set(tests_c_opts /DBB_TEST_MODE=1 ${tests_c_opts})
-
- set(link_opts
- /SUBSYSTEM:CONSOLE
- /STACK:33554432,1048576
- ${link_opts})
-
- set(release_c_opts
- /Oi
- /O2
- /Gy
- /GL
- /DNDEBUG=1
- /D_NDEBUG=1
- ${release_c_opts})
-
- set(debug_c_opts
- /Od
- /DDEBUG=1
- /D_DEBUG=1
- ${debug_c_opts})
-
- set(dev_c_opts
- ${dev_c_opts})
-
-
- set(release_link_opts
- /DEBUG:FULL
- /LTCG
- /OPT:REF,ICF,LBR
- ${release_link_opts})
-
- set(debug_link_opts
-# /DEBUG:FASTLINK
- /OPT:NOREF,NOICF,NOLBR
-# /INCREMENTAL
- ${debug_link_opts})
-
- # Dependency config
- target_compile_options(bls PRIVATE /MP)
- target_compile_options(relic_s PRIVATE /MP)
- target_compile_options(sodium PRIVATE /MP)
-
-    target_compile_options(bls PRIVATE $<$<CONFIG:Release>:/MT>)
-    target_compile_options(relic_s PRIVATE $<$<CONFIG:Release>:/MT>)
-    target_compile_options(sodium PRIVATE $<$<CONFIG:Release>:/MT>)
-
-    target_compile_options(bls PRIVATE $<$<CONFIG:Debug>:/MTd>)
-    target_compile_options(relic_s PRIVATE $<$<CONFIG:Debug>:/MTd>)
-    target_compile_options(sodium PRIVATE $<$<CONFIG:Debug>:/MTd>)
-
-else()
-
- # *Nix
- set(c_opts --include=pch.h -Wall -Wno-comment -Wno-unknown-pragmas -g ${c_opts})
-
- set(tests_c_opts -DBB_TEST_MODE=1 ${tests_c_opts})
+# NOTE: These are mostly a sandbox test environment, not proper tests
+option(BB_ENABLE_TESTS "Enable tests." OFF)
+option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
+option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
+option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
+option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)
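+# For example (a typical invocation, not prescribed by this file), a harvester-only
+# build could be configured with `cmake -B build -DBB_HARVESTER_ONLY=ON`, optionally
+# adding `-DNO_CUDA_HARVESTER=ON` to force a CPU-only harvester on a CUDA-capable machine.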
- set(release_c_opts
- -O3 #-flto
- -D_NDEBUG=1
- -DNDEBUG=1
- ${release_c_opts})
-
- set(debug_c_opts
- -O0
- -DDEBUG=1
- -D_DEBUG=1
- ${debug_c_opts})
-
- set(dev_c_opts
- ${dev_c_opts})
-
- set(link_opts -g -rdynamic #-flto
- ${link_opts})
-
- # GCC
- if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-
- set(c_opts -fmax-errors=5 ${c_opts})
-
- # Avoid ranlib error: "plugin needed to handle lto object" (would need CMAKE_AR "gcc-ar")
- # set(c_opts -ffat-lto-objects ${c_opts})
-
- # Build with native architecture when not building release packages
- if(NOT DEFINED ENV{CI})
- set(c_opts -march=native ${c_opts})
- endif()
-
- # Clang
- elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-
- set(c_opts -ferror-limit=5 -fdeclspec -Wno-empty-body ${c_opts})
-
- endif()
-
-endif()
#
-# Sources
+# Dependencies
#
-file(GLOB_RECURSE bb_sources
- RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS
- LIST_DIRECTORIES false
- src/*.cpp
- src/*.c
-)
-set(src_full ${bb_sources})
-
-# Headers
-file(GLOB_RECURSE bb_headers
- RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS
- LIST_DIRECTORIES false
- src/*.h
- src/*.hpp
- src/*.inl
-)
-
-# Ignore some sources
-list(FILTER bb_sources EXCLUDE REGEX "src/main\\.cpp")
-list(FILTER bb_sources EXCLUDE REGEX "src/tools/FSETableGenerator.cpp")
-list(FILTER bb_sources EXCLUDE REGEX "src/sandbox/.+")
-list(FILTER bb_sources EXCLUDE REGEX "src/platform/.+")
-list(FILTER bb_sources EXCLUDE REGEX "src/b3/blake3_(avx|sse).+")
-list(FILTER bb_sources EXCLUDE REGEX "src/uint128_t/.+")
-
+include(FetchContent)
-# Project-specific sources
-file(GLOB_RECURSE src_tests RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- tests/*.cpp
-)
+# Threads
+find_package(Threads REQUIRED)
-file(GLOB_RECURSE src_dev RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/sandbox/*.cpp
- src/sandbox/*.h
-)
+if(NOT ${BB_HARVESTER_ONLY})
+ # BLS
+ FetchContent_Declare(
+ bls
+ GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
+ GIT_TAG 2.0.2
+ EXCLUDE_FROM_ALL ${BB_IS_DEPENDENCY}
+ )
+ set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
+ set(BUILD_BLS_TESTS "0" CACHE STRING "")
+ set(BUILD_BLS_BENCHMARKS "0" CACHE STRING "")
+ FetchContent_MakeAvailable(bls)
-# Configure dependent on config/platform/architecture
-# Architecture
-if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "AMD64")
-
- if(NOT MSVC)
- list(APPEND bb_sources
- src/b3/blake3_avx2_x86-64_unix.S
- src/b3/blake3_avx512_x86-64_unix.S
- src/b3/blake3_sse41_x86-64_unix.S
- )
- else()
- list(APPEND bb_sources
- src/b3/blake3_avx2.c
- src/b3/blake3_avx512.c
- src/b3/blake3_sse41.c
- src/uint128_t/uint128_t.cpp
- )
- # Disable blake3 conversion loss of data warnings
- set_source_files_properties(
- src/b3/blake3_avx2.c
- src/b3/blake3_avx512.c
- src/b3/blake3_sse41.c
- PROPERTIES COMPILE_FLAGS
- /wd4244
- )
+ # NUMA
+ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+ find_package(NUMA REQUIRED)
+ set(platform_libs ${NUMA_LIBRARY})
endif()
+endif() # BB_HARVESTER_ONLY
-elseif(${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "arm64" OR ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "aarch64")
+#
+# Internal Config
+#
+set(is_release $<CONFIG:Release>)
+set(is_debug $<CONFIG:Debug>)
+set(is_c_cpp $<COMPILE_LANGUAGE:CXX,C>)
+set(is_cuda $<COMPILE_LANGUAGE:CUDA>)
+set(is_cuda_release $<AND:${is_cuda},${is_release}>)
+set(is_cuda_debug $<AND:${is_cuda},${is_debug}>)
+set(is_x86 $<OR:$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},x86_64>,$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},AMD64>>)
+set(is_arm $<OR:$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},arm64>,$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},aarch64>>)
+set(is_msvc_c_cpp $<AND:${is_c_cpp},$<CXX_COMPILER_ID:MSVC>>)
+
+if(CUDAToolkit_FOUND AND NOT ${NO_CUDA_HARVESTER})
+ set(have_cuda $<BOOL:TRUE>)
else()
- message( FATAL_ERROR "Unsupported architecture '${CMAKE_HOST_SYSTEM_PROCESSOR}'" )
+ set(have_cuda $<BOOL:FALSE>)
endif()
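+# These generator expressions are consumed by the included target scripts below:
+# e.g. Harvester.cmake wraps its CUDA sources and the BB_CUDA_ENABLED define in
+# $<${have_cuda}:...>, and Config.cmake selects the -gencode list under
+# $<${is_cuda_release}:...>.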
-# OS
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
- file(GLOB_RECURSE src_linux RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/platform/unix/*.cpp
- src/platform/linux/*.cpp
- )
- list(APPEND bb_sources ${src_linux})
-
-elseif(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-
- file(GLOB_RECURSE src_win RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/platform/win32/*.cpp
- )
- list(APPEND bb_sources ${src_win})
+#
+# Targets
+#
+include(Config.cmake)
-elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+if(NOT ${BB_HARVESTER_ONLY})
+ if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
+ include(cmake_modules/EmbedVersion.cmake)
+ endif()
- file(GLOB_RECURSE src_mac RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
- CONFIGURE_DEPENDS LIST_DIRECTORIES false
- src/platform/unix/*.cpp
- src/platform/macos/*.cpp
- )
- list(APPEND bb_sources ${src_mac})
+ include(Bladebit.cmake)
+ set_target_properties(bladebit_core bladebit PROPERTIES EXCLUDE_FROM_ALL $<BOOL:${BB_IS_DEPENDENCY}>)
-else()
- message( FATAL_ERROR "Unsupported operating system '${CMAKE_SYSTEM_NAME}'" )
+ if(CUDAToolkit_FOUND)
+ include(BladebitCUDA.cmake)
+ set_target_properties(bladebit_cuda PROPERTIES EXCLUDE_FROM_ALL $<BOOL:${BB_IS_DEPENDENCY}>)
+ endif()
endif()
+include(Harvester.cmake)
-#
-# Targets
-#
-set(bb_include_dirs
- ${INCLUDE_DIRECTORIES}
- ${CMAKE_CURRENT_SOURCE_DIR}/src
-)
-
-# macro(config_proj tgt)
-# message("Configuring target ${tgt}:${CMAKE_BUILD_TYPE}.")
-# target_compile_options(${tgt} PRIVATE $<$<CONFIG:Release>:${c_opts} ${release_c_opts}>)
-# target_compile_options(${tgt} PRIVATE $<$<CONFIG:Debug>:${c_opts} ${debug_c_opts}>)
-# target_link_options(${tgt} PRIVATE $<$<CONFIG:Release>:${link_opts} ${release_link_opts}>)
-# target_link_options(${tgt} PRIVATE $<$<CONFIG:Debug>:${link_opts} ${debug_link_opts}>)
-
-# target_include_directories(${tgt} PRIVATE ${bb_include_dirs})
-# endmacro()
-
-# BladeBit
-add_library(lib_bladebit ${bb_sources} ${bb_headers} src/plotdisk/k32/FpMatchBounded.inl src/plotdisk/k32/F1Bounded.inl)
-
-set_target_properties(lib_bladebit PROPERTIES
- OUTPUT_NAME bladebit
- MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>"
-)
-target_link_libraries(lib_bladebit PUBLIC Threads::Threads bls ${platform_libs})
-target_include_directories(lib_bladebit PUBLIC ${bb_include_dirs})
-
-target_compile_options(lib_bladebit PUBLIC $<$<CONFIG:Release>:${c_opts} ${release_c_opts}>)
-target_compile_options(lib_bladebit PUBLIC $<$<CONFIG:Debug>:${c_opts} ${debug_c_opts}>)
-target_link_options(lib_bladebit PUBLIC $<$<CONFIG:Release>:${link_opts} ${release_link_opts}>)
-target_link_options(lib_bladebit PUBLIC $<$<CONFIG:Debug>:${link_opts} ${debug_link_opts}>)
-
-add_executable(bladebit ${bb_headers} src/main.cpp src/plotdisk/k32/FxBounded.inl)
-target_link_libraries(bladebit PRIVATE lib_bladebit)
-
-add_executable(bladebit_dev EXCLUDE_FROM_ALL src/sandbox/sandbox_main.cpp ${src_dev} ${bb_headers})
-target_link_libraries(bladebit_dev PRIVATE lib_bladebit)
-
-# Tools
-add_executable(fsegen src/tools/FSETableGenerator.cpp ${bb_sources} ${bb_headers})
-target_link_libraries(fsegen PRIVATE lib_bladebit)
-
-# add_executable(plot_tool
-# src/tools/PlotTools_Main.cpp
-# src/tools/PlotReader.cpp
-# src/tools/PlotValidator.cpp
-# src/tools/PlotComparer.cpp
-# ${bb_headers}
-# )
-# target_link_libraries(plot_tool PRIVATE lib_bladebit)
-
-# Tests
-add_executable(tests ${src_tests} ${bb_headers})
-target_compile_options(tests PUBLIC $<$<CONFIG:Release>:${c_opts} ${release_c_opts} ${tests_c_opts}>)
-target_compile_options(tests PUBLIC $<$<CONFIG:Debug>:${c_opts} ${debug_c_opts} ${tests_c_opts}>)
-set_target_properties(tests PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
-target_link_libraries(tests PRIVATE lib_bladebit Catch2::Catch2WithMain)
-
-# Pretty source view for IDE projects
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/src
- FILES ${src_full} ${bb_headers}
-)
+if(${BB_ENABLE_TESTS} AND NOT ${BB_HARVESTER_ONLY})
+ include(Tests.cmake)
+endif()
diff --git a/CMakeSettings.json b/CMakeSettings.json
index fb51b4bc..5ef52577 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -9,6 +9,17 @@
"installRoot": "${projectDir}\\out\\install\\${name}",
"ctestCommandArgs": ""
},
+
+ {
+ "name": "x64-Release",
+ "generator": "Ninja",
+ "configurationType": "Debug",
+ "inheritEnvironments": [ "msvc_x64_x64" ],
+ "buildRoot": "${projectDir}\\out\\build-release\\${name}",
+ "installRoot": "${projectDir}\\out\\cmake-install-release\\${name}",
+ "ctestCommandArgs": ""
+ },
+
{
"name": "MemTest",
"generator": "Ninja",
diff --git a/Config.cmake b/Config.cmake
new file mode 100644
index 00000000..4139b4a9
--- /dev/null
+++ b/Config.cmake
@@ -0,0 +1,175 @@
+# Base interface configuration project
+add_library(bladebit_config INTERFACE)
+
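+# Targets opt into this configuration by linking the interface library,
+# e.g. `target_link_libraries(bladebit_harvester PRIVATE bladebit_config)`
+# as done in Harvester.cmake.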
+target_compile_definitions(bladebit_config INTERFACE
+ $<${is_release}:
+ _NDEBUG=1
+ NDEBUG=1
+ >
+ $<${is_debug}:
+ _DEBUG=1
+ DEBUG=1
+ >
+
+ $<$<CXX_COMPILER_ID:MSVC>:
+ UNICODE=1
+ WIN32_LEAN_AND_MEAN=1
+ NOMINMAX=1
+ _CRT_SECURE_NO_WARNINGS=1
+ _HAS_EXCEPTIONS=0
+ >
+)
+
+target_compile_options(bladebit_config INTERFACE
+
+ # GCC or Clang
+ $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
+ -Wall
+ -Wno-comment
+ -Wno-unknown-pragmas
+ -g
+
+ $<${is_release}:
+ -O3
+ >
+
+ $<${is_debug}:
+ -O0
+ >
+ >
+
+ # GCC
+ $<$<CXX_COMPILER_ID:GNU>:
+ -fmax-errors=5
+ >
+
+ # Clang
+ $<$<CXX_COMPILER_ID:Clang,AppleClang>:
+ -ferror-limit=5
+ -fdeclspec
+ -Wno-empty-body
+ >
+
+ # MSVC
+ $<${is_msvc_c_cpp}:
+ /Zc:__cplusplus
+ /MP
+ /Zi
+ # /EHsc-
+ # /Wall
+ /W3
+ /WX
+ /wd4068
+ /wd4464
+ /wd4668
+ /wd4820
+ /wd4514
+ /wd4626
+ /wd5027
+
+ $<${is_release}:
+ /Oi /O2 /Gy /GL
+ >
+
+ $<${is_debug}:
+ /Od
+ >
+ >
+
+ $<${is_x86}:
+ >
+
+ $<${is_arm}:
+ >
+)
+
+target_link_options(bladebit_config INTERFACE
+
+ # GCC or Clang
+ $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
+ -g
+ -rdynamic
+ >
+
+ # MSVC
+ $<${is_msvc_c_cpp}:
+
+ /SUBSYSTEM:CONSOLE
+ /STACK:33554432,1048576
+
+ $<${is_release}:
+ /DEBUG:FULL
+ /LTCG
+ /OPT:REF,ICF,LBR
+ >
+
+ $<${is_debug}:
+ # /DEBUG:FASTLINK
+ # /OPT:NOREF,NOICF,NOLBR
+ # /INCREMENTAL
+ >
+ >
+)
+
+set_property(DIRECTORY . PROPERTY MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>)
+set_property(DIRECTORY . PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+set_property(DIRECTORY . PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+set(preinclude_pch
+ $<${is_cuda}:--pre-include pch.h>
+ $<${is_c_cpp}:
+ $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:--include=pch.h>
+ >
+ $<${is_msvc_c_cpp}:/FIpch.h>
+)
+
+# See: https://gitlab.kitware.com/cmake/cmake/-/issues/18265
+cmake_policy(SET CMP0105 NEW)
+
+set(cuda_archs
+
+ $<${is_cuda_release}:
+## Maxwell
+ ## Tesla/Quadro M series
+ -gencode=arch=compute_50,code=sm_50
+ ## Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
+ -gencode=arch=compute_52,code=sm_52
+ ## Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
+ -gencode=arch=compute_53,code=sm_53
+## Pascal
+ ## GeForce 1000 series
+ -gencode=arch=compute_60,code=sm_60
+ ## GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
+ -gencode=arch=compute_61,code=sm_61
+ ## Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
+ -gencode=arch=compute_62,code=sm_62
+## Volta
+ ## GV100, Tesla V100, Titan V
+ -gencode=arch=compute_70,code=sm_70
+ ## Tesla V100
+ -gencode=arch=compute_72,code=sm_72
+ ## Turing
+ -gencode=arch=compute_75,code=sm_75
+## Ampere
+ ## NVIDIA A100, DGX-A100
+ -gencode=arch=compute_80,code=sm_80
+ ## GeForce RTX 3000 series, NVIDIA A100
+ -gencode=arch=compute_86,code=sm_86
+ ## Jetson Orin
+ -gencode=arch=compute_87,code=sm_87
+## Lovelace
+ ## NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
+ -gencode=arch=compute_89,code=sm_89
+ ## Future proofing
+ -gencode=arch=compute_89,code=compute_89
+## Hopper
+ ## NVIDIA H100 (GH100)
+ # -gencode=arch=compute_90,code=sm_90
+ # -gencode=arch=compute_90a,code=sm_90a
+ >
+
+ $<${is_cuda_debug}:
+ -arch=native
+ # -gencode=arch=compute_52,code=sm_52 # Maxwell
+ >
+)
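+
+# NOTE: ${cuda_archs} is passed to the CUDA targets as compile options (see
+# target_compile_options in Harvester.cmake); debug builds simply compile for
+# the native architecture to keep build times down.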
diff --git a/Harvester.cmake b/Harvester.cmake
new file mode 100644
index 00000000..d853a2db
--- /dev/null
+++ b/Harvester.cmake
@@ -0,0 +1,176 @@
+if(NOT ${BB_HARVESTER_STATIC})
+ add_library(bladebit_harvester SHARED)
+else()
+ add_library(bladebit_harvester STATIC)
+endif()
+
+
+set_property(TARGET bladebit_harvester PROPERTY PUBLIC_HEADER
+ src/harvesting/GreenReaper.h
+ src/harvesting/GreenReaperPortable.h)
+
+install(TARGETS bladebit_harvester
+ LIBRARY DESTINATION green_reaper/lib
+ ARCHIVE DESTINATION green_reaper/lib
+ PUBLIC_HEADER DESTINATION green_reaper/include
+)
+
+target_sources(bladebit_harvester PRIVATE
+ src/pch.cpp
+
+ src/pos/chacha8.cpp
+ src/pos/chacha8.h
+
+ src/fse/bitstream.h
+ src/fse/compiler.h
+ src/fse/debug.c
+ src/fse/debug.h
+ src/fse/entropy_common.c
+ src/fse/error_private.h
+ src/fse/error_public.h
+ src/fse/fse_compress.c
+ src/fse/fse_decompress.c
+ src/fse/fse.h
+ src/fse/hist.c
+ src/fse/hist.h
+ src/fse/huf.h
+ src/fse/mem.h
+
+ src/b3/blake3.c
+ src/b3/blake3_dispatch.c
+ src/b3/blake3.h
+ src/b3/blake3_impl.h
+ src/b3/blake3_portable.c
+
+ $<${is_x86}:
+ $<$<CXX_COMPILER_ID:MSVC>:
+ src/b3/blake3_sse41.c
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ >
+ $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:
+ src/b3/blake3_avx2_x86-64_unix.S
+ src/b3/blake3_avx512_x86-64_unix.S
+ src/b3/blake3_sse41_x86-64_unix.S
+ >
+ >
+
+
+ src/util/Log.cpp
+ src/util/Util.cpp
+ src/PlotContext.cpp
+ src/io/HybridStream.cpp
+ src/threading/AutoResetSignal.cpp
+ src/threading/Fence.cpp
+ src/threading/Semaphore.cpp
+ src/threading/ThreadPool.cpp
+ src/plotting/FSETableGenerator.cpp
+ src/plotting/PlotWriter.cpp
+ src/plotting/Compression.cpp
+ src/plotting/matching/GroupScan.cpp
+ src/plotdisk/DiskBufferQueue.cpp
+ src/plotting/WorkHeap.cpp
+ src/plotdisk/jobs/IOJob.cpp
+ src/harvesting/GreenReaper.cpp
+
+ src/bech32/segwit_addr.c
+
+ $<${have_cuda}:
+ cuda/harvesting/CudaThresher.cu
+ cuda/harvesting/CudaThresherFactory.cu
+ cuda/FxCuda.cu
+ cuda/CudaF1.cu
+ cuda/CudaMatch.cu
+ cuda/CudaPlotUtil.cu
+
+ # TODO: Remove this, ought not be needed in harvester
+ cuda/GpuStreams.cu
+ >
+
+ $<$<NOT:${have_cuda}>:
+ cuda/harvesting/CudaThresherDummy.cpp
+ >
+
+ $<$<PLATFORM_ID:Windows>:
+ src/platform/win32/SysHost_Win32.cpp
+ src/platform/win32/FileStream_Win32.cpp
+ src/platform/win32/Thread_Win32.cpp
+ >
+
+ $<$<PLATFORM_ID:Linux>:
+ src/platform/linux/SysHost_Linux.cpp
+ >
+
+ $<$<PLATFORM_ID:Darwin>:
+ src/platform/macos/SysHost_Macos.cpp
+ >
+
+ $<$<PLATFORM_ID:Linux,Darwin>:
+ src/platform/unix/FileStream_Unix.cpp
+ src/platform/unix/Thread_Unix.cpp
+ >
+
+ $<$<CXX_COMPILER_ID:MSVC>:
+ src/uint128_t/uint128_t.cpp
+ >
+)
+
+target_include_directories(bladebit_harvester PRIVATE src SYSTEM cuda INTERFACE src/harvesting)
+
+target_compile_features(bladebit_harvester PUBLIC cxx_std_17)
+
+target_compile_definitions(bladebit_harvester
+ PRIVATE
+ THRUST_IGNORE_CUB_VERSION_CHECK=1
+ GR_EXPORT=1
+
+ $<${have_cuda}:
+ BB_CUDA_ENABLED=1
+ >
+
+ PUBLIC
+ BB_IS_HARVESTER=1
+ INTERFACE
+ $<$<BOOL:${BB_HARVESTER_STATIC}>:GR_NO_IMPORT=1>
+)
+
+
+target_compile_options(bladebit_harvester PRIVATE
+ ${preinclude_pch}
+ ${cuda_archs}
+)
+
+if(have_cuda)
+ target_link_options(bladebit_harvester PUBLIC $<DEVICE_LINK:${cuda_archs}>)
+endif()
+
+target_link_libraries(bladebit_harvester
+ PRIVATE
+ bladebit_config
+ PUBLIC
+ Threads::Threads
+ $<${have_cuda}:CUDA::cudart_static>
+)
+
+if(CUDAToolkit_FOUND)
+ set_target_properties(bladebit_harvester PROPERTIES
+ EXCLUDE_FROM_ALL ON
+ MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>
+ CUDA_RUNTIME_LIBRARY Static
+ CUDA_SEPARABLE_COMPILATION ON
+ CUDA_RESOLVE_DEVICE_SYMBOLS ON
+ # CUDA_ARCHITECTURES OFF
+ )
+endif()
+
+ # Disable blake3 conversion loss of data warnings
+ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC")
+ set_source_files_properties(
+ src/b3/blake3_avx2.c
+ src/b3/blake3_avx512.c
+ src/b3/blake3_sse41.c
+ PROPERTIES COMPILE_FLAGS
+ /wd4244
+ )
+ endif()
+
diff --git a/README.md b/README.md
index a913c55f..9197014e 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,22 @@ A high-performance **k32-only**, Chia (XCH) plotter supporting in-RAM and disk-b
## Requirements
+## **GPU (CUDA) Plotter Requirements**
+
+
+**Supported system configurations for alpha:**
+
+| Component  | Requirement
+|------------|-------------------------------------------------------------------------------
+| **OS**     | Windows and Linux
+| **Memory** | **256GB** of system DRAM
+| **GPUs**   | NVIDIA GPUs with CUDA capability **5.2** and up with at least **8GB** of vRAM
+
+> See https://developer.nvidia.com/cuda-gpus for compatible GPUs.
+
+
+
### In-RAM
**416 GiB of RAM are required** to run it, and a few more megabytes for stack space and small allocations.
diff --git a/Tests.cmake b/Tests.cmake
new file mode 100644
index 00000000..577e541c
--- /dev/null
+++ b/Tests.cmake
@@ -0,0 +1,11 @@
+include(cmake_modules/FindCatch2.cmake)
+
+add_executable(tests ${src_bladebit})
+target_compile_definitions(tests PRIVATE
+ BB_TEST_MODE=1
+)
+target_link_libraries(tests PRIVATE bladebit_config Catch2::Catch2WithMain)
+
+set_target_properties(tests PROPERTIES
+ EXCLUDE_FROM_ALL ON
+)
\ No newline at end of file
diff --git a/VERSION b/VERSION
index 38f77a65..4a36342f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.1
+3.0.0
diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
new file mode 100644
index 00000000..6ec042c0
--- /dev/null
+++ b/cmake_modules/EmbedVersion.cmake
@@ -0,0 +1,41 @@
+
+if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
+ message("Embedding local build version")
+
+ set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
+
+ set(cmd_ver bash)
+ if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+ set(cmd_ver bash.exe)
+ endif()
+
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+ execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+
+ # Remove trailing whitespace incurred in windows gitbash
+ string(STRIP "${bb_ver_maj}" bb_ver_maj)
+ string(STRIP "${bb_ver_min}" bb_ver_min)
+ string(STRIP "${bb_ver_rev}" bb_ver_rev)
+ string(STRIP "${bb_ver_suffix}" bb_ver_suffix)
+ string(STRIP "${bb_ver_commit}" bb_ver_commit)
+
+ set(bb_ver_suffix ${bb_ver_suffix}-dev)
+
+ # This is slow on windows, so let's cache them
+ set(bb_ver_maj ${bb_ver_maj} CACHE STRING "")
+ set(bb_ver_min ${bb_ver_min} CACHE STRING "")
+ set(bb_ver_rev ${bb_ver_rev} CACHE STRING "")
+ set(bb_ver_suffix ${bb_ver_suffix} CACHE STRING "")
+ set(bb_ver_commit ${bb_ver_commit} CACHE STRING "")
+endif()
+
+if(NOT DEFINED ENV{CI})
+ add_compile_definitions(BLADEBIT_VERSION_MAJ=${bb_ver_maj})
+ add_compile_definitions(BLADEBIT_VERSION_MIN=${bb_ver_min})
+ add_compile_definitions(BLADEBIT_VERSION_REV=${bb_ver_rev})
+ add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
+ add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
+endif()
diff --git a/cmake_modules/FindCatch2.cmake b/cmake_modules/FindCatch2.cmake
index ddced5a8..c3623a9f 100644
--- a/cmake_modules/FindCatch2.cmake
+++ b/cmake_modules/FindCatch2.cmake
@@ -3,7 +3,7 @@ Include(FetchContent)
FetchContent_Declare(
Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
- GIT_TAG v3.0.0-preview4
+ GIT_TAG v3.3.2
)
FetchContent_MakeAvailable(Catch2)
diff --git a/cuda/CudaF1.cu b/cuda/CudaF1.cu
new file mode 100644
index 00000000..5f51e8f5
--- /dev/null
+++ b/cuda/CudaF1.cu
@@ -0,0 +1,175 @@
+#include "CudaF1.h"
+#include "CudaUtil.h"
+#include "ChiaConsts.h"
+
+/// #NOTE: Code duplicated from chacha8.cu for now.
+/// #TODO: Refactor and consolidate
+
+
+#define U32TO32_LITTLE(v) CuBSwap32(v)
+#define U8TO32_LITTLE(p) (*(const uint32_t *)(p))
+#define U32TO8_LITTLE(p, v) (((uint32_t *)(p))[0] = U32TO32_LITTLE(v))
+#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+
+#define ROTATE(v, c) (ROTL32(v, c))
+#define XOR(v, w) ((v) ^ (w))
+#define PLUS(v, w) ((v) + (w))
+#define PLUSONE(v) (PLUS((v), 1))
+
+#define QUARTERROUND(a, b, c, d) \
+ a = PLUS(a, b); \
+ d = ROTATE(XOR(d, a), 16); \
+ c = PLUS(c, d); \
+ b = ROTATE(XOR(b, c), 12); \
+ a = PLUS(a, b); \
+ d = ROTATE(XOR(d, a), 8); \
+ c = PLUS(c, d); \
+ b = ROTATE(XOR(b, c), 7)
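+
+// NOTE: QUARTERROUND is the standard ChaCha quarter-round. Each iteration of the
+// main loop below applies 4 column and 4 diagonal quarter-rounds (one double round),
+// and the loop runs 4 times, i.e. the 8 rounds of ChaCha8.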
+
+
+// 128 threads per cuda block, each thread will do one chacha block
+#define CHACHA_BLOCKS_PER_CUDA_BLOCK 128ull
+
+//-----------------------------------------------------------
+__global__ void chacha8_get_keystream_cuda_k32(
+ const CudaPlotInfo info,
+ const uint32_t* input,
+ const uint64_t chachaBlockBase,
+ uint64* outY,
+ uint32* outX )
+{
+ extern __shared__ uint32 sharedBucketCounts[];
+
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ const uint64_t chachaBlock = chachaBlockBase + blockIdx.x * CHACHA_BLOCKS_PER_CUDA_BLOCK + id;
+
+
+ uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
+
+ j0 = input[0];
+ j1 = input[1];
+ j2 = input[2];
+ j3 = input[3];
+ j4 = input[4];
+ j5 = input[5];
+ j6 = input[6];
+ j7 = input[7];
+ j8 = input[8];
+ j9 = input[9];
+ j10 = input[10];
+ j11 = input[11];
+ j12 = (uint32_t)chachaBlock;
+ j13 = (uint32_t)(chachaBlock >> 32);
+ j14 = input[14];
+ j15 = input[15];
+
+ // #TODO: Dispatch a different kernel to set the x's
+ x0 = j0;
+ x1 = j1;
+ x2 = j2;
+ x3 = j3;
+ x4 = j4;
+ x5 = j5;
+ x6 = j6;
+ x7 = j7;
+ x8 = j8;
+ x9 = j9;
+ x10 = j10;
+ x11 = j11;
+ x12 = j12;
+ x13 = j13;
+ x14 = j14;
+ x15 = j15;
+
+ #pragma unroll
+ for( int i = 8; i > 0; i -= 2 )
+ {
+ QUARTERROUND( x0, x4, x8 , x12 );
+ QUARTERROUND( x1, x5, x9 , x13 );
+ QUARTERROUND( x2, x6, x10, x14 );
+ QUARTERROUND( x3, x7, x11, x15 );
+ QUARTERROUND( x0, x5, x10, x15 );
+ QUARTERROUND( x1, x6, x11, x12 );
+ QUARTERROUND( x2, x7, x8 , x13 );
+ QUARTERROUND( x3, x4, x9 , x14 );
+ }
+
+ const uint32 x = (uint32)(chachaBlock * 16); // X start offset
+ const uint32 out = gid * (kF1BlockSize / sizeof(uint32));
+
+ const uint32 xo0 = x + 0 ;
+ const uint32 xo1 = x + 1 ;
+ const uint32 xo2 = x + 2 ;
+ const uint32 xo3 = x + 3 ;
+ const uint32 xo4 = x + 4 ;
+ const uint32 xo5 = x + 5 ;
+ const uint32 xo6 = x + 6 ;
+ const uint32 xo7 = x + 7 ;
+ const uint32 xo8 = x + 8 ;
+ const uint32 xo9 = x + 9 ;
+ const uint32 xo10 = x + 10;
+ const uint32 xo11 = x + 11;
+ const uint32 xo12 = x + 12;
+ const uint32 xo13 = x + 13;
+ const uint32 xo14 = x + 14;
+ const uint32 xo15 = x + 15;
+
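+ // Each y below is the byte-swapped 32-bit keystream word, widened to 64 bits and
+ // shifted left by kExtraBits, with the top kExtraBits bits of its x appended,
+ // giving a (k + kExtraBits)-bit F1 output; the raw x values are stored separately.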
+ outY[out+0 ] = (((uint64)CuBSwap32( PLUS( x0 , j0 ) )) << kExtraBits) | (xo0 >> (info.k - kExtraBits));
+ outY[out+1 ] = (((uint64)CuBSwap32( PLUS( x1 , j1 ) )) << kExtraBits) | (xo1 >> (info.k - kExtraBits));
+ outY[out+2 ] = (((uint64)CuBSwap32( PLUS( x2 , j2 ) )) << kExtraBits) | (xo2 >> (info.k - kExtraBits));
+ outY[out+3 ] = (((uint64)CuBSwap32( PLUS( x3 , j3 ) )) << kExtraBits) | (xo3 >> (info.k - kExtraBits));
+ outY[out+4 ] = (((uint64)CuBSwap32( PLUS( x4 , j4 ) )) << kExtraBits) | (xo4 >> (info.k - kExtraBits));
+ outY[out+5 ] = (((uint64)CuBSwap32( PLUS( x5 , j5 ) )) << kExtraBits) | (xo5 >> (info.k - kExtraBits));
+ outY[out+6 ] = (((uint64)CuBSwap32( PLUS( x6 , j6 ) )) << kExtraBits) | (xo6 >> (info.k - kExtraBits));
+ outY[out+7 ] = (((uint64)CuBSwap32( PLUS( x7 , j7 ) )) << kExtraBits) | (xo7 >> (info.k - kExtraBits));
+ outY[out+8 ] = (((uint64)CuBSwap32( PLUS( x8 , j8 ) )) << kExtraBits) | (xo8 >> (info.k - kExtraBits));
+ outY[out+9 ] = (((uint64)CuBSwap32( PLUS( x9 , j9 ) )) << kExtraBits) | (xo9 >> (info.k - kExtraBits));
+ outY[out+10] = (((uint64)CuBSwap32( PLUS( x10, j10 ) )) << kExtraBits) | (xo10 >> (info.k - kExtraBits));
+ outY[out+11] = (((uint64)CuBSwap32( PLUS( x11, j11 ) )) << kExtraBits) | (xo11 >> (info.k - kExtraBits));
+ outY[out+12] = (((uint64)CuBSwap32( PLUS( x12, j12 ) )) << kExtraBits) | (xo12 >> (info.k - kExtraBits));
+ outY[out+13] = (((uint64)CuBSwap32( PLUS( x13, j13 ) )) << kExtraBits) | (xo13 >> (info.k - kExtraBits));
+ outY[out+14] = (((uint64)CuBSwap32( PLUS( x14, j14 ) )) << kExtraBits) | (xo14 >> (info.k - kExtraBits));
+ outY[out+15] = (((uint64)CuBSwap32( PLUS( x15, j15 ) )) << kExtraBits) | (xo15 >> (info.k - kExtraBits));
+
+ outX[out+0 ] = xo0 ;
+ outX[out+1 ] = xo1 ;
+ outX[out+2 ] = xo2 ;
+ outX[out+3 ] = xo3 ;
+ outX[out+4 ] = xo4 ;
+ outX[out+5 ] = xo5 ;
+ outX[out+6 ] = xo6 ;
+ outX[out+7 ] = xo7 ;
+ outX[out+8 ] = xo8 ;
+ outX[out+9 ] = xo9 ;
+ outX[out+10] = xo10;
+ outX[out+11] = xo11;
+ outX[out+12] = xo12;
+ outX[out+13] = xo13;
+ outX[out+14] = xo14;
+ outX[out+15] = xo15;
+}
+
+void CudaGenF1K32(
+ const CudaPlotInfo& info,
+ const uint32* devChaChhaInput,
+ const uint64 chachaBlockBase,
+ const uint32 chachaBlockCount,
+ uint64* devOutY,
+ uint32* devOutX,
+ cudaStream_t stream )
+{
+ const uint32 cuThreads = CHACHA_BLOCKS_PER_CUDA_BLOCK;
+ const uint32 cuBlocks = CDiv( chachaBlockCount, cuThreads );
+
+ chacha8_get_keystream_cuda_k32<<<cuBlocks, cuThreads, 0, stream>>>(
+ info,
+ devChaChhaInput,
+ chachaBlockBase,
+ devOutY,
+ devOutX
+ );
+}
+
diff --git a/cuda/CudaF1.h b/cuda/CudaF1.h
new file mode 100644
index 00000000..811f487a
--- /dev/null
+++ b/cuda/CudaF1.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+struct CudaPlotInfo;
+
+void CudaGenF1K32(
+ const CudaPlotInfo& info,
+ const uint32* devChaChhaInput,
+ const uint64 chachaBlockBase,
+ const uint32 chachaBlockCount,
+ uint64* devOutY,
+ uint32* devOutX,
+ cudaStream_t stream );
diff --git a/cuda/CudaFSE.cuh b/cuda/CudaFSE.cuh
new file mode 100644
index 00000000..3b263e98
--- /dev/null
+++ b/cuda/CudaFSE.cuh
@@ -0,0 +1,204 @@
+#pragma once
+
+#define FSE_STATIC_LINKING_ONLY 1
+#include "fse/fse.h"
+#include "fse/bitstream.h"
+#undef FSE_STATIC_LINKING_ONLY
+
+#include "CudaPlotContext.h"
+
+#ifdef _WIN32
+__pragma( pack( push, 1 ) )
+typedef struct { U16 v; } unalign16;
+typedef struct { U32 v; } unalign32;
+typedef struct { U64 v; } unalign64;
+typedef struct { size_t v; } unalignArch;
+__pragma( pack( pop ) )
+#endif
+
+__constant__ unsigned CUDA_FSE_BIT_mask[32];
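+// NOTE: Populated from the host by InitFSEBitMask() (see CudaParkSerializer.cu)
+// before any compression kernel runs; it mirrors FSE's BIT_mask table.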
+
+#define CU_FSE_PREFIX(name) FSE_error_##name
+#define CU_FSE_ERROR(name) ((size_t)-CU_FSE_PREFIX(name))
+
+__device__ __forceinline__ unsigned CUDA_ERR_isError(size_t code) { return (code > CU_FSE_ERROR(maxCode)); }
+__device__ __forceinline__ unsigned CUDA_FSE_isError(size_t code) { return CUDA_ERR_isError(code); }
+
+
+__device__ __forceinline__ U16 CUDA_MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
+
+__device__ __forceinline__ void CUDA_MEM_writeLEST(void* memPtr, size_t val) { ((unalign64*)memPtr)->v = (U64)val; }
+
+__device__ __forceinline__ void CUDA_BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+ CUDA_ASSERT(BIT_MASK_SIZE == 32);
+ CUDA_ASSERT(nbBits < BIT_MASK_SIZE);
+ CUDA_ASSERT(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+
+ bitC->bitContainer |= (value & CUDA_FSE_BIT_mask[nbBits]) << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+__device__ __forceinline__ void CUDA_BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+ CUDA_ASSERT((value>>nbBits) == 0);
+ CUDA_ASSERT(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ bitC->bitContainer |= value << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+__device__ __forceinline__ void CUDA_BIT_flushBits(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ CUDA_ASSERT(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ CUDA_MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+__device__ __forceinline__ void CUDA_BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ CUDA_ASSERT(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ CUDA_MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ CUDA_ASSERT(bitC->ptr <= bitC->endPtr);
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+__device__ __forceinline__ size_t CUDA_BIT_closeCStream(BIT_CStream_t* bitC)
+{
+ CUDA_BIT_addBitsFast(bitC, 1, 1); /* endMark */
+ CUDA_BIT_flushBits(bitC);
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+__device__ __forceinline__ size_t CUDA_BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t dstCapacity)
+{
+ bitC->bitContainer = 0;
+ bitC->bitPos = 0;
+ bitC->startPtr = (char*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+ if (dstCapacity <= sizeof(bitC->bitContainer)) return CU_FSE_ERROR(dstSize_tooSmall);
+ return 0;
+}
+
+__device__ __forceinline__ void CUDA_FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+ const void* ptr = ct;
+ const U16* u16ptr = (const U16*) ptr;
+ const U32 tableLog = CUDA_MEM_read16(ptr);
+ statePtr->value = (ptrdiff_t)1<<tableLog;
+ statePtr->stateTable = u16ptr+2;
+ statePtr->symbolTT = ((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
+ statePtr->stateLog = tableLog;
+}
+
+__device__ __forceinline__ void CUDA_FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+ CUDA_FSE_initCState(statePtr, ct);
+ { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* stateTable = (const U16*)(statePtr->stateTable);
+ U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+ statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+ statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+ }
+}
+
+__device__ __forceinline__ void CUDA_FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{
+ FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* const stateTable = (const U16*)(statePtr->stateTable);
+ U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+ CUDA_BIT_addBits(bitC, statePtr->value, nbBitsOut);
+ statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+__device__ __forceinline__ void CUDA_FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+ CUDA_BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+ CUDA_BIT_flushBits(bitC);
+}
+
+template<int EntryCount>
+__device__ size_t CUDA_FSE_compress_usingCTable(
+ void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const FSE_CTable* ct )
+{
+ const byte* const istart = (const byte*) src;
+ const byte* const iend = istart + srcSize;
+ const byte* ip=iend;
+
+ BIT_CStream_t bitC;
+ FSE_CState_t CState1, CState2;
+
+ /* init */
+ CUDA_ASSERT( srcSize > 2 );
+ CUDA_ASSERT( srcSize == (size_t)EntryCount );
+ CUDA_ASSERT( (uintptr_t)(ip - istart) == (uintptr_t)EntryCount );
+
+ // if (srcSize <= 2) return 0;
+ {
+ size_t const initError = CUDA_BIT_initCStream(&bitC, dst, dstSize);
+ CUDA_ASSERT( !CUDA_FSE_isError(initError) );
+
+ #if _DEBUG
+ // if (FSE_isError(initError))
+ // return 0; /* not enough space available to write a bitstream */
+ #endif
+ }
+
+ #define FSE_FLUSHBITS(s) CUDA_BIT_flushBitsFast(s)
+
+ // if (srcSize & 1)
+ {
+ CUDA_FSE_initCState2(&CState1, ct, *--ip);
+ CUDA_FSE_initCState2(&CState2, ct, *--ip);
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+ // else {
+ // CUDA_FSE_initCState2(&CState2, ct, *--ip);
+ // CUDA_FSE_initCState2(&CState1, ct, *--ip);
+ // }
+
+ /* join to mod 4 */
+ srcSize -= 2;
+ if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) { /* test bit 2 */
+ CUDA_FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ /* 2 or 4 encoding per loop */
+ // while ( ip>istart )
+ #pragma unroll
+ for( int32 i = 0; i < EntryCount / 4; i ++ )
+ {
+ CUDA_FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+ // if constexpr (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */
+ // FSE_FLUSHBITS(&bitC);
+
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+ // if constexpr (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) { /* this test must be static */
+ CUDA_FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ CUDA_FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ // }
+
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ CUDA_FSE_flushCState(&bitC, &CState2);
+ CUDA_FSE_flushCState(&bitC, &CState1);
+
+ #undef FSE_FLUSHBITS
+ return CUDA_BIT_closeCStream(&bitC);
+}
diff --git a/cuda/CudaFx.h b/cuda/CudaFx.h
new file mode 100644
index 00000000..66d282cb
--- /dev/null
+++ b/cuda/CudaFx.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <cuda_runtime.h>
+#include "plotting/Tables.h"
+
+struct Pair;
+void CudaFxHarvestK32(
+ TableId table,
+ uint64* devYOut,
+ void* devMetaOut,
+ uint32 matchCount,
+ const Pair* devPairsIn,
+ const uint64* devYIn,
+ const void* devMetaIn,
+ cudaStream_t stream );
\ No newline at end of file
diff --git a/cuda/CudaMatch.cu b/cuda/CudaMatch.cu
new file mode 100644
index 00000000..e827547f
--- /dev/null
+++ b/cuda/CudaMatch.cu
@@ -0,0 +1,669 @@
+#include "CudaPlotContext.h"
+#include "ChiaConsts.h"
+#include "CudaMatch.h"
+
+#define CU_MATCH_THREAD_COUNT (kExtraBitsPow)
+
+#define BBCU_SCAN_GROUP_THREADS 128
+#define BBCU_THREADS_PER_MATCH_GROUP 352
+static constexpr uint32 BBCU_MAX_ENTRIES_PER_GROUP = 238;
+static constexpr uint32 BBCU_MIN_ENTRIES_PER_GROUP = 230;
+static constexpr uint32 BBCU_MIN_GROUP_COUNT = ( CuCDiv( BBCU_BUCKET_ENTRY_COUNT, BBCU_MAX_ENTRIES_PER_GROUP ) );
+static constexpr uint32 BBCU_MAX_GROUP_COUNT = ( CuCDiv( BBCU_BUCKET_ENTRY_COUNT, BBCU_MIN_ENTRIES_PER_GROUP ) );
+
+static_assert( CU_MAX_BC_GROUP_BOUNDARIES >= BBCU_MAX_GROUP_COUNT );
+
+// #NOTE: The above have been tuned for 128 buckets, should check them for other bucket counts.
+//static_assert( BBCU_BUCKET_COUNT == 128, "Unexpected bucket count" );
+
+//-----------------------------------------------------------
+__forceinline__ __device__ uint16 GenLTarget( const uint16 parity, const uint16 rTargetIdx, const uint16 m )
+{
+ const uint16 indJ = rTargetIdx / kC;
+ return ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + rTargetIdx) % kC);
+}
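+
+// GenLTarget computes, for an L entry's y offset within its BC group and a miss
+// value m in [0, kExtraBitsPow), the R-group-local y that would satisfy the Chia
+// matching condition (kB = 119, kC = 127, kBC = kB * kC). The kernels below call it
+// once per m and either look the result up in an R-side map or compare it directly
+// against the local R ys.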
+
+//-----------------------------------------------------------
+__global__ void CudaInitGroupsBucket( uint32* entries )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ entries[gid] = 0xFFFFFFFF;
+}
+
+//-----------------------------------------------------------
+__global__ void CudaInitGroups( uint32* entries, const uint32 entryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= entryCount )
+ return;
+
+ entries[gid] = 0xFFFFFFFF;
+}
+
+//-----------------------------------------------------------
+__global__ void CudaSetFirstAndLastGroup( uint32* groups, const uint32 entryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( id == 0 )
+ groups[id] = 0;
+ else
+ groups[id] = entryCount;
+}
+
+//-----------------------------------------------------------
+__global__ void ScanGroupsCudaK32Bucket( const uint32* yEntries, uint32* groupBoundaries, uint32* gGroupCount, const uint32 entryCount, const uint64 bucketMask )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= entryCount-1 )
+ return;
+
+ __shared__ uint32 sharedGroupCount;
+ if( id == 0 )
+ sharedGroupCount = 0;
+
+ __syncthreads();
+
+ const uint64 currentGroup = ( bucketMask | yEntries[gid] ) / kBC;
+ const uint64 nextGroup = ( bucketMask | yEntries[gid+1] ) / kBC;
+
+ uint32 offset;
+ if( currentGroup != nextGroup )
+ {
+ // #TODO: Use cooperative groups here instead, so we can just sync these threads
+ offset = atomicAdd( &sharedGroupCount, 1 );
+ }
+
+ __syncthreads();
+
+ // Global sync
+ if( id == 0 )
+ sharedGroupCount = atomicAdd( gGroupCount, sharedGroupCount );
+
+ __syncthreads();
+
+ if( currentGroup != nextGroup )
+ {
+ CUDA_ASSERT( sharedGroupCount + offset < CU_MAX_BC_GROUP_BOUNDARIES );
+ groupBoundaries[sharedGroupCount + offset] = gid+1;
+ }
+}
+
+//-----------------------------------------------------------
+__global__ void MatchCudaK32Bucket( const uint64 bucketMask, const uint32 entryCount, const uint32* gGroupCounts, const uint32* yEntries, const uint32* groupBoundaries, uint32* gMatchCount, Pair* outPairs )
+{
+ // 1 thread per y
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( groupIdx >= *gGroupCounts )
+ return;
+
+ const uint32 groupLIdx = groupBoundaries[groupIdx];
+ const uint32 groupRIdx = groupBoundaries[groupIdx+1];
+ const uint32 groupREnd = groupBoundaries[groupIdx+2];
+ const uint64 groupL = ( bucketMask | yEntries[groupLIdx] ) / kBC;
+ const uint64 groupR = ( bucketMask | yEntries[groupRIdx] ) / kBC;
+ const uint32 groupRLength = groupREnd - groupRIdx;
+ const uint64 groupLYStart = groupL * kBC;
+ const uint64 groupRYStart = groupR * kBC;
+ const uint32 groupLLength = groupRIdx - groupLIdx;
+
+#if _DEBUG
+ if( groupLLength >= BBCU_THREADS_PER_MATCH_GROUP || groupRLength >= BBCU_THREADS_PER_MATCH_GROUP )
+ printf( "[%u] Group %u is too large: %u\n", gid, groupRIdx, ( groupRIdx - groupLIdx ) );
+#endif
+ CUDA_ASSERT( groupLLength <= BBCU_THREADS_PER_MATCH_GROUP );
+ CUDA_ASSERT( groupRLength <= BBCU_THREADS_PER_MATCH_GROUP );
+
+ // Generate R group map
+ __shared__ uint32 rMap[kBC/2+1];
+ __shared__ uint32 sharedMatchCount;
+
+ if( groupR - groupL != 1 )
+ return;
+
+ if( id == 0 )
+ sharedMatchCount = 0;
+
+ const uint16 localLY = (uint16)(( bucketMask | yEntries[groupLIdx + min(id, groupLLength-1)] ) - groupLYStart );
+ const uint16 localRY = (uint16)(( bucketMask | yEntries[groupRIdx + min(id, groupRLength-1)] ) - groupRYStart );
+
+ // #TODO: See about using coop_threads here
+ {
+ {
+ uint32 mapIdx = id;
+ while( mapIdx < kBC/2+1 )
+ {
+ // Each entry is:
+ // hi lo
+ // 7 9 7 9
+ //(count,offset|count,offset)
+ rMap[mapIdx] = 0x01FF01FF;
+ mapIdx += BBCU_THREADS_PER_MATCH_GROUP;
+ }
+ }
+
+ __syncthreads();
+
+ const uint16 shift = ( ( localRY & 1 ) << 4 ); // Shift left by 16 bits if odd
+ const uint32 idx = localRY >> 1; // Divide by 2
+
+ // First set the offsets for the even ones (lower bits)
+ if( id < groupRLength && ( localRY & 1 ) == 0 )
+ atomicMin( &rMap[idx], id | 0x01FF0000 );
+
+ __syncthreads();
+
+ // Then set offset for the odd ones
+ if( id < groupRLength && ( localRY & 1 ) )
+ atomicMin( &rMap[idx], (id << 16) | (rMap[idx] & 0x0000FFFF) );
+
+ __syncthreads();
+
+ // Finally, add the counts
+ if( id < groupRLength )
+ atomicAdd( &rMap[idx], 0x200ul << shift );
+ }
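+
+ // For illustration: after the three passes above, a slot holding one even entry at
+ // local offset 3 and two odd entries starting at offset 7 reads rMap[idx] = 0x04070203,
+ // i.e. (2 << 9) | 7 in the high 16 bits and (1 << 9) | 3 in the low 16 bits.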
+
+ if( id >= groupLLength )
+ return;
+
+ __syncthreads();
+
+
+ // Begin matching
+ constexpr uint32 MAX_MATCHES = 16;
+ Pair matches[MAX_MATCHES];
+ uint32 matchCount = 0;
+
+ #pragma unroll
+ for( uint32 i = 0; i < kExtraBitsPow; i++ )
+ {
+ const uint16 lTarget = GenLTarget( (byte)(groupL & 1), localLY, (uint16)i );
+ const uint16 shift = ( ( lTarget & 1 ) << 4 ); // Shift left by 16 bits if odd
+ const uint16 rValue = (uint16)(rMap[lTarget>>1] >> shift);
+ const int16 rCount = (int16)(rValue >> 9);
+
+ for( int32 j = 0; j < rCount; j++ )
+ {
+ CUDA_ASSERT( matchCount < MAX_MATCHES );
+ matches[matchCount++] = { groupLIdx + id, groupRIdx + (rValue & 0x1FF) + j };
+ }
+ }
+
+ // Store final values
+ const uint32 copyOffset = atomicAdd( &sharedMatchCount, matchCount );
+ __syncthreads();
+
+ // Store our shared match count and get our global offset
+ if( id == 0 )
+ sharedMatchCount = atomicAdd( gMatchCount, sharedMatchCount );
+ __syncthreads();
+
+ outPairs += copyOffset + sharedMatchCount;
+
+ for( uint32 i = 0; i < matchCount; i++ )
+ {
+ CUDA_ASSERT( matches[i].left < entryCount );
+ CUDA_ASSERT( matches[i].right < entryCount );
+
+ outPairs[i] = matches[i];
+ }
+}
+
+/// This kernel, meant for harvesting compressed k32 plots,
+/// matches adjacent BC groups with 64 threads per block.
+/// Each block represents 1 L entry, and each thread performs
+/// one of the 2^kExtraBits (64) match-target iterations required
+/// per L entry during normal matching.
+/// Since compressed groups are small, we expect this
+/// to be a reasonable way to implement matching,
+/// vs. the way it is implemented in plotting, where the group
+/// sizes are exploited.
+//-----------------------------------------------------------
+__global__ void HarvestMatchK32Kernel(
+ Pair* gOutMatches,
+ uint32* gOutMatchCount,
+ const uint64* yEntries,
+ const uint32 entryCount,
+ const uint32 matchOffset
+)
+{
+ const uint32 id = threadIdx.x;
+ const uint32 yIdx = blockIdx.x;
+ const uint32 gid = yIdx + id;
+
+ CUDA_ASSERT( id < 64 );
+
+ constexpr uint32 SHARED_R_BUF_SIZE = 64;
+ constexpr uint32 MAX_MATCHES = 16;
+
+ // Read rGroup entries into a shared buffer
+ __shared__ uint64 rBuf[SHARED_R_BUF_SIZE];
+ __shared__ uint32 sharedMatchCount;
+ __shared__ uint64 lYShared;
+
+ // Find group boundaries
+ __shared__ uint32 lGroupStart;
+ __shared__ uint32 rGroupStartShared;
+ __shared__ uint32 rGroupEnd;
+
+ uint64 myY = 0xFFFFFFFFFFFFFFFF;
+ if( gid < entryCount )
+ myY = yEntries[gid];
+
+
+ if( id == 0 )
+ {
+ lYShared = myY;
+ sharedMatchCount = 0;
+ rGroupStartShared = 0xFFFFFFFF;
+ rGroupEnd = 0xFFFFFFFF;
+ }
+ __syncthreads();
+
+ const uint32 groupL = (uint32)(lYShared / kBC);
+ const uint32 myGroup = (uint32)(myY / kBC);
+
+ if( myGroup - groupL == 1 )
+ atomicMin( &rGroupStartShared, id );
+
+ __syncthreads();
+
+ // Not an adjacent group, exit
+ if( rGroupStartShared == 0xFFFFFFFF )
+ return;
+
+ const uint32 rGroupStart = rGroupStartShared;
+
+ // Store our read Y into shared value buffer
+ rBuf[id] = myY;
+ __syncthreads();
+
+ // Now find the R group end
+ // Notice we store the global index here, not the block-local one,
+ // like we did for rGroupStart
+ const uint32 groupR = (uint32)( rBuf[rGroupStart] / kBC);
+ if( myGroup > groupR )
+ atomicMin( &rGroupEnd, gid );
+
+ __syncthreads();
+
+ // Is it the last R group?
+ if( id == 0 && rGroupEnd == 0xFFFFFFFF )
+ rGroupEnd = entryCount;
+
+ __syncthreads();
+ CUDA_ASSERT( rGroupEnd < 0xFFFFFFFF );
+
+ // We should have all the info we need to match this Y now
+ const uint32 rGroupLength = rGroupEnd - (yIdx + rGroupStart);
+
+ const uint64 lY = lYShared;
+ const uint64 groupLYStart = ((uint64)groupL) * kBC;
+ const uint64 groupRYStart = ((uint64)groupR) * kBC;
+ const uint16 localLY = (uint16)(lY - groupLYStart);
+
+ const uint16 lTarget = GenLTarget( (byte)(groupL & 1), localLY, (uint16)id );
+
+ Pair matches[MAX_MATCHES];
+ uint32 matchCount = 0;
+
+ #pragma unroll
+ for( uint32 i = rGroupStart; i < (rGroupStart+rGroupLength); i++ )
+ {
+ const uint64 rY = rBuf[i];
+ const uint16 localRY = (uint16)(rY - groupRYStart);
+
+ if( lTarget == localRY )
+ {
+ CUDA_ASSERT( matchCount <= MAX_MATCHES );
+ matches[matchCount++] = { matchOffset + yIdx, matchOffset + yIdx + i };
+ }
+ }
+
+ // Store matches into global memory
+ const uint32 offset = atomicAdd( &sharedMatchCount, matchCount );
+
+ __syncthreads();
+ if( sharedMatchCount == 0 )
+ return;
+
+ if( id == 0 )
+ sharedMatchCount = atomicAdd( gOutMatchCount, sharedMatchCount );
+
+ __syncthreads();
+
+ // Copy matches to global buffer
+ const uint32 out = sharedMatchCount + offset;
+
+ for( uint32 i = 0; i < matchCount; i++ )
+ gOutMatches[out+i] = matches[i];
+}
+
+//-----------------------------------------------------------
+__global__ void MatchCudaK32KernelInternal(
+ Pair* outPairs,
+ uint32* gMatchCount,
+ const uint32 entryCount,
+ const uint32* gGroupCounts,
+ const uint64* yEntries,
+ const uint32* groupBoundaries )
+{
+ // 1 thread per y
+ const uint32 id = threadIdx.x;
+ const uint32 groupIdx = blockIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( groupIdx >= *gGroupCounts )
+ return;
+
+ const uint32 groupLIdx = groupBoundaries[groupIdx];
+ const uint32 groupRIdx = groupBoundaries[groupIdx+1];
+ const uint32 groupREnd = groupBoundaries[groupIdx+2];
+ const uint64 groupL = yEntries[groupLIdx] / kBC;
+ const uint64 groupR = yEntries[groupRIdx] / kBC;
+ const uint32 groupRLength = groupREnd - groupRIdx;
+ const uint64 groupLYStart = groupL * kBC;
+ const uint64 groupRYStart = groupR * kBC;
+ const uint32 groupLLength = groupRIdx - groupLIdx;
+
+#if _DEBUG
+ if( groupLLength >= BBCU_THREADS_PER_MATCH_GROUP || groupRLength >= BBCU_THREADS_PER_MATCH_GROUP )
+ printf( "[%u] Group %u is too large: %u\n", gid, groupRIdx, ( groupRIdx - groupLIdx ) );
+#endif
+ CUDA_ASSERT( groupLLength <= BBCU_THREADS_PER_MATCH_GROUP );
+ CUDA_ASSERT( groupRLength <= BBCU_THREADS_PER_MATCH_GROUP );
+
+ // Each rMap entry is:
+ // hi lo
+ // 7 9 7 9
+ //(count,offset|count,offset)
+ uint32 rMap[kBC/2+1] = {};
+
+ __shared__ uint32 sharedMatchCount;
+ if( id == 0 )
+ sharedMatchCount = 0;
+ __syncthreads();
+
+ if( groupR - groupL != 1 )
+ return;
+
+ const uint16 localLY = (uint16)( yEntries[groupLIdx + min(id, groupLLength-1)] - groupLYStart );
+ const uint16 localRYBase = (uint16)( yEntries[groupRIdx + min(id, groupRLength-1)] - groupRYStart );
+
+ // Packed rMap. 2 entries (of count and offset) per DWORD
+ for( uint32 i = 0; i < groupRLength; i++ )
+ {
+ const uint16 localRY = localRYBase + (uint16)i;
+
+ const uint32 idx = localRY >> 1; // Index in the rMap (Divide by 2)
+ const uint32 value = rMap[idx];
+
+ // Increase the count and set the offset
+ if( (localRY & 1) == 0 )
+ {
+ // Even value, store in the LSbits
+ rMap[idx] = (value + 0x200) | i;
+ }
+ else
+ {
+ // Odd value, store in the MSbits
+ rMap[idx] = (value + 0x2000000) | (i << 16);
+ }
+ }
+ __syncthreads();
+
+
+ // Begin matching
+ constexpr uint32 MAX_MATCHES = 16;
+ Pair matches[MAX_MATCHES];
+ uint32 matchCount = 0;
+
+ #pragma unroll
+ for( uint32 i = 0; i < kExtraBitsPow; i++ )
+ {
+ const uint16 lTarget = GenLTarget( (byte)(groupL & 1), localLY, (uint16)i );
+ const uint16 shift = ( ( lTarget & 1 ) << 4 ); // Shift left by 16 bits if odd
+ const uint16 rValue = (uint16)(rMap[lTarget>>1] >> shift);
+ const int16 rCount = (int16)(rValue >> 9);
+
+ for( int32 j = 0; j < rCount; j++ )
+ {
+ if( matchCount >= MAX_MATCHES )
+ {
+ printf( "[%u] We got too many (i=%u) matches: %u\n", gid, i, matchCount );
+ }
+ CUDA_ASSERT( matchCount < MAX_MATCHES );
+ matches[matchCount++] = { groupLIdx + id, groupRIdx + (rValue & 0x1FF) + j };
+ }
+ }
+
+ // Store final values
+ const uint32 copyOffset = atomicAdd( &sharedMatchCount, matchCount );
+ __syncthreads();
+
+ // Store our shared match count and get our global offset
+ if( id == 0 )
+ sharedMatchCount = atomicAdd( gMatchCount, sharedMatchCount );
+ __syncthreads();
+
+ outPairs += copyOffset + sharedMatchCount;
+
+ for( uint32 i = 0; i < matchCount; i++ )
+ {
+ CUDA_ASSERT( matches[i].left < entryCount );
+ CUDA_ASSERT( matches[i].right < entryCount );
+
+ outPairs[i] = matches[i];
+ }
+}
+
+//-----------------------------------------------------------
+__global__ void MatchK32Kernel(
+ Pair* outPairs,
+ uint32* gMatchCount,
+ const uint32 entryCount,
+ const uint32* gGroupCounts,
+ const uint64* yEntries,
+ const uint32* groupBoundaries )
+{
+ // CUDA_ASSERT( blockDim.x == 1 );
+ // CUDA_ASSERT( blockIdx.x == 0 );
+ // CUDA_ASSERT( threadIdx.x == 0 );
+
+
+ // const uint32 groupCount = *gGroupCounts;
+ // const uint32 entriesPerGroup = (entryCount / groupCount) + 6;
+
+ // const uint32 blocks = groupCount;
+ // const uint32 threads = entriesPerGroup;
+
+// HarvestMatchK32Kernel<<<blocks, threads>>>(
+ // gMatchCount,
+ // const uint32 lGroupIdx,
+ // const uint32 lYIdx,
+ // const uint32 rGroupIdx,
+ // const uint32 rGroupLength,
+ // const uint64* yEntries
+
+// MatchCudaK32KernelInternal<<<blocks, threads>>>( outPairs, gMatchCount, entryCount, gGroupCounts, yEntries, groupBoundaries );
+
+ // const cudaError_t err = cudaGetLastError();
+ // assert( err == cudaSuccess );
+}
+
+//-----------------------------------------------------------
+__global__ void ScanGroupsK32Kernel(
+ uint32* groupIndices,
+ uint32* outGroupCount,
+ const uint64* yEntries,
+ const uint32 entryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedGroupCount;
+
+ if( id == 0 )
+ sharedGroupCount = 0;
+ __syncthreads();
+
+ if( gid >= entryCount-1 )
+ return;
+
+ const uint32 currentGroup = (uint32)(yEntries[gid] / kBC);
+ const uint32 nextGroup = (uint32)(yEntries[gid+1] / kBC);
+
+ uint32 offset;
+ if( currentGroup != nextGroup )
+ offset = atomicAdd( &sharedGroupCount, 1 );
+
+ __syncthreads();
+
+ // Global sync
+ if( id == 0 )
+ sharedGroupCount = atomicAdd( outGroupCount, sharedGroupCount );
+
+ __syncthreads();
+
+ if( currentGroup != nextGroup )
+ groupIndices[sharedGroupCount + offset] = gid+1;
+ // // CUDA_ASSERT( sharedGroupCount + offset < CU_MAX_BC_GROUP_BOUNDARIES );
+}
+
+//-----------------------------------------------------------
+cudaError CudaHarvestMatchK32(
+ Pair* devOutPairs,
+ uint32* devMatchCount,
+ const uint32 maxMatches,
+ const uint64* devYEntries,
+ const uint32 entryCount,
+ const uint32 matchOffset,
+ cudaStream_t stream )
+{
+ uint32 kthreads = 64;
+ uint32 kblocks = entryCount-1;
+
+ cudaError cErr = cudaMemsetAsync( devMatchCount, 0, sizeof( uint32 ), stream );
+ if( cErr != cudaSuccess )
+ return cErr;
+
+ HarvestMatchK32Kernel<<<kblocks, kthreads, 0, stream>>>(
+ devOutPairs, devMatchCount, devYEntries, entryCount, matchOffset );
+
+// #if _DEBUG
+// uint32 matchCount = 0;
+// CudaErrCheck( cudaMemcpyAsync( &matchCount, devMatchCount, sizeof( uint32 ) , cudaMemcpyDeviceToHost, stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+
+// Pair* matches = new Pair[matchCount];
+// CudaErrCheck( cudaMemcpyAsync( matches, devOutPairs, sizeof( Pair ) * matchCount , cudaMemcpyDeviceToHost, stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+// CudaErrCheck( cudaStreamSynchronize( stream ) );
+// #endif
+
+ return cudaSuccess;
+}
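+
+// Usage sketch (hypothetical buffer names, not part of this change): the caller owns a
+// device Pair buffer with capacity >= maxMatches and a single device uint32 for the count:
+//
+//     CudaErrCheck( CudaHarvestMatchK32( devPairs, devMatchCount, maxMatches,
+//                                        devYEntries, entryCount, /*matchOffset*/ 0, stream ) );
+//     // Copy devMatchCount back to the host and sync the stream before consuming devPairs.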
+
+
+//-----------------------------------------------------------
+void CudaMatchBucketizedK32(
+ CudaK32PlotContext& cx,
+ const uint32* devY,
+ cudaStream_t stream,
+ cudaEvent_t event )
+{
+ const TableId inTable = cx.table - 1;
+ const uint32 entryCount = cx.bucketCounts[(int)inTable][cx.bucket];
+ const uint64 bucketMask = BBC_BUCKET_MASK( cx.bucket );
+
+ constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS );
+
+ uint32* tmpGroupCounts = (uint32*)cx.devMatches;
+
+ {
+ // Initialize the entries to the max value so that they are not included in the sort
+ CudaInitGroupsBucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( tmpGroupCounts );
+
+ // Add first group and last ghost group
+ CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
+ }
+
+ CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
+ CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
+ ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );
+
+ byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
+ size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 );
+
+#if _DEBUG
+ size_t sortSize = 0;
+ cub::DeviceRadixSort::SortKeys( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 );
+ ASSERT( sortSize <= sortTmpSize );
+#endif
+
+ cub::DeviceRadixSort::SortKeys( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream );
+
+ MatchCudaK32Bucket<<<BBCU_MAX_GROUP_COUNT, BBCU_THREADS_PER_MATCH_GROUP, 0, stream>>>( bucketMask, entryCount, cx.devGroupCount, devY, cx.devGroupBoundaries, cx.devMatchCount, cx.devMatches );
+}
+
+//-----------------------------------------------------------
+// cudaError CudaHarvestMatchK32WithGroupScan(
+// Pair* devOutPairs,
+// uint32* devMatchCount,
+// const uint32 maxMatches,
+// uint32* devGroupIndices,
+// uint32* devGroupIndicesTemp,
+// const uint32 maxGroups,
+// void* sortBuffer,
+// const size_t sortBufferSize,
+// const uint64* devYEntries,
+// const uint32 entryCount,
+// const uint32 matchOffset,
+// cudaStream_t stream )
+// {
+// // Scan for BC groups
+// {
+// const uint32 kblocks = 0;
+// const uint32 kthreads = 0;
+
+
+// // constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS );
+// // Initialize the entries to the max value so that they are not included in the sort
+// CudaInitGroups<<<kblocks, kthreads, 0, stream>>>( devGroupIndicesTemp, entryCount );
+// // CudaInitGroupsBucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( tmpGroupCounts );
+
+// // Add first group and last ghost group
+// CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
+// }
+
+// CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
+// CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
+// ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );
+
+// byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
+// size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 );
+
+// #if _DEBUG
+// size_t sortSize = 0;
+// cub::DeviceRadixSort::SortKeys( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 );
+// ASSERT( sortSize <= sortTmpSize );
+// #endif
+
+// cub::DeviceRadixSort::SortKeys( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream );
+
+// }
diff --git a/cuda/CudaMatch.h b/cuda/CudaMatch.h
new file mode 100644
index 00000000..f52e02fa
--- /dev/null
+++ b/cuda/CudaMatch.h
@@ -0,0 +1,30 @@
+#pragma once
+#include <cuda_runtime.h>
+
+/// Unbucketized CUDA-based matching function for k32 compressed plots.
+/// This method is meant to only be used with compressed plots.
+cudaError CudaHarvestMatchK32(
+ struct Pair* devOutPairs,
+ uint32* devMatchCount,
+ const uint32 maxMatches,
+ const uint64* devYEntries,
+ const uint32 entryCount,
+ const uint32 matchOffset,
+ cudaStream_t stream );
+
+/// Unbucketized CUDA-based matching function, specifically for k32.
+/// The matches are deterministic. That is, you will always get the
+/// same matches given the same input, though the order of the
+// /// stored matches is not deterministic.
+// cudaError CudaMatchK32(
+// struct Pair* devOutPairs,
+// uint32* devMatchCount,
+// uint32* devTempGroupIndices,
+// uint32* devGroupIndices,
+// uint32* devGroupCount,
+// uint32 maxGroups,
+// byte* devSortTempData,
+// const size_t sortTempDataSize,
+// const uint64* devYEntries,
+// const uint32 entryCount,
+// cudaStream_t stream );
diff --git a/cuda/CudaParkSerializer.cu b/cuda/CudaParkSerializer.cu
new file mode 100644
index 00000000..f3e8b8d4
--- /dev/null
+++ b/cuda/CudaParkSerializer.cu
@@ -0,0 +1,296 @@
+#include "CudaParkSerializer.h"
+#include "CudaFSE.cuh"
+
+
+//-----------------------------------------------------------
+void InitFSEBitMask( CudaK32PlotContext& cx )
+{
+ static bool _initialized = false;
+ if( _initialized )
+ return;
+
+ _initialized = true;
+
+ uint32 bitmask[] = {
+ 0, 1, 3, 7, 0xF, 0x1F,
+ 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+ 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
+ 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+ 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+ 0x3FFFFFFF, 0x7FFFFFFF
+ };
+
+ CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+}
+
+
+//-----------------------------------------------------------
+void CompressToParkInGPU( const uint32 parkCount, const size_t parkSize,
+ uint64* devLinePoints, byte* devParkBuffer, const size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* devCTable, uint32* devParkOverrunCount, cudaStream_t stream )
+{
+ const uint32 kThreadCount = 256;
+ const uint32 kBlocks = CDivT( parkCount, kThreadCount );
+ CudaCompressToPark<<<kBlocks, kThreadCount, 0, stream>>>( parkCount, parkSize, devLinePoints, devParkBuffer, parkBufferSize, stubBitSize, devCTable, devParkOverrunCount );
+}
+
+//-----------------------------------------------------------
+__global__ void CudaCompressToPark(
+ const uint32 parkCount, const size_t parkSize,
+ uint64* linePoints, byte* parkBuffer, const size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* cTable, uint32* gParkOverrunCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= parkCount )
+ return;
+
+ linePoints += kEntriesPerPark * (size_t)gid;
+ parkBuffer += parkBufferSize * (size_t)gid;
+
+ // __shared__ uint16 sharedCTable[34812/2];
+
+
+ CUDA_ASSERT( (uintptr_t)parkBuffer / sizeof( uint64 ) * sizeof( uint64 ) == (uintptr_t)parkBuffer ); // Must be 64-bit aligned
+ uint64* writer = (uint64*)parkBuffer;
+
+ // Write the first LinePoint as a full LinePoint
+ uint64 prevLinePoint = linePoints[0];
+
+ *writer++ = CuBSwap64( prevLinePoint );
+
+ // Grab the writing location after the stubs
+ const size_t stubSectionBytes = CuCDiv( (kEntriesPerPark - 1) * (size_t)stubBitSize, 8 );
+
+ byte* deltaBytesWriter = ((byte*)writer) + stubSectionBytes;
+
+ // Write stubs
+ {
+ const uint64 stubMask = ((1ULL << stubBitSize) - 1);
+
+ uint64 field = 0; // Current field to write
+ uint bits = 0; // Bits occupying the current field (always shifted to the leftmost bits)
+
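+        // Illustrative example, assuming a 29-bit stub size: if the current field already
+        // holds 40 bits, then freeBits = 24, so the stub's top 24 bits complete this field
+        // and its remaining 5 bits carry over as the leftmost bits of the next field.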
+ #pragma unroll
+ for( uint32 i = 1; i < kEntriesPerPark; i++ )
+ {
+ const uint64 lpDelta = linePoints[i];
+ const uint64 stub = lpDelta & stubMask;
+
+ // Serialize into bits, one uint64 field at a time
+ // Always store it all the way to the MSbits
+ const uint freeBits = 64 - bits;
+ if( freeBits <= stubBitSize )
+ {
+                // The stub bits that did not fit into the current field become the starting bits of the next field
+ bits = (uint32)stubBitSize - freeBits;
+
+ // Write what we can (which may be nothing) into the free bits of the current field
+ field |= stub >> bits;
+
+ // Store field
+ *writer++ = CuBSwap64( field );
+
+ const uint remainder = 64 - bits;
+ uint64 mask = ( ( 1ull << bits ) - 1 ) << (remainder & 63);
+ field = ( stub << remainder ) & mask;
+ }
+ else
+ {
+ // The stub completely fits into the current field with room to spare
+ field |= stub << (freeBits - stubBitSize);
+ bits += stubBitSize;
+ }
+ }
+
+ // Write any trailing fields
+ if( bits > 0 )
+ *writer++ = CuBSwap64( field );
+
+ // Zero-out any remaining unused bytes
+ // const size_t stubUsedBytes = CDiv( (kEntriesPerPark - 1) * (size_t)stubBitSize, 8 );
+ // const size_t remainderBytes = stubSectionBytes - stubUsedBytes;
+
+ // memset( deltaBytesWriter - remainderBytes, 0, remainderBytes );
+ }
+
+
+ // Convert to small deltas
+ byte* smallDeltas = (byte*)&linePoints[1];
+
+ #pragma unroll
+ for( uint32 i = 1; i < kEntriesPerPark; i++ )
+ {
+ const uint64 smallDelta = linePoints[i] >> stubBitSize;
+ CUDA_ASSERT( smallDelta < 256 );
+
+ smallDeltas[i-1] = (byte)smallDelta;
+ }
+
+ // Write small deltas
+ size_t parkSizeWritten = 0;
+ {
+ byte* deltaSizeWriter = (byte*)deltaBytesWriter;
+ deltaBytesWriter += 2;
+
+ // CUDA_ASSERT( smallDeltas[0] == 3 );
+ size_t deltasSize = CUDA_FSE_compress_usingCTable(
+ deltaBytesWriter, (kEntriesPerPark-1) * 8,
+ smallDeltas, kEntriesPerPark-1, cTable );
+
+ if( deltasSize == 0 )
+ {
+ // #TODO: Set error
+ CUDA_ASSERT( 0 );
+ }
+ else
+ {
+ // Deltas were compressed
+
+ //memcpy( deltaSizeWriter, &deltasSize, sizeof( uint16 ) );
+ // *deltaSizeWriter = (uint16)deltasSize;
+ deltaSizeWriter[0] = (byte)( deltasSize ); // Stored as LE
+ deltaSizeWriter[1] = (byte)( deltasSize >> 8 );
+ }
+
+ if( ((deltaBytesWriter + deltasSize) - parkBuffer) > parkSize )
+ {
+        atomicAdd( gParkOverrunCount, 1u );  // Record the overrun atomically; many parks are serialized concurrently
+ }
+// #if _DEBUG
+ // deltaBytesWriter += deltasSize;
+ // parkSizeWritten = deltaBytesWriter - parkBuffer;
+
+ // if( parkSizeWritten > parkSize )
+ // printf( "[CUDA KERN ERROR] Overran park buffer: %llu / %llu\n", parkSizeWritten, parkSize );
+ // CUDA_ASSERT( parkSizeWritten <= parkSize );
+// #endif
+
+ // Zero-out any remaining bytes in the deltas section
+ // const size_t parkSizeRemainder = parkSize - parkSizeWritten;
+
+ // memset( deltaBytesWriter, 0, parkSizeRemainder );
+ }
+
+ // return parkSizeWritten;
+}
+
+
+// #TODO: Check if deltafying in a different kernel would be good
+//-----------------------------------------------------------
+__global__ void CudaCompressC3Park( const uint32 parkCount, uint32* f7Entries, byte* parkBuffer, const size_t c3ParkSize, const FSE_CTable* cTable )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= parkCount )
+ return;
+
+ f7Entries += gid * kCheckpoint1Interval;
+ parkBuffer += gid * c3ParkSize;
+
+ byte* deltaWriter = (byte*)f7Entries;
+
+ // Convert to deltas
+
+ // f7Entries must always start at an interval of kCheckpoint1Interval
+ // Therefore its first entry is a C1 entry, and not written as a delta.
+ uint32 prevF7 = *f7Entries;
+
+ #pragma unroll
+ for( uint32 i = 1; i < kCheckpoint1Interval; i++ )
+ {
+ const uint32 f7 = f7Entries[i];
+ const uint32 delta = f7 - prevF7;
+ prevF7 = f7;
+
+ CUDA_ASSERT( delta < 255 );
+ *deltaWriter++ = (byte)delta;
+ }
+
+ CUDA_ASSERT( (uintptr_t)(deltaWriter - (byte*)f7Entries) == kCheckpoint1Interval-1 );
+
+ // Serialize them into the C3 park buffer
+ const size_t compressedSize = CUDA_FSE_compress_usingCTable(
+ parkBuffer+2, c3ParkSize, (byte*)f7Entries,
+ kCheckpoint1Interval-1, cTable );
+
+ CUDA_ASSERT( (compressedSize+2) < c3ParkSize );
+ CUDA_ASSERT( (compressedSize+2) < 3000 );
+
+ // Store size in the first 2 bytes
+ //memcpy( parkBuffer, &sizeu16, sizeof( uint16) );
+ parkBuffer[0] = (byte)( compressedSize >> 8 ); // Stored as BE
+ parkBuffer[1] = (byte)( compressedSize );
+}
+
+//-----------------------------------------------------------
+void CompressC3ParksInGPU( const uint32 parkCount, uint32* devF7, byte* devParkBuffer,
+ const size_t parkBufSize, const FSE_CTable* cTable, cudaStream_t stream )
+{
+ const uint32 kthreads = 128;
+ const uint32 kblocks = CDiv( parkCount, kthreads );
+
+    CudaCompressC3Park<<<kblocks, kthreads, 0, stream>>>( parkCount, devF7, devParkBuffer, parkBufSize, cTable );
+}
+
+
+//-----------------------------------------------------------
+__global__ void CudaWritePark7( const uint32 parkCount, const uint32* indices, uint64* fieldWriter, const size_t parkFieldCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= parkCount )
+ return;
+
+ indices += gid * kEntriesPerPark;
+ fieldWriter += gid * parkFieldCount;
+
+ const uint32 bitsPerEntry = BBCU_K + 1;
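+    // For k = 32 this is 33 bits per park 7 entry.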
+
+ uint64 field = 0;
+ uint32 bits = 0;
+
+ #pragma unroll
+ for( int32 i = 0; i < kEntriesPerPark; i++ )
+ {
+ const uint64 index = indices[i];
+ const uint32 freeBits = 64 - bits;
+
+ // Filled a field?
+ if( freeBits <= bitsPerEntry )
+ {
+ bits = bitsPerEntry - freeBits;
+ field |= index >> bits;
+
+ // Store field
+ *fieldWriter++ = CuBSwap64( field );
+
+ const uint remainder = 64 - bits;
+ uint64 mask = ( ( 1ull << bits ) - 1 ) << (remainder & 63);
+ field = ( index << remainder ) & mask;
+ }
+ else
+ {
+ // The entry completely fits into the current field with room to spare
+ field |= index << ( freeBits - bitsPerEntry );
+ bits += bitsPerEntry;
+ }
+ }
+
+ // Write any trailing fields
+ if( bits > 0 )
+ *fieldWriter = CuBSwap64( field );
+}
+
+//-----------------------------------------------------------
+void SerializePark7InGPU( const uint32 parkCount, const uint32* indices, uint64* fieldWriter, const size_t parkFieldCount, cudaStream_t stream )
+{
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDiv( parkCount, kthreads );
+
+    CudaWritePark7<<<kblocks, kthreads, 0, stream>>>( parkCount, indices, fieldWriter, parkFieldCount );
+}
diff --git a/cuda/CudaParkSerializer.h b/cuda/CudaParkSerializer.h
new file mode 100644
index 00000000..7b3171d2
--- /dev/null
+++ b/cuda/CudaParkSerializer.h
@@ -0,0 +1,20 @@
+#pragma once
+#include "CudaPlotContext.h"
+
+typedef unsigned FSE_CTable;
+
+void InitFSEBitMask( struct CudaK32PlotContext& cx );
+
+void CompressC3ParksInGPU( const uint32 parkCount, uint32* devF7, byte* devParkBuffer,
+ size_t parkBufSize, const FSE_CTable* cTable, cudaStream_t stream );
+
+void SerializePark7InGPU( const uint32 parkCount, const uint32* indices, uint64* fieldWriter,
+ const size_t parkFieldCount, cudaStream_t stream );
+
+void CompressToParkInGPU( const uint32 parkCount, const size_t parkSize,
+ uint64* devLinePoints, byte* devParkBuffer, size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* devCTable, uint32* devParkOverrunCount, cudaStream_t stream );
+
+__global__ void CudaCompressToPark( const uint32 parkCount, const size_t parkSize,
+ uint64* linePoints, byte* parkBuffer, size_t parkBufferSize,
+ const uint32 stubBitSize, const FSE_CTable* cTable, uint32* gParkOverrunCount );
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
new file mode 100644
index 00000000..80721e9f
--- /dev/null
+++ b/cuda/CudaPlotConfig.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#define BBCU_GPU_STREAM_COUNT 4
+#define BBCU_GPU_BUFFER_MAX_COUNT 4
+#define BBCU_DEFAULT_GPU_BUFFER_COUNT 2
+
+#define BBCU_K (32u)
+#define BBCU_BUCKET_COUNT (128u)
+#define BBC_Y_BITS (BBCU_K+kExtraBits)
+#define BBC_Y_BITS_T7 (BBCU_K)
+#define BBC_BUCKET_BITS (CuBBLog2( BBCU_BUCKET_COUNT ))
+#define BBC_BUCKET_SHIFT (BBC_Y_BITS-BBC_BUCKET_BITS)
+#define BBC_BUCKET_SHIFT_T7 (BBC_Y_BITS_T7-BBC_BUCKET_BITS)
+#define BBC_Y_MASK ((uint32)((1ull << BBC_Y_BITS) - 1))
+#define BBC_Y_MASK_T7 (0xFFFFFFFFu)
+#define BBC_BUCKET_MASK( bucket ) ( ((uint64)bucket) << BBC_BUCKET_SHIFT )
+
+
+#define BBCU_TABLE_ENTRY_COUNT (1ull<<32)
+#define BBCU_BUCKET_ENTRY_COUNT (BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT)
+//#define BBCU_XTRA_ENTRIES_PER_SLICE (1024u*64u)
+#define BBCU_XTRA_ENTRIES_PER_SLICE (4096u*1u)
+#define BBCU_MAX_SLICE_ENTRY_COUNT ((BBCU_BUCKET_ENTRY_COUNT/BBCU_BUCKET_COUNT)+BBCU_XTRA_ENTRIES_PER_SLICE)
+#define BBCU_BUCKET_ALLOC_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_BUCKET_COUNT)
+#define BBCU_TABLE_ALLOC_ENTRY_COUNT (((uint64)BBCU_BUCKET_ALLOC_ENTRY_COUNT)*BBCU_BUCKET_COUNT)
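+// With 128 buckets these work out to 2^32 / 128 = 33,554,432 entries per bucket,
+// 262,144 + 4,096 = 266,240 entries per slice, and 266,240 * 128 = 34,078,720 allocated
+// entries per bucket, slightly over-provisioned to absorb uneven slice distribution.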
+
+// The host always needs to start slices at the meta4 size, to avoid overwriting by subsequent tables
+#define BBCU_HOST_META_MULTIPLIER (4ull)
+#define BBCU_META_SLICE_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_HOST_META_MULTIPLIER)
+#define BBCU_META_BUCKET_ENTRY_COUNT (BBCU_BUCKET_ALLOC_ENTRY_COUNT*BBCU_HOST_META_MULTIPLIER)
+
+#define BBCU_SCAN_GROUP_THREAD_COUNT 128
+#define BBCU_SCAN_GROUP_ENTRIES_PER_THREAD 512
+
+static constexpr uint32 CU_MAX_BC_GROUP_BOUNDARIES = ( BBCU_BUCKET_ENTRY_COUNT / 210 ); // Should be enough for all threads
+
+
+static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLICE_ENTRY_COUNT );
+
+#if _DEBUG
+
+ #ifdef _WIN32
+ #define DBG_BBCU_DBG_DIR "D:/dbg/cuda/"
+ #else
+ // #define DBG_BBCU_DBG_DIR "/home/harold/plot/dbg/cuda/"
+ #define DBG_BBCU_DBG_DIR "/home/harito/plot/dbg/cuda/"
+ #endif
+ // #define DBG_BBCU_REF_DIR "/home/harold/plot/ref/"
+
+
+ // #define BBCU_DBG_SKIP_PHASE_1 1 // Skip phase 1 and load pairs from disk
+ // #define BBCU_DBG_SKIP_PHASE_2 1 // Skip phase 1 and 2 and load pairs and marks from disk
+
+ #if (defined( BBCU_DBG_SKIP_PHASE_2 ) && !defined( BBCU_DBG_SKIP_PHASE_1 ) )
+ #define BBCU_DBG_SKIP_PHASE_1 1
+ #endif
+
+ // #define DBG_BBCU_P1_WRITE_CONTEXT 1
+ // #define DBG_BBCU_P1_WRITE_PAIRS 1
+ // #define DBG_BBCU_P2_WRITE_MARKS 1
+
+ // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+
+
+ #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
+ #define ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) _ASSERT_DOES_NOT_OVERLAP( ((byte*)b0), ((byte*)b1), (size) )
+
+ #define _ASSERT_DOES_NOT_OVERLAP2( b0, b1, sz0, sz1 )ASSERT( (b1+sz1) <= b0 || b1 >= (b0+sz0) )
+ #define ASSERT_DOES_NOT_OVERLAP2( b0, b1, size0, size1 ) _ASSERT_DOES_NOT_OVERLAP2( ((byte*)b0), ((byte*)b1), (size0), (size1) )
+
+#else
+
+ #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size )
+ #define ASSERT_DOES_NOT_OVERLAP( b0, b1, size )
+ #define _ASSERT_DOES_NOT_OVERLAP2( b0, b1, sz0, sz1 )
+ #define ASSERT_DOES_NOT_OVERLAP2( b0, b1, size0, size1 )
+#endif
\ No newline at end of file
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
new file mode 100644
index 00000000..f4e8d909
--- /dev/null
+++ b/cuda/CudaPlotContext.h
@@ -0,0 +1,560 @@
+#pragma once
+
+#include "CudaPlotConfig.h"
+#include "CudaUtil.h"
+#include "ChiaConsts.h"
+#include "CudaPlotter.h"
+#include "plotting/PlotTypes.h"
+#include "plotting/PlotWriter.h"
+#include "GpuStreams.h"
+#include "util/StackAllocator.h"
+#include "fse/fse.h"
+#include "threading/Fence.h"
+#include "plotting/GlobalPlotConfig.h"
+#include "threading/ThreadPool.h"
+
+#include "cub/device/device_radix_sort.cuh"
+// #include
+
+// Fix for cooperative_groups.h on windows
+#ifdef __LITTLE_ENDIAN__
+ #undef __LITTLE_ENDIAN__
+ #define __LITTLE_ENDIAN__ 1
+#endif
+#include <cooperative_groups.h>
+using namespace cooperative_groups;
+
+#if _DEBUG
+ #include
+#endif
+
+
+
+
+struct CudaK32Phase2
+{
+ GpuUploadBuffer pairsLIn;
+ GpuUploadBuffer pairsRIn;
+ GpuDownloadBuffer outMarks;
+
+ uint64 pairsLoadOffset;
+ byte* devMarkingTable; // bytefield marking table
+ const uint64* devRMarks[6]; // Right table's marks as a bitfield
+ uint32* devPrunedCount;
+
+ StackAllocator* hostBitFieldAllocator; // Pinned bitfield buffers
+
+ TableId endTable;
+};
+
+struct CudaK32Phase3
+{
+ struct LMap
+ {
+ uint32 sourceIndex; // Initial unsorted (or y-sorted) index
+ uint32 sortedIndex; // Final LinePoint-sorted index
+ };
+ static_assert( sizeof( LMap ) == sizeof( uint64 ) );
+
+ struct RMap
+ {
+ uint32 src;
+ uint32 dstL;
+ uint32 dstR;
+ };
+
+ uint64 pairsLoadOffset;
+
+ uint32* devBucketCounts;
+ uint32* devPrunedEntryCount;
+
+
+ union {
+ RMap* hostRMap;
+ uint32* hostIndices;
+ };
+
+ union {
+ LMap* hostLMap;
+ uint64* hostLinePoints;
+ };
+
+ // #TODO: Remove this when we sort-out all of the buffer usage
+ // uint64* hostMarkingTables[6]; // Set by Phase 2
+
+
+ // uint32* hostBucketCounts;
+
+ uint32 prunedBucketCounts[7][BBCU_BUCKET_COUNT];
+ uint64 prunedTableEntryCounts[7];
+
+
+ // Inlined x table
+ struct {
+ const uint64* devRMarks; // R-Marking table
+ GpuUploadBuffer xIn; // 64-bit Pair
+ GpuDownloadBuffer lpOut; // Output line points (uint64)
+ GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
+
+ } xTable;
+
+ // Step 1
+ struct {
+ uint64* rTableMarks;
+ GpuUploadBuffer pairsLIn;
+ GpuUploadBuffer pairsRIn;
+ GpuDownloadBuffer rMapOut;
+
+ uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
+ } step1;
+
+ // Step 2
+ struct {
+ GpuUploadBuffer rMapIn; // RMap from step 1
+ GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or during L table 1, it is inlined x values
+ GpuDownloadBuffer lpOut; // Output line points (uint64)
+ GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
+ uint32* devLTable[2]; // Unpacked L table bucket
+
+ uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
+ } step2;
+
+ // Step 3
+ struct {
+ GpuUploadBuffer lpIn; // Line points from step 2
+ GpuUploadBuffer indexIn; // Indices from step 2
+ GpuDownloadBuffer mapOut; // lTable for next step 1
+ GpuDownloadBuffer parksOut; // Downloads park buffers to host
+
+ uint32* hostParkOverrunCount;
+
+ size_t sizeTmpSort;
+ byte* devSortTmpData;
+
+ uint64* devLinePoints;
+ uint64* devDeltaLinePoints;
+ uint32* devIndices;
+ FSE_CTable* devCTable;
+ uint32* devParkOverrunCount;
+
+ Fence* parkFence;
+        std::atomic<uint32> parkBucket;
+
+ uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
+
+ } step3;
+};
+
+struct CudaK32AllocContext
+{
+ size_t alignment;
+ bool dryRun;
+
+ IStackAllocator* pinnedAllocator;
+ IStackAllocator* devAllocator;
+ IStackAllocator* hostTableAllocator;
+ IStackAllocator* hostTempAllocator;
+};
+
+// struct CudaK32PlotRequest
+// {
+// const char* plotOutDir;
+// const char* plotFileName;
+
+// const byte* plotId;
+// const char* plotIdStr;
+
+// const byte* plotMemo;
+// uint16 plotMemoSize;
+
+// uint32 plotCount;
+// };
+
+struct CudaK32PlotContext
+{
+ CudaK32PlotConfig cfg = {};
+ const GlobalPlotConfig* gCfg = nullptr;
+
+ int32 cudaDevice = -1;
+ cudaDeviceProp* cudaDevProps = nullptr;
+ bool downloadDirect = false;
+ ThreadPool* threadPool = nullptr;
+
+ TableId table = TableId::Table1; // Current table being generated
+ uint32 bucket = 0; // Current bucket being processed
+
+ uint64 prevTablePairOffset = 0; // Offset at which to write the previous table's sorted pairs
+
+ uint32 bucketCounts[7][BBCU_BUCKET_COUNT] = {};
+ uint32 bucketSlices[2][BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT] = {};
+ uint64 tableEntryCounts[7] = {};
+
+ PlotRequest plotRequest;
+ PlotWriter* plotWriter = nullptr;
+ Fence* plotFence = nullptr;
+
+ // Root allocations
+ size_t allocAlignment = 0;
+ size_t pinnedAllocSize = 0;
+ size_t devAllocSize = 0;
+ size_t hostTableAllocSize = 0;
+ size_t hostTempAllocSize = 0;
+
+ void* pinnedBuffer = nullptr;
+ void* deviceBuffer = nullptr;
+ void* hostBufferTemp = nullptr;
+ void* hostBufferTables = nullptr;
+
+ // Device stuff
+ cudaStream_t computeStream = nullptr;
+ cudaStream_t computeStreamB = nullptr;
+ cudaStream_t computeStreamC = nullptr;
+ cudaStream_t computeStreamD = nullptr;
+ cudaEvent_t computeEventA = nullptr;
+ cudaEvent_t computeEventB = nullptr;
+ cudaEvent_t computeEventC = nullptr;
+ GpuQueue* gpuDownloadStream[BBCU_GPU_STREAM_COUNT] = {};
+ GpuQueue* gpuUploadStream [BBCU_GPU_STREAM_COUNT] = {};
+
+ GpuDownloadBuffer yOut;
+ GpuDownloadBuffer metaOut;
+ GpuUploadBuffer yIn;
+ GpuUploadBuffer metaIn;
+
+
+ GpuDownloadBuffer xPairsOut; // This shares the same backing buffer with pairsLOut & pairsROut
+ GpuDownloadBuffer pairsLOut;
+ GpuDownloadBuffer pairsROut;
+ GpuUploadBuffer xPairsIn; // This shares the same backing buffer with pairsLIn & pairsRIn
+ GpuUploadBuffer pairsLIn;
+ GpuUploadBuffer pairsRIn;
+ GpuDownloadBuffer sortedXPairsOut; // This shares the same backing buffer with sortedPairsLOut & sortedPairsROut
+ GpuDownloadBuffer sortedPairsLOut;
+ GpuDownloadBuffer sortedPairsROut;
+
+
+ size_t devSortTmpAllocSize = 0;
+ void* devSortTmp = nullptr;
+ uint32* devYWork = nullptr;
+ uint32* devMetaWork = nullptr;
+ uint32* devXInlineInput = nullptr;
+ Pair* devMatches = nullptr;
+ union {
+ Pair* devInlinedXs = nullptr;
+ uint32* devCompressedXs;
+ };
+ uint32* devBucketCounts = nullptr;
+ uint32* devSliceCounts = nullptr;
+ uint32* devSortKey = nullptr;
+ uint32* devChaChaInput = nullptr;
+
+ uint32* devGroupBoundaries = nullptr;
+
+ uint32* devMatchCount = nullptr;
+ uint32* devGroupCount = nullptr;
+
+
+ /// Host stuff
+
+ // Host "Temp 2"
+ uint32* hostY = nullptr;
+ uint32* hostMeta = nullptr;
+ uint32* hostBucketCounts = nullptr;
+ uint32* hostBucketSlices = nullptr;
+ uint32* hostTableL = nullptr;
+ uint16* hostTableR = nullptr;
+ uint32* hostTableSortedL = nullptr;
+ uint16* hostTableSortedR = nullptr;
+
+ union {
+ uint32* hostMatchCount = nullptr;
+ uint32* hostGroupCount;
+ };
+
+ // Host "Temp 1"
+ Pairs hostBackPointers [7] = {};
+ uint64* hostMarkingTables[6] = {};
+
+
+ CudaK32Phase2* phase2 = nullptr;
+ CudaK32Phase3* phase3 = nullptr;
+
+ struct
+ {
+ Duration uploadTime = Duration::zero(); // Host-to-device wait time
+ Duration downloadTime = Duration::zero(); // Device-to-host wait time
+ Duration matchTime = Duration::zero();
+ Duration sortTime = Duration::zero();
+ Duration fxTime = Duration::zero();
+
+ } timings;
+};
+
+#if _DEBUG
+ extern ThreadPool* _dbgThreadPool;
+
+ void DbgLoadTablePairs( CudaK32PlotContext& cx, const TableId table, bool copyToPinnedBuffer = false );
+ void DbgWritePairs( CudaK32PlotContext& cx, TableId table );
+ void DbgWriteContext( CudaK32PlotContext& cx );
+ void DbgLoadContextAndPairs( CudaK32PlotContext& cx, bool loadTables = false );
+ void DbgLoadMarks( CudaK32PlotContext& cx );
+ ThreadPool& DbgGetThreadPool( CudaK32PlotContext& cx );
+#endif
+
+void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx );
+//void CudaK32PlotUploadBucket( CudaK32PlotContext& cx );
+
+
+void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_t stream = nullptr, bool synchronize = false );
+
+template<typename T>
+void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const T* devInput, T* devOutput, cudaStream_t stream = nullptr, bool synchronize = false );
+
+void CudaK32InlineXsIntoPairs(
+ const uint32 entryCount,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ const uint32* devXs,
+ cudaStream_t stream );
+
+void CudaK32ApplyPairOffset(
+ const uint32 entryCount,
+ const uint32 offset,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ cudaStream_t stream );
+
+///
+/// Phase 2
+///
+void CudaK32PlotPhase2( CudaK32PlotContext& cx );
+void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+///
+/// Phase 3
+///
+void CudaK32PlotPhase3( CudaK32PlotContext& cx );
+void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+///
+/// Debug
+///
+uint64 CudaPlotK32DbgXtoF1( CudaK32PlotContext& cx, const uint32 x );
+
+
+
+///
+/// Internal
+///
+//-----------------------------------------------------------
+inline uint32 CudaK32PlotGetInputIndex( CudaK32PlotContext& cx )
+{
+ return ((uint32)cx.table-1) & 1;
+}
+
+//-----------------------------------------------------------
+inline uint32 CudaK32PlotGetOutputIndex( CudaK32PlotContext& cx )
+{
+ return (uint32)cx.table & 1;
+}
+
+//-----------------------------------------------------------
+inline bool CudaK32PlotIsOutputInterleaved( CudaK32PlotContext& cx )
+{
+ return CudaK32PlotGetOutputIndex( cx ) == 0;
+}
+
+//-----------------------------------------------------------
+inline size_t GetMarkingTableBitFieldSize()
+{
+ return ((1ull << BBCU_K) / 64) * sizeof(uint64);
+}
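+// For k = 32 this is (2^32 / 64) * 8 bytes = 512 MiB per marking table.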
+
+#define CuCDiv( a, b ) (( (a) + (b) - 1 ) / (b))
+
+//-----------------------------------------------------------
+template<typename T>
+__host__ __device__ __forceinline__ constexpr T CuBBLog2( T x )
+{
+ T r = 0;
+ while( x >>= 1 )
+ r++;
+ return r;
+}
+
+
+
+// Calculates x * (x-1) / 2. Division is done before multiplication.
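+// e.g. x = 5: a = 5 is odd, so b = 4 is halved to 2 and a * b = 10 == 5 * 4 / 2.
+// Halving the even factor before multiplying keeps the intermediate from overflowing 64 bits.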
+//-----------------------------------------------------------
+__device__ __forceinline__ uint64 CudaGetXEnc64( uint64 x )
+{
+ uint64 a = x, b = x - 1;
+
+ if( (a & 1) == 0 )
+ a >>= 1;
+ else
+ b >>= 1;
+
+ return a * b;
+}
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint64 CudaSquareToLinePoint64( uint64 x, uint64 y )
+{
+ return CudaGetXEnc64( max( x, y ) ) + min( x, y );
+}
+
+//-----------------------------------------------------------
+template<typename T>
+__device__ inline void CuGetThreadOffsets( const uint32 id, const uint32 threadCount, const T totalCount, T& count, T& offset, T& end )
+{
+ const T countPerThread = totalCount / (T)threadCount;
+ const T remainder = totalCount - countPerThread * (T)threadCount;
+
+ count = countPerThread;
+ offset = (T)id * countPerThread;
+
+ if( id == threadCount - 1 )
+ count += remainder;
+
+ end = offset + count;
+}
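+// e.g. totalCount = 10, threadCount = 4: threads 0..2 get count = 2 at offsets 0, 2 and 4,
+// while the last thread absorbs the remainder and gets count = 4 (entries 6..9).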
+
+//-----------------------------------------------------------
+__host__ __device__ __forceinline__ bool CuBitFieldGet( const uint64* bitfield, uint64 index )
+{
+ const uint64 fieldIdx = index >> 6; // Divide by 64. Safe to do with power of 2. (shift right == log2(64))
+ const uint64 field = bitfield[fieldIdx];
+
+ const uint32 rShift = (uint32)(index - (fieldIdx << 6)); // Multiply by fieldIdx (shift left == log2(64))
+ return (bool)((field >> rShift) & 1u);
+}
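+// e.g. CuBitFieldGet( bitfield, 130 ) reads bit 2 of bitfield[2] (130 = 2 * 64 + 2).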
+
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint32 atomicAggrInc( uint32* dst )
+{
+ // Increment from coallesced group first
+ coalesced_group g = coalesced_threads();
+
+ uint32 prev;
+ if( g.thread_rank() == 0 )
+ prev = atomicAdd( dst, g.size() );
+
+ prev = g.thread_rank() + g.shfl( prev, 0 );
+ return prev;
+}
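+// Only the first active lane issues the global atomicAdd (of the whole group's size);
+// the base offset is then broadcast so every lane derives a unique, consecutive index.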
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint32 atomicGlobalOffset( uint32* globalCount )
+{
+ __shared__ uint32 sharedCount;
+
+ thread_block block = this_thread_block();
+
+ if( block.thread_rank() == 0 )
+ sharedCount = 0;
+
+ // Store block-wide offset
+ block.sync();
+ const uint32 blockOffset = atomicAggrInc( &sharedCount );
+ block.sync();
+
+ // Store global offset
+ if( block.thread_rank() == 0 )
+ sharedCount = atomicAdd( globalCount, sharedCount );
+
+ block.sync();
+
+ // Broadcast the shared count to each thread
+ const uint32 gOffset = sharedCount + blockOffset;
+ return gOffset;
+}
+
+//-----------------------------------------------------------
+__device__ __forceinline__ uint32 atomicAddShared( uint32* globalCount, const uint32 count )
+{
+ __shared__ uint32 sharedCount;
+
+ thread_block block = this_thread_block();
+
+ if( block.thread_rank() == 0 )
+ sharedCount = 0;
+
+ // Store shared offset
+ block.sync();
+ const uint32 offset = atomicAdd( &sharedCount, count );
+ block.sync();
+
+ // Store global offset
+ if( block.thread_rank() == 0 )
+ sharedCount = atomicAdd( globalCount, sharedCount );
+
+ block.sync();
+
+ return sharedCount + offset;
+}
+
+
+#if _DEBUG
+
+
+#include "b3/blake3.h"
+
+//-----------------------------------------------------------
+inline void DbgPrintHash( const char* msg, const void* ptr, const size_t size )
+{
+ byte hash[32];
+
+ blake3_hasher hasher;
+ blake3_hasher_init( &hasher );
+ blake3_hasher_update( &hasher, ptr, size );
+ blake3_hasher_finalize( &hasher, hash, sizeof( hash ) );
+
+ char hashstr[sizeof(hash)*2+1] = {};
+ size_t _;
+ BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ );
+
+ Log::Line( "%s 0x%s", msg, hashstr );
+}
+
+//-----------------------------------------------------------
+inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t size, cudaStream_t stream )
+{
+ byte hash[32];
+
+ void* hostBuffer = bbvirtallocboundednuma( size );
+ CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
+ CudaErrCheck( cudaStreamSynchronize( stream ) );
+
+ blake3_hasher hasher;
+ blake3_hasher_init( &hasher );
+ blake3_hasher_update( &hasher, hostBuffer, size );
+ blake3_hasher_finalize( &hasher, hash, sizeof( hash ) );
+
+ bbvirtfreebounded( hostBuffer );
+
+ char hashstr[sizeof( hash ) * 2 + 1] = {};
+ size_t _;
+ BytesToHexStr( hash, sizeof( hash ), hashstr, sizeof( hashstr ), _ );
+
+ Log::Line( "%s 0x%s", msg, hashstr );
+}
+
+//-----------------------------------------------------------
+template<typename T>
+inline void DbgPrintDeviceHashT( const char* msg, const T* ptr, const size_t count, cudaStream_t stream )
+{
+ return DbgPrintDeviceHash( msg, ptr, count * sizeof( T ), stream );
+}
+
+//-----------------------------------------------------------
+inline ThreadPool& DbgGetThreadPool( CudaK32PlotContext& cx )
+{
+ if( _dbgThreadPool == nullptr )
+ _dbgThreadPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
+
+ return *_dbgThreadPool;
+}
+
+#endif
\ No newline at end of file
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
new file mode 100644
index 00000000..93099d86
--- /dev/null
+++ b/cuda/CudaPlotPhase2.cu
@@ -0,0 +1,654 @@
+#include "CudaPlotContext.h"
+#include "util/StackAllocator.h"
+
+#if _DEBUG
+ #include "util/BitField.h"
+ #include "threading/MTJob.h"
+ #include "plotdisk/jobs/IOJob.h"
+
+ byte* _dbgRMarks = nullptr;
+
+ static void DbgValidateTable( CudaK32PlotContext& cx, const TableId table );
+ static void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table );
+ static void DebugPruneInCPU( CudaK32PlotContext& cx );
+
+ #ifndef DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+ #endif
+#endif
+
+static void CudaK32PlotAllocateBuffersTest( CudaK32PlotContext& cx );
+
+#define MARK_TABLE_BLOCK_THREADS 128
+#define P2_BUCKET_COUNT BBCU_BUCKET_COUNT
+#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull << BBCU_K) / P2_BUCKET_COUNT)
+
+template<bool useRMarks>
+__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, byte* marks, const uint64* rTableMarks, const uint32 rOffset )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Each thread handles 1 entry
+ if( gid >= entryCount )
+ return;
+
+ if constexpr ( useRMarks )
+ {
+ if( !CuBitFieldGet( rTableMarks, rOffset + gid ) )
+ return;
+ }
+
+ const uint32 l = lPairs[gid];
+ const uint32 r = l + rPairs[gid];
+
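+    // No atomics are needed here: concurrent threads only ever store the value 1 into the
+    // bytefield, so racing writes are benign. The bytefield is collapsed into a bitfield later.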
+ marks[l] = 1;
+ marks[r] = 1;
+}
+
+
+__global__ void CudaBytefieldToBitfield( const byte* bytefield, uint64* bitfield
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ , uint32* gPrunedCount
+#endif
+ )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ CUDA_ASSERT( gid < 67108864 );
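+    // 67,108,864 == (1ull << BBCU_K) / 64: one thread per 64-bit output field covers the full table.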
+
+ // if( gid >= fieldCount )
+ // return;
+
+ // Each thread reads a full 64-bit field, so 64 bytes
+ bytefield += gid * 64ull;
+
+ // Convert 64 bytes to a 64-bit field
+ uint64 bits = (uint64)bytefield[0];
+
+ #pragma unroll
+ for( int32 i = 1; i < 64; i++ )
+ bits |= (((uint64)bytefield[i]) << i);
+
+ CUDA_ASSERT( (uintptr_t)bitfield / 8 * 8 == (uintptr_t)bitfield );
+ bitfield[gid] = bits;
+
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+
+ uint32 markCount = 0;
+
+ #pragma unroll
+ for( uint32 i = 0; i < 64; i++ )
+ {
+ // if( (bits & (1ull << i)) != 0 )
+ // markCount++;
+ if( bytefield[i] == 1 )
+ markCount++;
+ }
+
+ __shared__ uint32 sharedMarkCount;
+ thread_block block = this_thread_block();
+
+ // #TODO: Use warp-aware reduction via CUB
+ block.sync();
+ if( block.thread_rank() == 0 )
+ sharedMarkCount = 0;
+ block.sync();
+
+ atomicAdd( &sharedMarkCount, markCount );
+ block.sync();
+
+ if( block.thread_rank() == 0 )
+ atomicAdd( gPrunedCount, sharedMarkCount );
+#endif
+}
+
+static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield, uint64* bitfield, cudaStream_t stream )
+{
+ const uint64 tableEntryCount = 1ull << BBCU_K;
+ const uint32 fieldCount = (uint32)( tableEntryCount / 64 );
+
+ const uint32 blockThreadCount = 256;
+ const uint32 blockCount = CDivT( fieldCount, blockThreadCount );
+
+ ASSERT( (uint64)blockCount * blockThreadCount * 64 == tableEntryCount );
+
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ #define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
+ CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
+#else
+ #define G_PRUNED_COUNTS
+#endif
+
+ ASSERT_DOES_NOT_OVERLAP2( bitfield, bytefield, GetMarkingTableBitFieldSize(), GetMarkingTableByteSize() );
+
+    CudaBytefieldToBitfield<<<blockCount, blockThreadCount, 0, stream>>>( bytefield, bitfield G_PRUNED_COUNTS );
+}
+
+void LoadPairs( CudaK32PlotContext& cx, CudaK32Phase2& p2, const TableId rTable, const uint32 bucket )
+{
+ const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;//(uint32)std::min( (uint64)BBCU_BUCKET_ENTRY_COUNT, tableEntryCount - p2.pairsLoadOffset );// cx.bucketCounts[(int)rTable][bucket];
+
+ // uint32* hostPairsL = cx.hostTableSortedL + p2.pairsLoadOffset;
+ // uint16* hostPairsR = cx.hostTableSortedR + p2.pairsLoadOffset;
+ uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p2.pairsLoadOffset;
+ uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p2.pairsLoadOffset;
+ // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable-1].left + p2.pairsLoadOffset;
+ // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable-1].right + p2.pairsLoadOffset;
+
+ // if( rTable > p2.endTable )
+ {
+ // Copy the next table to our pinned host pairs
+ // p2.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount );
+ // p2.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount );
+ }
+ // else
+ // {
+ p2.pairsLIn.UploadT( hostPairsL, entryCount );
+ p2.pairsRIn.UploadT( hostPairsR, entryCount );
+ // }
+
+ p2.pairsLoadOffset += entryCount;
+}
+
+void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
+{
+ const TableId lTable = cx.table;
+ const TableId rTable = lTable + 1;
+
+ byte* devLMarks = p2.devMarkingTable;
+
+ // Zero-out marks
+ CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
+
+ // Load first bucket's worth of pairs
+ LoadPairs( cx, p2, rTable, 0 );
+
+ uint32 rOffset = 0;
+ for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ {
+ const bool isLastBucket = bucket + 1 == P2_BUCKET_COUNT;
+
+ // Load next set of pairs in the background
+ if( !isLastBucket )
+ LoadPairs( cx, p2, rTable, bucket + 1 );
+
+ const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint32 entryCount = isLastBucket ? tableEntryCount - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)): BBCU_BUCKET_ENTRY_COUNT;
+ // const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+
+ // Wait for pairs to be ready
+ const uint32* devLPairs = p2.pairsLIn.GetUploadedDeviceBufferT( cx.computeStream );
+ const uint16* devRPairs = p2.pairsRIn.GetUploadedDeviceBufferT( cx.computeStream );
+
+
+ // Mark
+ const uint32 blockCount = (uint32)CDiv( entryCount, MARK_TABLE_BLOCK_THREADS );
+
+ if( rTable == TableId::Table7 )
+            CudaMarkTables<false><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, nullptr, 0 );
+        else
+            CudaMarkTables<true><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rOffset );
+
+ p2.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
+ p2.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ rOffset += entryCount;
+ }
+
+ // Convert the bytefield marking table to a bitfield
+ uint64* bitfield = (uint64*)p2.outMarks.LockDeviceBuffer( cx.computeStream );
+
+ BytefieldToBitfield( cx, devLMarks, bitfield, cx.computeStream );
+
+ // Download bitfield marks
+ // uint64* hostBitField = p2.hostBitFieldAllocator->AllocT( GetMarkingTableBitFieldSize() );
+ uint64* hostBitField = cx.hostMarkingTables[(int)lTable];
+
+ // #TODO: Do download and copy again, for now just store all of them in this pinned buffer
+ // cx.phase3->hostMarkingTables[(int)lTable] = hostBitField;
+ p2.outMarks.Download( hostBitField, GetMarkingTableBitFieldSize(), cx.computeStream );
+
+ // p2.outMarks.DownloadAndCopy( hostBitField, cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize(), cx.computeStream );
+ // p2.outMarks.Download( cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize() );
+
+
+#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
+ {
+ uint32 prunedEntryCount = 0;
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
+ Log::Line( "Table %u now has %u / %llu ( %.2lf%% ) entries.", (uint)lTable+1,
+ prunedEntryCount, lEntryCount, ((double)prunedEntryCount / lEntryCount ) * 100.0 );
+ }
+
+ // Check on CPU
+ if( 0 )
+ {
+ #if _DEBUG
+ p2.outMarks.WaitForCompletion();
+
+ // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) );
+ // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[1]->GetStream() ) );
+ // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[2]->GetStream() ) );
+
+ // byte* hByteField = bbcvirtalloc( GetMarkingTableByteSize() );
+ // uint64* hBitField = bbcvirtalloc( GetMarkingTableBitFieldSize() );
+ // uint64* rBitField = bbcvirtalloc( GetMarkingTableBitFieldSize() );
+ // CudaErrCheck( cudaMemcpyAsync( hByteField, devLMarks, GetMarkingTableByteSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
+ // CudaErrCheck( cudaMemcpyAsync( hBitField, bitfield, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
+
+ // if( rTable < TableId::Table7 )
+ // CudaErrCheck( cudaMemcpyAsync( rBitField, p2.devRMarks, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
+
+ // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ // // (void)p2.outMarks.GetDeviceBuffer();
+ uint64* hBitField = cx.hostMarkingTables[(int)lTable];
+
+        std::atomic<uint64> bitfieldPrunedEntryCount = 0;
+ // std::atomic totalPrunedEntryCount = 0;
+ // std::atomic rTablePrunedEntryCount = 0;
+
+ AnonMTJob::Run( DbgGetThreadPool( cx ), [&]( AnonMTJob* self ){
+
+ const TableId rt = lTable + 1;
+ const uint64 rEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
+
+ uint64 localPrunedEntryCount = 0;
+ uint64 rPrunedEntryCount = 0;
+
+ // BitField rMarks( rBitField, rEntryCount );
+ // const byte* bytefield = hByteField;
+
+ uint64 count, offset, end;
+
+ // // Count r entries again to make sure it's still valid
+ // if( rt < TableId::Table7 )
+ // {
+ // GetThreadOffsets( self, rEntryCount, count, offset, end );
+ // for( uint64 i = offset; i < end; i++ )
+ // {
+ // if( rMarks.Get( i ) )
+ // rPrunedEntryCount ++;
+ // }
+
+ // rTablePrunedEntryCount += rPrunedEntryCount;
+ // }
+
+ GetThreadOffsets( self, lEntryCount, count, offset, end );
+ // for( uint64 i = offset; i < end; i++ )
+ // {
+ // if( bytefield[i] == 1 )
+ // localPrunedEntryCount++;
+ // }
+ // totalPrunedEntryCount += localPrunedEntryCount;
+
+ BitField bits( hBitField, lEntryCount );
+ localPrunedEntryCount = 0;
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( bits.Get( i ) )
+ localPrunedEntryCount++;
+ }
+ bitfieldPrunedEntryCount += localPrunedEntryCount;
+ });
+
+ uint64 prunedEntryCount;
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
+ // prunedEntryCount = totalPrunedEntryCount.load();
+ // Log::Line( "*** BYTEfield pruned entry count: %llu / %llu ( %.2lf %% )",
+ // prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+
+ prunedEntryCount = bitfieldPrunedEntryCount.load();
+ Log::Line( "*** Bitfield pruned entry count: %llu / %llu ( %.2lf %% )",
+ prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+
+ // if( rTable < TableId::Table7 )
+ // {
+ // prunedEntryCount = rTablePrunedEntryCount.load();
+ // const uint64 rEntryCount = cx.tableEntryCounts[(int)rTable];
+ // Log::Line( "*** R pruned entry count: %llu / %llu ( %.2lf %% )",
+ // prunedEntryCount, rEntryCount, prunedEntryCount / (double)rEntryCount * 100.0 );
+
+ // }
+
+ // // Full CPU method
+
+ // bbvirtfree( hByteField );
+ // bbvirtfree( hBitField );
+ // bbvirtfree( rBitField );
+ #endif
+ }
+#endif
+
+ // Set right table marks for the next step
+ p2.devRMarks[(int)lTable] = bitfield;
+}
+
+void CudaK32PlotPhase2( CudaK32PlotContext& cx )
+{
+ CudaK32Phase2& p2 = *cx.phase2;
+ // p2.hostBitFieldAllocator->PopToMarker( 0 );
+
+ const uint32 compressionLevel = cx.gCfg->compressionLevel;
+
+ const TableId startRTable = TableId::Table7;
+ const TableId endRTable = TableId::Table3 + (TableId)cx.gCfg->numDroppedTables;
+
+ p2.endTable = endRTable;
+
+// #if _DEBUG
+// DebugPruneInCPU( cx );
+// #endif
+
+#if BBCU_DBG_SKIP_PHASE_1
+ DbgLoadTablePairs( cx, TableId::Table7, true );
+#endif
+ // CudaK32PlotAllocateBuffersTest( cx );
+
+ for( TableId rTable = startRTable; rTable >= endRTable; rTable-- )
+ {
+ #if BBCU_DBG_SKIP_PHASE_1
+ DbgLoadTablePairs( cx, rTable-1, false );
+ // DbgValidateTable( cx, rTable );
+ #endif
+ const auto timer = TimerBegin();
+
+ cx.table = rTable-1;
+ p2.pairsLoadOffset = 0;
+
+ MarkTable( cx, p2 );
+ p2.outMarks.WaitForCompletion();
+ p2.outMarks.Reset();
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Marked Table %u in %.2lf seconds.", rTable, elapsed );
+
+ #if _DEBUG && DBG_BBCU_P2_WRITE_MARKS
+ p2.outMarks.WaitForCompletion();
+ DbgWriteMarks( cx, rTable-1 );
+ #endif
+ }
+
+ // Wait for everything to complete
+
+ // p2.outMarks.WaitForCopyCompletion(); // #TODO: Re-activate this when re-enabling copy
+ p2.outMarks.WaitForCompletion();
+ p2.outMarks.Reset();
+}
+
+
+///
+/// Allocation
+///
+void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ const size_t alignment = cx.allocAlignment;
+
+ IAllocator& devAllocator = *acx.devAllocator;
+ IAllocator& pinnedAllocator = *acx.pinnedAllocator;
+
+ CudaK32Phase2& p2 = *cx.phase2;
+
+ const size_t markingTableByteSize = GetMarkingTableByteSize();
+ const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
+
+    p2.devPrunedCount  = devAllocator.CAlloc<uint32>( 1, alignment );
+    p2.devMarkingTable = devAllocator.AllocT<byte>( markingTableByteSize, alignment );
+
+ p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint32 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+
+ p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint16 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
+
+ p2.outMarks = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ markingTableBitFieldSize, devAllocator, alignment, acx.dryRun );
+
+ // These buffers are safe to use at this point
+ // p2.hostBitFieldAllocator = new StackAllocator( cx.hostTableR, sizeof( uint32 ) * BBCU_TABLE_ALLOC_ENTRY_COUNT );
+}
+
+
+#if _DEBUG
+
+void DebugPruneInCPU( CudaK32PlotContext& cx )
+{
+ ThreadPool& pool = DbgGetThreadPool( cx );
+ byte* bytefields[2] = {
+        bbvirtalloc<byte>( GetMarkingTableByteSize() ),
+        bbvirtalloc<byte>( GetMarkingTableByteSize() )
+ };
+
+
+ // uint64* bitfield = bbvirtalloc( GetMarkingTableBitFieldSize() );
+ // BitField marks( bitfield, 1ull << BBCU_K );
+ // memset( bitfield, 0, GetMarkingTableBitFieldSize() );
+
+ // uint64 prunedEntryCount = 0;
+ // const uint64 entryCount = cx.tableEntryCounts[6];
+
+
+ // for( uint64 i = 0; i < entryCount; i++ )
+ // {
+ // const uint32 l = rTable.left[i];
+ // const uint32 r = l + rTable.right[i];
+
+ // marks.Set( l );
+ // marks.Set( r );
+ // }
+
+ // for( uint64 i = 0; i < 1ull << BBCU_K; i++ )
+ // {
+ // if( marks.Get( i ) )
+ // prunedEntryCount++;
+ // }
+ // const TableId rTableId = TableId::Table7;
+
+ for( TableId rTableId = TableId::Table7; rTableId >= cx.phase2->endTable; rTableId-- )
+ {
+ const TableId lTableId = rTableId - 1;
+
+ const byte* rTableByteField = bytefields[(int)lTableId % 2];
+ byte* bytefield = bytefields[(int)rTableId % 2];
+
+ memset( bytefield, 0, GetMarkingTableByteSize() );
+
+ // DbgLoadTablePairs( cx, rTableId );
+ // Pairs rTable = cx.hostBackPointers[(int)rTableId];
+
+        std::atomic<uint64> totalPrunedEntryCount = 0;
+
+ AnonMTJob::Run( pool, [&]( AnonMTJob* self ) {
+
+ const uint64 rEntryCount = cx.tableEntryCounts[(int)rTableId];
+ {
+ uint64 count, offset, end;
+ GetThreadOffsets( self, rEntryCount, count, offset, end );
+
+ const TableId rId = rTableId;
+ Pairs rTable = cx.hostBackPointers[(int)rTableId];
+
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( rId < TableId::Table7 && rTableByteField[i] == 0 )
+ continue;
+
+ const uint32 l = rTable.left[i];
+ const uint32 r = l + rTable.right[i];
+
+ bytefield[l] = 1;
+ bytefield[r] = 1;
+ }
+
+ self->SyncThreads();
+
+ uint64 localPrunedEntryCount = 0;
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTableId];
+ GetThreadOffsets( self, lEntryCount, count, offset, end );
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( bytefield[i] == 1 )
+ localPrunedEntryCount++;
+ }
+
+ totalPrunedEntryCount += localPrunedEntryCount;
+ }
+ });
+
+ const uint64 prunedEntryCount = totalPrunedEntryCount.load();
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)lTableId];
+ Log::Line( "Table %u Pruned entry count: %llu / %llu ( %.2lf %% )", (uint)rTableId,
+ prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+ }
+}
+
+void DbgValidateTable( CudaK32PlotContext& cx )
+{
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+    byte* bytefieldL = bbvirtalloc<byte>( GetMarkingTableByteSize() );
+    byte* bytefieldR = bbvirtalloc<byte>( GetMarkingTableByteSize() );
+ memset( bytefieldL, 0, GetMarkingTableByteSize() );
+ memset( bytefieldR, 0, GetMarkingTableByteSize() );
+
+ // uint64* bitfield = bbvirtalloc( GetMarkingTableBitFieldSize() );
+ // BitField marks( bitfield, 1ull << BBCU_K );
+ // memset( bitfield, 0, GetMarkingTableBitFieldSize() );
+
+ // uint64 prunedEntryCount = 0;
+ // const uint64 entryCount = cx.tableEntryCounts[6];
+ // Pairs rTable = cx.hostBackPointers[6];
+
+ // for( uint64 i = 0; i < entryCount; i++ )
+ // {
+ // const uint32 l = rTable.left[i];
+ // const uint32 r = l + rTable.right[i];
+
+ // marks.Set( l );
+ // marks.Set( r );
+ // }
+
+ // for( uint64 i = 0; i < 1ull << BBCU_K; i++ )
+ // {
+ // if( marks.Get( i ) )
+ // prunedEntryCount++;
+ // }
+ Log::Line( "[DEBUG] Validating table" );
+
+ // for( TableId rt = TableId::Table7; rt >= TableId::Table3; rt-- )
+ TableId rt = TableId::Table7;
+ {
+ {
+ uint64 totalCount = 0;
+ for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ totalCount += cx.bucketCounts[(int)rt][bucket];
+
+ ASSERT( totalCount == cx.tableEntryCounts[(int)rt] );
+ }
+
+        std::atomic<uint64> totalPrunedEntryCount = 0;
+
+ memset( bytefieldL, 0, GetMarkingTableByteSize() );
+
+ Pairs hostRTablePairs = cx.hostBackPointers[(int)rt];
+
+ for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
+ {
+ const uint32 rTableBucketEntryCount = cx.bucketCounts[(int)rt][bucket];
+
+ // Mark
+ AnonMTJob::Run( pool, [&]( AnonMTJob* self ){
+
+ // Pairs rTable = cx.hostBackPointers[(int)rt];
+ // const uint64 rEntryCount = cx.tableEntryCounts[(int)rt];
+ const uint64 rBucketEntryCount = rTableBucketEntryCount;
+
+ {
+ uint64 count, offset, end;
+ GetThreadOffsets( self, rBucketEntryCount, count, offset, end );
+
+ Pairs rTable = hostRTablePairs;
+
+ if( offset == 0 )
+ Log::Line( "[%-3u] %u, %u", bucket, rTable.left[offset], (uint32)rTable.right[offset] );
+
+ const bool readR = rt < TableId::Table7;
+
+ const byte* rBytes = bytefieldR;
+ byte* lBytes = bytefieldL;
+
+ for( uint64 i = offset; i < end; i++ )
+ {
+ // if( readR && rBytes[i] == 0 )
+ // continue;
+
+ const uint32 l = rTable.left[i];
+ const uint32 r = l + rTable.right[i];
+
+ lBytes[l] = 1;
+ lBytes[r] = 1;
+ }
+ }
+ });
+
+ hostRTablePairs.left += rTableBucketEntryCount;
+ hostRTablePairs.right += rTableBucketEntryCount;
+ }
+
+ // Count
+ AnonMTJob::Run( pool, [&]( AnonMTJob* self ){
+
+ uint64 localPrunedEntryCount = 0;
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)rt-1];
+ const byte * lBytes = bytefieldL;
+
+ uint64 count, offset, end;
+ GetThreadOffsets( self, lEntryCount, count, offset, end );
+ for( uint64 i = offset; i < end; i++ )
+ {
+ if( lBytes[i] == 1 )
+ localPrunedEntryCount++;
+ }
+
+ totalPrunedEntryCount += localPrunedEntryCount;
+ });
+
+ // if( _dbgRMarks == nullptr )
+ // _dbgRMarks = bb
+ std::swap( bytefieldL, bytefieldR );
+
+ const uint64 prunedEntryCount = totalPrunedEntryCount.load();
+ const uint64 lEntryCount = cx.tableEntryCounts[(int)rt-1];
+ Log::Line( "Table %u pruned entry count: %llu / %llu ( %.2lf %% )", (uint)rt,
+ prunedEntryCount, lEntryCount, prunedEntryCount / (double)lEntryCount * 100.0 );
+ }
+}
+
+void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table )
+{
+ char path[512];
+
+ Log::Line( "[DEBUG] Writing marking table %u to disk...", table+1 );
+ {
+ sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
+
+ const uint64* marks = cx.hostMarkingTables[(int)table];
+
+ int err;
+ FatalIf( !IOJob::WriteToFile( path, marks, GetMarkingTableBitFieldSize(), err ),
+ "Failed to write marking table with error: %d", err );
+ }
+}
+
+#endif
+
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
new file mode 100644
index 00000000..b19d42c3
--- /dev/null
+++ b/cuda/CudaPlotPhase3.cu
@@ -0,0 +1,959 @@
+#include "CudaPlotPhase3Internal.h"
+#include "CudaParkSerializer.h"
+
+
+static void CompressInlinedTable( CudaK32PlotContext& cx );
+static void Step1( CudaK32PlotContext& cx );
+
+void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx );
+void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx );
+void WritePark7( CudaK32PlotContext& cx );
+
+
+static void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+static void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+
+
+#if _DEBUG
+ static void DbgValidateRMap( CudaK32PlotContext& cx );
+ static void DbgValidateIndices( CudaK32PlotContext& cx );
+ void DbgLoadLMap( CudaK32PlotContext& cx );
+ void DbgDumpSortedLinePoints( CudaK32PlotContext& cx );
+#endif
+
+
+//-----------------------------------------------------------
+__global__ void CudaConvertInlinedXsToLinePoints(
+ const uint64 entryCount, const uint32 rOffset, const uint32 bucketShift,
+ const Pair* inXs, const uint64* rMarks,
+ uint64* outLPs, uint32* outIndices, uint32* gBucketCounts )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+ const uint32 rIndex = rOffset + gid;
+
+ __shared__ uint32 sharedBuckets[BBCU_BUCKET_COUNT];
+
+ CUDA_ASSERT( gridDim.x >= BBCU_BUCKET_COUNT );
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = 0;
+
+ __syncthreads();
+
+ uint32 bucket;
+ uint32 offset;
+ uint64 lp;
+ uint32 count = 0;
+
+ const bool isPruned = gid >= entryCount || !CuBitFieldGet( rMarks, rIndex );
+ if( !isPruned )
+ {
+ const Pair p = inXs[gid];
+ CUDA_ASSERT( p.left || p.right );
+
+ lp = CudaSquareToLinePoint64( p.left, p.right );
+ bucket = (uint32)(lp >> bucketShift);
+ offset = atomicAdd( &sharedBuckets[bucket], 1 );
+
+ count = 1;
+ }
+ __syncthreads();
+
+ // Global offset
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = atomicAdd( &gBucketCounts[id], sharedBuckets[id] );
+ __syncthreads();
+
+ if( isPruned )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBuckets[bucket] + offset;
+
+ CUDA_ASSERT( lp );
+ // CUDA_ASSERT( outLPs[dst] == 0 );
+
+ outLPs [dst] = lp;
+ outIndices[dst] = rIndex;
+}
+
+
+//-----------------------------------------------------------
+__global__ void CudaTestPrune(
+ const uint64 entryCount, const uint32 rOffset, const uint64* rTableMarks, uint32* gPrunedEntryCount )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ const uint32 count = ( gid >= entryCount || !CuBitFieldGet( rTableMarks, rOffset + gid ) ) ? 0 : 1;
+
+ atomicAddShared( gPrunedEntryCount, count );
+}
+
+//-----------------------------------------------------------
+__global__ void CudaConvertToLinePoints(
+ const uint64 entryCount, const uint32 rOffset, const uint32 lpBitSize,
+ const uint32* lTable, const uint32* lPairs, const uint16* rPairs,
+ const byte* marks, uint64* outLPs, uint32* gPrunedCount )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if( gid == 0 )
+        *gPrunedCount = 0;  // Reset the global pruned counter value (not the pointer)
+
+ // Filter-out entries that are not marked
+ // if( !CuBitFieldGet( rMarks, rIndex ) )
+ // {
+
+ // }
+
+ // Grab L table values
+ const uint32 l = lPairs[gid];
+ const uint32 r = l + rPairs[gid];
+
+ const uint32 x = lTable[l];
+ const uint32 y = lTable[r];
+
+ // Convert to line point
+ const uint64 lp = CudaSquareToLinePoint64( x, y );
+
+ const uint32 dst = atomicGlobalOffset( gPrunedCount );
+
+ outLPs[dst] = lp;
+}
+
+
+//-----------------------------------------------------------
+template<bool prune>
+__global__ void PruneAndWriteRMap(
+ const uint32 entryCount, const uint64 rOffset,
+ uint32* gBucketCounts, uint32* gPrunedEntryCount, RMap* gRMap,
+ const uint32* lPairs, const uint16* rPairs, const uint64* rMarks )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedBuckets[BBCU_BUCKET_COUNT];
+
+ CUDA_ASSERT( gridDim.x >= BBCU_BUCKET_COUNT );
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = 0;
+
+ __syncthreads();
+
+ // if( gid >= entryCount )
+ // return;
+
+ const uint64 rIndex = rOffset + gid;
+
+ bool isPruned = gid >= entryCount;
+
+ if constexpr ( prune )
+ isPruned = isPruned || !CuBitFieldGet( rMarks, rIndex );
+
+ RMap entry;
+ uint32 bucket, offset;
+
+ if( !isPruned )
+ {
+ entry.dstL = lPairs[gid];
+ entry.dstR = entry.dstL + rPairs[gid];
+        entry.src  = (uint32)rIndex; // Its original index
+
+ bucket = (uint32)(entry.dstL >> (BBCU_K - BBC_BUCKET_BITS));
+
+ // Block-level offset
+ offset = atomicAdd( &sharedBuckets[bucket], 1 );
+ }
+
+ // Global offset
+ __syncthreads();
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = atomicAdd( &gBucketCounts[id], sharedBuckets[id] );
+ __syncthreads();
+
+ if( isPruned )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBuckets[bucket] + offset;
+ gRMap[dst] = entry;
+}
+
+
+/**
+ * #TODO: Optimize Steps 1 & 2 w/ packing.
+ * Phase 3 works in 3 steps per table pair, plus a final park 7 serialization pass:
+ * Step 1:
+ * - Prune table R and for each pair write a mapping
+ * at the back pointer locations, which points to the index of the pair.
+ *
+ * Step 2:
+ * - Load the RMap
+ * - Load the LTable
+ * - Create line points given RMap with LTable values
+ *    - Write line points to their buckets along with the indices from the RMap
+ *
+ * Step 3:
+ * - Load line points and index
+ * - Sort line points w/ index
+ * - Compress line points to park
+ * - Write parks
+ *    - Write the index as a map; this will be the next iteration's L table
+*/
+//-----------------------------------------------------------
+void CudaK32PlotPhase3( CudaK32PlotContext& cx )
+{
+ // Set-up our context
+ memset( cx.phase3->prunedBucketCounts , 0, sizeof( cx.phase3->prunedBucketCounts ) );
+ memset( cx.phase3->prunedTableEntryCounts, 0, sizeof( cx.phase3->prunedTableEntryCounts ) );
+
+ InitFSEBitMask( cx );
+
+#if _DEBUG
+ //#define SKIP_TO_TABLE TableId::Table3
+#endif
+
+#if BBCU_DBG_SKIP_PHASE_2 && !defined( SKIP_TO_TABLE )
+ DbgLoadMarks( cx );
+
+ // if( cx.gCfg->compressionLevel > 0 )
+ {
+ DbgLoadTablePairs( cx, TableId::Table1 + (TableId)cx.gCfg->numDroppedTables + 2, false );
+ }
+#endif
+
+ // Ensure the host buffers are not being used by the plot writer anymore
+ #if !BBCU_DBG_SKIP_PHASE_1
+ {
+ Duration waitTime = Duration::zero();
+ cx.plotFence->Wait( waitTime );
+ cx.plotFence->Reset();
+
+ if( TicksToSeconds( waitTime ) > 0.001 )
+ Log::Line( "Waited %.2lf seconds for C tables to finish writing.", TicksToSeconds( waitTime ) );
+ }
+ #endif
+
+ const uint32 compressionLevel = cx.gCfg->compressionLevel;
+
+ // Special case with the starting table, since it has the values inlined already
+ cx.table = TableId::Table2 + cx.gCfg->numDroppedTables;
+
+ // if( compressionLevel == 0 )
+ {
+ Log::Line( "Compressing Table %u and %u...", cx.table, cx.table+1 );
+
+ auto tableTimer = TimerBegin();
+
+ auto timer = tableTimer;
+ CompressInlinedTable( cx );
+ auto elapsed = TimerEnd( timer );
+        Log::Line( "  Step 1 completed in %.2lf seconds.", elapsed );
+
+ timer = TimerBegin();
+ CudaK32PlotPhase3Step3( cx );
+
+ auto tableElapsed = TimerEnd( tableTimer );
+ elapsed = TimerEnd( timer );
+        Log::Line( "  Step 2 completed in %.2lf seconds.", elapsed );
+
+
+ const uint64 baseEntryCount = cx.tableEntryCounts[(int)cx.table];
+ const uint64 prunedEntryCount = cx.phase3->prunedTableEntryCounts[(int)cx.table];
+ Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).",
+ cx.table, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 );
+ }
+ // else if( compressionLevel > 0 )
+ // {
+ // const TableId startLTable = TableId::Table1 + (TableId)cx.gCfg->numDroppedTables;
+ // cx.phase3->prunedTableEntryCounts[(int)startLTable] = cx.tableEntryCounts[(int)startLTable];
+ // if( cx.gCfg->numDroppedTables > 1 )
+ // cx.table = TableId::Table3;
+ // }
+
+#ifdef SKIP_TO_TABLE
+ cx.table = SKIP_TO_TABLE;
+ DbgLoadLMap( cx );
+#endif
+
+ auto& p3 = *cx.phase3;
+ const TableId startRTable = cx.table + 1;
+
+ for( TableId rTable = startRTable; rTable <= TableId::Table7; rTable++ )
+ {
+ Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 );
+
+ cx.table = rTable;
+
+ #if BBCU_DBG_SKIP_PHASE_2
+ if( rTable < TableId::Table7 )
+ DbgLoadTablePairs( cx, rTable+1, false );
+ #endif
+
+ auto tableTimer = TimerBegin();
+
+ // Step 1
+ auto timer = tableTimer;
+ Step1( cx );
+ double elapsed = TimerEnd( timer );
+        Log::Line( "  Step 1 completed in %.2lf seconds.", elapsed );
+
+ // Step 2
+ timer = TimerBegin();
+ CudaK32PlotPhase3Step2( cx );
+ elapsed = TimerEnd( timer );
+        Log::Line( "  Step 2 completed in %.2lf seconds.", elapsed );
+
+ // Step 3
+ timer = TimerBegin();
+ CudaK32PlotPhase3Step3( cx );
+ elapsed = TimerEnd( timer );
+        Log::Line( "  Step 3 completed in %.2lf seconds.", elapsed );
+
+ auto tableElapsed = TimerEnd( tableTimer );
+
+ const uint64 baseEntryCount = cx.tableEntryCounts[(int)rTable];
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable];
+ Log::Line( "Completed table %u in %.2lf seconds with %llu / %llu entries ( %.2lf%% ).",
+ rTable, tableElapsed, prunedEntryCount, baseEntryCount, (prunedEntryCount / (double)baseEntryCount) * 100.0 );
+ }
+
+ // Park 7
+ {
+ Log::Line( "Serializing P7 entries" );
+
+ const auto timer = TimerBegin();
+ WritePark7( cx );
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Completed serializing P7 entries in %.2lf seconds.", elapsed );
+ }
+}
+
+//-----------------------------------------------------------
+void Step1( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void
+ {
+ const TableId rTable = cx.table;
+ auto& p3 = *cx.phase3;
+ auto& s1 = p3.step1;
+
+ const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;
+
+ // uint32* hostPairsL = cx.hostTableSortedL + p3.pairsLoadOffset;
+ // uint16* hostPairsR = cx.hostTableSortedR + p3.pairsLoadOffset;
+ uint32* hostPairsL = cx.hostBackPointers[(int)rTable].left + p3.pairsLoadOffset;
+ uint16* hostPairsR = cx.hostBackPointers[(int)rTable].right + p3.pairsLoadOffset;
+
+ // if( rTable < TableId::Table7 )
+ // {
+ // const uint32* nextHostPairsL = cx.hostBackPointers[(int)rTable + 1].left + p3.pairsLoadOffset;
+ // const uint16* nextHostPairsR = cx.hostBackPointers[(int)rTable + 1].right + p3.pairsLoadOffset;
+
+ // s1.pairsLIn.UploadAndPreLoadT( hostPairsL, entryCount, nextHostPairsL, entryCount );
+ // s1.pairsRIn.UploadAndPreLoadT( hostPairsR, entryCount, nextHostPairsR, entryCount );
+ // }
+ // else
+ {
+ s1.pairsLIn.UploadT( hostPairsL, entryCount );
+ s1.pairsRIn.UploadT( hostPairsR, entryCount );
+ }
+
+ p3.pairsLoadOffset += entryCount;
+ };
+
+ auto& p2 = *cx.phase2;
+ auto& p3 = *cx.phase3;
+ auto& s1 = p3.step1;
+
+ const TableId rTable = cx.table;
+
+ // Clear pruned table count
+ CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
+
+ // Load marking table (must be loaded before first bucket, on the same stream)
+ if( cx.table < TableId::Table7 )
+ {
+ CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
+ GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) );
+ }
+
+ // Load initial bucket
+ p3.pairsLoadOffset = 0;
+ LoadBucket( cx, 0 );
+
+
+ ///
+ /// Process buckets
+ ///
+ const uint32 threadPerBlock = 256;
+ const uint32 blocksPerGrid = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)threadPerBlock );
+
+ uint64 rTableOffset = 0;
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket + 1 );
+
+ // Wait for R table pairs to be ready
+ const uint32* devLPairs = (uint32*)s1.pairsLIn.GetUploadedDeviceBuffer( cx.computeStream );
+ const uint16* devRPairs = (uint16*)s1.pairsRIn.GetUploadedDeviceBuffer( cx.computeStream );
+
+ const uint32 entryCount = bucket == BBCU_BUCKET_COUNT-1 ?
+ ( cx.tableEntryCounts[(int)rTable] - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)) ) : // Get only the remaining entries for the last bucket
+ BBCU_BUCKET_ENTRY_COUNT; // Otherwise, use a whole bucket's worth.
+
+ auto* devRMap = (RMap*)s1.rMapOut.LockDeviceBuffer( cx.computeStream );
+
+ uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;
+
+ // Generate map
+ #define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks
+
+ CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ if( cx.table < TableId::Table7 )
+ PruneAndWriteRMap<true><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
+ else
+ PruneAndWriteRMap<false><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
+
+ #undef KERN_RMAP_ARGS
+ s1.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
+ s1.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );
+ rTableOffset += entryCount;
+
+ // Download data (Vertical download (write 1 column))
+ s1.rMapOut.Download2DT( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
+ P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream );
+ }
+
+ // Download slice counts
+ cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();
+
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ // Wait for completion
+ s1.rMapOut.WaitForCompletion();
+ s1.rMapOut.Reset();
+
+ s1.pairsLIn.Reset();
+ s1.pairsRIn.Reset();
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+
+ // Add-up pruned bucket counts and tables counts
+ memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+ {
+ uint32* hostSliceCounts = cx.hostBucketSlices;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ p3.prunedBucketCounts[(int)rTable][bucket] += s1.prunedBucketSlices[slice][bucket];
+
+ // hostSliceCounts += BBCU_BUCKET_COUNT;
+ }
+
+ p3.prunedTableEntryCounts[(int)rTable] = 0;
+
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
+ }
+}
+
+//-----------------------------------------------------------
+// Table 2 (or 3, 4, etc., depending on compression level) already has
+// the x values inlined into the pairs, so we can skip step 1 and go
+// directly to converting to line points, then sorting them into their target buckets.
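+// (The inlined pairs are loaded as Pair entries holding the two x values, so only the
+// R marking table is needed to prune them before converting to line points.)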
+//-----------------------------------------------------------
+void CompressInlinedTable( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& tx = p3.xTable;
+
+ if( bucket == 0 )
+ p3.pairsLoadOffset = 0;
+
+ // Load inlined x's
+ const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+
+ const Pair* inlinedXs = ((Pair*)cx.hostBackPointers[(int)rTable].left) + p3.pairsLoadOffset;
+
+ tx.xIn.UploadT( inlinedXs, entryCount, cx.computeStream );
+
+ p3.pairsLoadOffset += entryCount;
+ };
+
+ const TableId rTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
+ auto& p3 = *cx.phase3;
+ auto& tx = p3.xTable;
+ auto& s2 = p3.step2;
+
+ #if BBCU_DBG_SKIP_PHASE_2
+ DbgLoadTablePairs( cx, rTable );
+ #endif
+
+ // Load R Marking table (must be loaded before first bucket, on the same stream)
+ CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
+ GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) );
+
+ // Load initial bucket
+ LoadBucket( cx, 0 );
+
+ const bool isCompressed = cx.gCfg->compressionLevel > 0;
+ const uint32 compressedLPBits = isCompressed ? GetCompressedLPBitCount( cx.gCfg->compressionLevel ) : 0;
+
+ const uint32 lpBits = isCompressed ? compressedLPBits : BBCU_K * 2 - 1;
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+
+ uint64 tablePrunedEntryCount = 0;
+ uint32 rTableOffset = 0;
+
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket + 1 );
+
+ // Wait for pairs to be ready
+ const Pair* devXs = (Pair*)tx.xIn.GetUploadedDeviceBuffer( cx.computeStream );
+
+ uint64* outLps = (uint64*)tx.lpOut .LockDeviceBuffer( cx.computeStream );
+ uint32* outIndices = (uint32*)tx.indexOut.LockDeviceBuffer( cx.computeStream );
+
+ const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
+
+ const uint32 threadPerBlock = 256;
+ const uint32 blocksPerGrid = CDiv( entryCount, (int)threadPerBlock );
+
+ uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;
+
+ #if _DEBUG
+ CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
+ #endif
+
+ CudaConvertInlinedXsToLinePoints<<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>(
+ entryCount, rTableOffset, lpBucketShift,
+ devXs, tx.devRMarks, outLps, outIndices, devSliceCounts );
+
+ tx.xIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ // Download output
+ // Horizontal download (write 1 row)
+ tx.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
+ tx.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX * 3, P3_PRUNED_SLICE_MAX, cx.computeStream );
+
+ rTableOffset += entryCount;
+ }
+
+ cudaStream_t downloadStream = tx.lpOut.GetQueue()->GetStream();
+
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ tx.lpOut .WaitForCompletion();
+ tx.indexOut.WaitForCompletion();
+ tx.lpOut .Reset();
+ tx.indexOut.Reset();
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+
+ #if _DEBUG
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ {
+ ASSERT( p3.prunedBucketCounts[(int)rTable][i] <= P3_PRUNED_BUCKET_MAX );
+ }
+ #endif
+
+ // Add-up pruned bucket counts and tables counts
+ {
+ bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ p3.prunedBucketCounts[(int)rTable][bucket] += s2.prunedBucketSlices[slice][bucket];
+ }
+
+ p3.prunedTableEntryCounts[(int)rTable] = 0;
+
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
+ }
+
+#if _DEBUG
+ // DbgValidateIndices( cx );
+ // DbgValidateStep2Output( cx );
+ // DbgDumpSortedLinePoints( cx );
+#endif
+}
+
+
+///
+/// Allocation
+///
+//-----------------------------------------------------------
+void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& p3 = *cx.phase3;
+
+ // Shared allocations
+ p3.devBucketCounts = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_COUNT, acx.alignment );
+ p3.devPrunedEntryCount = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
+
+ // Host allocations
+ p3.hostRMap = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for rMap and index
+ p3.hostLinePoints = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT ); // Used for lMap and LPs
+
+ if( !acx.dryRun )
+ {
+ ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
+ ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL );
+ }
+ // p3.hostBucketCounts = acx.pinnedAllocator->CAlloc( BBCU_BUCKET_COUNT, acx.alignment );
+
+ if( acx.dryRun )
+ {
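+ // Dry-run: measure each step's peak device/pinned usage with dummy allocators,
+ // then reserve the maximum once. The steps run one after another, so they can
+ // all share the same region (see the marker pops in the non-dry-run branch).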
+ CudaK32AllocContext dacx = acx;
+
+ DummyAllocator devAlloc = {};
+ DummyAllocator pinnedAlloc = {};
+
+ dacx.devAllocator = &devAlloc;
+ dacx.pinnedAllocator = &pinnedAlloc;
+
+ AllocXTableStep( cx, dacx );
+
+ size_t sharedDevSize = devAlloc.Size();
+ size_t sharedPinnedSize = pinnedAlloc.Size();
+
+ devAlloc = {};
+ pinnedAlloc = {};
+ CudaK32PlotAllocateBuffersStep1( cx, dacx );
+
+ sharedDevSize = std::max( sharedDevSize , devAlloc.Size() );
+ sharedPinnedSize = std::max( sharedPinnedSize, pinnedAlloc.Size() );
+ devAlloc = {};
+ pinnedAlloc = {};
+ CudaK32PlotAllocateBuffersStep2( cx, dacx );
+
+ sharedDevSize = std::max( sharedDevSize , devAlloc.Size() );
+ sharedPinnedSize = std::max( sharedPinnedSize, pinnedAlloc.Size() );
+ devAlloc = {};
+ pinnedAlloc = {};
+ CudaK32PlotAllocateBuffersStep3( cx, dacx );
+
+ sharedDevSize = std::max( sharedDevSize , devAlloc.Size() );
+ sharedPinnedSize = std::max( sharedPinnedSize, pinnedAlloc.Size() );
+
+ acx.devAllocator ->Alloc( sharedDevSize , acx.alignment );
+ acx.pinnedAllocator->Alloc( sharedPinnedSize, acx.alignment );
+ }
+ else
+ {
+ StackAllocator* devAllocator = (StackAllocator*)acx.devAllocator;
+ StackAllocator* pinnedAllocator = (StackAllocator*)acx.pinnedAllocator;
+
+ const size_t devMarker = devAllocator ->Size();
+ const size_t pinMarker = pinnedAllocator->Size();
+
+ AllocXTableStep( cx, acx );
+ devAllocator ->PopToMarker( devMarker );
+ pinnedAllocator->PopToMarker( pinMarker );
+
+ CudaK32PlotAllocateBuffersStep1( cx, acx );
+ devAllocator ->PopToMarker( devMarker );
+ pinnedAllocator->PopToMarker( pinMarker );
+
+ CudaK32PlotAllocateBuffersStep2( cx, acx );
+ devAllocator ->PopToMarker( devMarker );
+ pinnedAllocator->PopToMarker( pinMarker );
+
+ CudaK32PlotAllocateBuffersStep3( cx, acx );
+ }
+}
+
+//-----------------------------------------------------------
+void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& tx = cx.phase3->xTable;
+
+ tx.devRMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
+ tx.xIn = cx.gpuUploadStream[0]->CreateUploadBuffer(sizeof(Pair) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, acx.alignment, acx.dryRun);
+ tx.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
+ tx.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, acx.alignment, acx.dryRun );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& s1 = cx.phase3->step1;
+ const size_t alignment = acx.alignment;
+
+ s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s1.rMapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s1.rTableMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& s2 = cx.phase3->step2;
+ const size_t alignment = acx.alignment;
+
+ s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s2.lpOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s2.indexOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+ s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ auto& s3 = cx.phase3->step3;
+ const size_t alignment = acx.alignment;
+
+ s3.hostParkOverrunCount = acx.pinnedAllocator->CAlloc<uint32>( 1 );
+
+ const size_t devParkAllocSize = DEV_MAX_PARK_SIZE * P3_PRUNED_MAX_PARKS_PER_BUCKET;
+
+ s3.lpIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s3.indexIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
+ sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ s3.mapOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
+ sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBuffer(devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun);
+
+ if( acx.dryRun )
+ {
+ s3.sizeTmpSort = 0;
+ cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, s3.sizeTmpSort, nullptr, nullptr, nullptr, nullptr, BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ }
+
+ s3.devSortTmpData = acx.devAllocator->AllocT<byte>( s3.sizeTmpSort, alignment );
+
+
+ // Allocate 1 more park's worth of line points so we can have space to retain the line points
+ // that did not make it into a park for the next bucket.
+ const size_t linePointAllocCount = P3_PRUNED_MAX_PARKS_PER_BUCKET * (size_t)kEntriesPerPark;
+ static_assert( linePointAllocCount > BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+
+ s3.devLinePoints = acx.devAllocator->CAlloc<uint64>( linePointAllocCount, alignment );
+ s3.devDeltaLinePoints = acx.devAllocator->CAlloc<uint64>( linePointAllocCount, alignment );
+ s3.devIndices = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+
+ // s3.devParks = acx.devAllocator->AllocT( parkAllocSize, alignment );
+ // s3.hostParks = acx.devAllocator->AllocT ( maxParkSize , alignment );
+
+ s3.devCTable = acx.devAllocator->AllocT<FSE_CTable>( P3_MAX_CTABLE_SIZE, alignment );
+ s3.devParkOverrunCount = acx.devAllocator->CAlloc<uint32>( 1 );
+}
+
+
+
+#if _DEBUG
+
+//-----------------------------------------------------------
+__global__ static void DbgCudaValidateRMap( const uint64 entryCount, const uint32 lTableOffset, const RMap* rmap )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ if( gid >= entryCount )
+ return;
+
+
+ const RMap map = rmap[gid];
+
+ const uint32 left = map.dstL - lTableOffset;
+ const uint32 right = map.dstR - lTableOffset;
+
+ // if( left >= BBCU_BUCKET_ALLOC_ENTRY_COUNT )
+ if( left >= right || left >= BBCU_BUCKET_ALLOC_ENTRY_COUNT || right >= BBCU_BUCKET_ALLOC_ENTRY_COUNT )
+ {
+ printf( "gid: %u | left: %u | right: %u | loffset: %u\n"
+ " dstL: %u | dstR: %u | src: %u\n",
+ gid, left, right, lTableOffset, map.dstL, map.dstR, map.src );
+ CUDA_ASSERT( false );
+ }
+
+ CUDA_ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( left < right );
+}
+
+//-----------------------------------------------------------
+void DbgValidateRMap( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Validating RMap..." );
+
+ auto& p3 = *cx.phase3;
+ auto& s1 = p3.step1;
+
+ {
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ RMap* rMap = bbcvirtallocbounded<RMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const RMap* reader = p3.hostRMap + bucket * P3_PRUNED_BUCKET_MAX;
+ RMap* writer = rMap;
+
+ uint32 entryCount = 0;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s1.prunedBucketSlices[slice][bucket];
+ bbmemcpy_t( writer, reader, copyCount );
+
+ writer += copyCount;
+ entryCount += copyCount;
+
+ reader += P3_PRUNED_SLICE_MAX;
+ }
+
+ // Validate bucket
+ const uint32 bucketOffset = bucket * BBCU_BUCKET_ENTRY_COUNT;
+ for( uint32 i = 0; i < entryCount; i++ )
+ {
+ const RMap map = rMap[i];
+ ASSERT( map.dstL || map.dstR );
+ ASSERT( map.dstR - map.dstL < 0x10000u );
+ ASSERT( map.dstL >> ( 32 - BBC_BUCKET_BITS ) == bucket );
+
+ const uint32 left = map.dstL - bucketOffset;
+ const uint32 right = map.dstR - bucketOffset;
+ ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( left < right );
+
+ }
+ }
+
+ bbvirtfreebounded( rMap );
+ Log::Line( "[DEBUG] CPU OK" );
+ }
+
+ // Validate in CUDA
+ {
+ uint64 pairsLoadOffset = 0;
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ uint64 entryCount = 0;
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s1.prunedBucketSlices[slice][bucket];
+ entryCount += copyCount;
+ }
+
+ const RMap* rmap = p3.hostRMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+ const uint32* rSliceCounts = &p3.step1.prunedBucketSlices[0][bucket];
+
+ p3.step2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
+
+ const uint32 rEntryCount = p3.prunedBucketCounts[(int)cx.table][bucket];
+ RMap* devRMap = p3.step2.rMapIn.GetUploadedDeviceBufferT<RMap>( cx.computeStream );
+
+ ASSERT( entryCount == rEntryCount );
+
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( rEntryCount, threads );
+
+ const uint32 lTableOffset = bucket * BBCU_BUCKET_ENTRY_COUNT;
+
+ DbgCudaValidateRMap<<<blocks, threads, 0, cx.computeStream>>>( rEntryCount, lTableOffset, devRMap );
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+
+ p3.step2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
+ }
+ Log::Line( "[DEBUG] CUDA OK" );
+
+ p3.step2.lMapIn.Reset();
+ }
+}
+
+//-----------------------------------------------------------
+void DbgValidateIndices( CudaK32PlotContext& cx )
+{
+ // Ensure all origin output indices are not repeated and well distributed
+ Log::Line( "[DEBUG] Validating indices..." );
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ uint32* indices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* idxTmp = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* idxWriter = indices;
+
+ const uint32* reader = p3.hostIndices;
+ const size_t readerStride = P3_PRUNED_SLICE_MAX * 3;
+
+ uint64 entryCount = 0;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ const uint32 copyCount = s2.prunedBucketSlices[bucket][slice];
+
+ bbmemcpy_t( idxWriter, reader, copyCount );
+
+ idxWriter += copyCount;
+ entryCount += copyCount;
+ reader += readerStride;
+ }
+ }
+
+ ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
+
+ RadixSort256::Sort( pool, indices, idxTmp, entryCount );
+
+ // Indices must not repeat:
+ for( uint64 i = 1; i < entryCount; i++ )
+ {
+ ASSERT( indices[i] > indices[i-1] );
+ }
+
+ bbvirtfreebounded( indices );
+ bbvirtfreebounded( idxTmp );
+
+ Log::Line( "[DEBUG] OK" );
+}
+
+#endif
+
diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h
new file mode 100644
index 00000000..1a4bd7a8
--- /dev/null
+++ b/cuda/CudaPlotPhase3Internal.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "CudaPlotContext.h"
+#include "plotting/CTables.h"
+#include "ChiaConsts.h"
+
+#if _DEBUG
+ #include "util/BitField.h"
+ #include "plotmem/LPGen.h"
+ #include "plotdisk/jobs/IOJob.h"
+ #include "algorithm/RadixSort.h"
+ #include "plotmem/ParkWriter.h"
+
+ void DbgValidateStep2Output( CudaK32PlotContext& cx );
+#endif
+
+using LMap = CudaK32Phase3::LMap;
+using RMap = CudaK32Phase3::RMap;
+
+static_assert( alignof( LMap ) == sizeof( uint32 ) );
+
+// #TODO: Remove this. It is unneeded.
+#define P3_PRUNED_BUCKET_MULTIPLIER 0.98 // Enough to hold the largest pruned bucket size
+
+#define P3_PRUNED_SLICE_MAX BBCU_MAX_SLICE_ENTRY_COUNT //(CuCDiv( (size_t)((BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT/BBCU_BUCKET_COUNT)*P3_PRUNED_BUCKET_MULTIPLIER), 4096 ) * 4096 + 4096)
+#define P3_PRUNED_BUCKET_MAX BBCU_BUCKET_ALLOC_ENTRY_COUNT //(P3_PRUNED_SLICE_MAX*BBCU_BUCKET_COUNT)
+#define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT)
+#define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2)
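+// Note: the pruned slice/bucket/table sizes above currently alias the full (unpruned)
+// allocation sizes; the 0.98 multiplier is kept only for reference (see the #TODO above).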
+
+static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
+
+//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1;
+//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT;
+//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS;
+
+// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT ));
+//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
+ //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
+// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT;
+
+//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT;
+
+// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
+// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
+// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT;
+
+static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 );
+static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs
+
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
new file mode 100644
index 00000000..ac13e915
--- /dev/null
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -0,0 +1,693 @@
+#include "CudaPlotPhase3Internal.h"
+#include "CudaParkSerializer.h"
+#include "plotting/TableWriter.h"
+#include "algorithm/RadixSort.h"
+#include "plotdisk/jobs/IOJob.h"
+
+#define P3_CalculateMaxLPValue( x ) ((((uint64)(x))/2)*((uint64)(x))+x)
+#define P3_CalculateTableDivisor( p ) (P3_CalculateMaxLPValue( (uint64)(BBCU_TABLE_ENTRY_COUNT*(p)) ) / BBCU_BUCKET_COUNT)
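+// P3_CalculateMaxLPValue( x ) is roughly x*x/2 + x: an upper bound on the line point
+// produced from two entry indices below x. Dividing it by BBCU_BUCKET_COUNT yields a
+// divisor that spreads line points evenly across buckets (see BucketDivisor below).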
+
+__constant__ uint64 BucketDivisor;
+
+static void CudaK32PlotPhase3Step2Compressed( CudaK32PlotContext& cx );
+
+//-----------------------------------------------------------
+__global__ static void CudaUnpackLMap( const uint32 entryCount, const LMap* devLMap, uint32* devLTable
+#if _DEBUG
+ , const uint32 bucket
+#endif
+)
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ const uint32 bucketMask = (1u << (BBCU_K - BBC_BUCKET_BITS)) - 1;
+ const LMap map = devLMap[gid];
+
+ const uint32 dst = map.sourceIndex & bucketMask;
+
+ CUDA_ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
+
+ devLTable[dst] = map.sortedIndex;
+}
+
+//-----------------------------------------------------------
+static void UnpackLMap( CudaK32PlotContext& cx, const uint32 entryCount, const LMap* devLMap, uint32* devLTable,
+ const uint32 bucket, cudaStream_t stream )
+{
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( entryCount, threads );
+
+ CudaUnpackLMap<<<blocks, threads, 0, stream>>>( entryCount, devLMap, devLTable
+#if _DEBUG
+ , bucket
+#endif
+ );
+}
+
+
+//-----------------------------------------------------------
+template<bool isCompressed>
+__global__ static void CudaConvertRMapToLinePoints(
+ const uint64 entryCount, const uint32 rOffset, const uint32 lTableOffset,
+ const uint32* lTable, const RMap* rmap, uint64* outLPs, uint32* outIndices, uint32* gBucketCounts, const uint32 lpShift = 0 )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedBuckets[BBCU_BUCKET_COUNT];
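+ // Each block first tallies its entries per destination bucket in shared memory, then
+ // one atomicAdd per bucket reserves a contiguous range in the global slice counts;
+ // a thread's final slot is its block's base offset plus its local offset.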
+
+ CUDA_ASSERT( gridDim.x >= BBCU_BUCKET_COUNT );
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBuckets[id] = 0;
+
+ __syncthreads();
+
+ uint32 bucket;
+ uint32 offset;
+ uint32 rIndex;
+ uint64 lp;
+
+ if( gid < entryCount )
+ {
+ const RMap map = rmap[gid];
+
+ const uint32 left = map.dstL - lTableOffset;
+ const uint32 right = map.dstR - lTableOffset;
+
+ CUDA_ASSERT( left < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( right < BBCU_BUCKET_ALLOC_ENTRY_COUNT );
+ CUDA_ASSERT( left < right );
+
+ rIndex = map.src;
+
+ const uint32 x = lTable[left ];
+ const uint32 y = lTable[right];
+
+ lp = CudaSquareToLinePoint64( x, y );
+
+ if constexpr( !isCompressed )
+ {
+ CUDA_ASSERT( x || y );
+ CUDA_ASSERT( lp );
+ bucket = (uint32)( lp / BucketDivisor );
+ }
+ else
+ bucket = (uint32)( lp >> lpShift );
+
+ CUDA_ASSERT( bucket < BBCU_BUCKET_COUNT );
+
+ offset = atomicAdd( &sharedBuckets[bucket], 1 );
+ }
+ __syncthreads();
+
+ // Global offset
+ if( id < BBCU_BUCKET_COUNT )
+ {
+ sharedBuckets[id] = atomicAdd( &gBucketCounts[id], sharedBuckets[id] );
+ CUDA_ASSERT( sharedBuckets[id] <= P3_PRUNED_SLICE_MAX );
+ }
+ __syncthreads();
+
+ if( gid >= entryCount )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBuckets[bucket] + offset;
+ CUDA_ASSERT( dst < P3_PRUNED_BUCKET_MAX );
+
+ outLPs [dst] = lp;
+ outIndices[dst] = rIndex;
+}
+
+//-----------------------------------------------------------
+static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryCount, const uint32 rOffset,
+ const uint32* lTable, const RMap* rMap, uint64* outLPs, uint32* outIndices, cudaStream_t stream )
+{
+ const TableId rTable = cx.table;
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( entryCount, threads );
+
+ const uint32 lTableOffset = cx.bucket * BBCU_BUCKET_ENTRY_COUNT;
+
+ uint32* devSliceCounts = cx.devSliceCounts + cx.bucket * BBCU_BUCKET_COUNT;
+ #define Rmap2LPParams entryCount, rOffset, lTableOffset, lTable, rMap, outLPs, outIndices, devSliceCounts
+
+ const bool isCompressed = rTable - 1 <= (TableId)cx.gCfg->numDroppedTables;
+
+ if( !isCompressed )
+ {
+ if( cx.bucket == 0 )
+ {
+ // Calculate the divisor needed to generate a uniform distribution across buckets
+ // and set it as a constant for our kernel.
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable - 1];
+ const uint64 divisor = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT;
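+ // Any line point from this table then maps to bucket lp / BucketDivisor,
+ // which the kernel asserts is < BBCU_BUCKET_COUNT.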
+
+ // #TODO: Use upload stream?
+ CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
+ }
+
+ CudaConvertRMapToLinePoints<false><<<blocks, threads, 0, stream>>>( Rmap2LPParams, 0 );
+ }
+ else
+ {
+ const uint32 xBits = cx.gCfg->compressedEntryBits;
+ const uint32 lpBits = (xBits * 2 - 1) * 2 - 1;
+ const uint32 lpBitShift = lpBits - BBC_BUCKET_BITS;
+
+ CudaConvertRMapToLinePoints<true><<<blocks, threads, 0, stream>>>( Rmap2LPParams, lpBitShift );
+ }
+
+ #undef Rmap2LPParams
+}
+
+/**
+ * Load RMap and L table and generate line points from RMap and L table.
+ * Write line points to their buckets, along with their origin index.
+*/
+//-----------------------------------------------------------
+void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
+{
+ auto LoadLBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ const bool isCompressed = (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
+
+ if( !isCompressed )
+ {
+ ASSERT( p3.prunedBucketCounts[(int)cx.table-1][cx.bucket] > 0 );
+
+ // Load lMap
+ // Horizontal load
+ const LMap* lmap = p3.hostLMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+
+ const uint32* lSliceCounts = &p3.step3.prunedBucketSlices[0][bucket];
+
+ s2.lMapIn.UploadArrayT( lmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, lSliceCounts );
+ }
+ else
+ {
+ ASSERT( cx.gCfg->compressionLevel > 0 );
+
+ if( bucket == 0 )
+ p3.pairsLoadOffset = 0;
+
+ // Load the compressed entries from the table pairs
+ const uint32* lEntries = (cx.hostBackPointers[(int)cx.table-1].left) + p3.pairsLoadOffset;
+ // const uint32* lEntries = cx.hostTableL + p3.pairsLoadOffset; // Our compressed x's are copied to the LMap buffer before we get to this point
+
+ // #TODO: Do a preload here instead and have each bucket start at the max bucket offset
+ // const uint32 bucketEntryCount = cx.bucketCounts[(int)cx.table-1][bucket];
+
+ s2.lMapIn.UploadT( lEntries, BBCU_BUCKET_ENTRY_COUNT );
+ p3.pairsLoadOffset += BBCU_BUCKET_ENTRY_COUNT;
+ }
+ };
+
+ auto UnpackLBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ const bool isCompressed = (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
+
+ const auto* lMap = (LMap*)s2.lMapIn.GetUploadedDeviceBuffer( cx.computeStream );
+ uint32* lTable = s2.devLTable[bucket & 1];
+
+ if( isCompressed )
+ {
+ // Copy from upload buffer to working buffer
+ CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
+ }
+ else
+ {
+ // Unpack next LMap and copy to the end of the current map
+ const uint32 lEntryCount = p3.prunedBucketCounts[(int)cx.table-1][bucket];
+ ASSERT( lEntryCount > 0 );
+
+ UnpackLMap( cx, lEntryCount, lMap, lTable, bucket, cx.computeStream );
+ }
+ };
+
+ auto LoadRBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ // Load rMap
+ // Horizontal load
+ const RMap* rmap = p3.hostRMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+
+ const uint32* rSliceCounts = &p3.step1.prunedBucketSlices[0][bucket];
+
+ s2.rMapIn.UploadArrayT( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
+ };
+
+
+ const TableId rTable = cx.table;
+ const TableId lTable = rTable-1;
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+
+ // We always keep one L bucket loaded ahead: R entries near the end of the current
+ // bucket can reference L entries at the start of the next bucket, so the next
+ // bucket's initial entries are appended to the current bucket's L table.
+ LoadLBucket( cx, 0 );
+ LoadLBucket( cx, 1 );
+ LoadRBucket( cx, 0 );
+
+ // Clear pruned entry count
+ CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
+
+ // Unpack the first map beforehand
+ UnpackLBucket( cx, 0 );
+
+
+ ///
+ /// Process buckets
+ ///
+ uint32 rTableOffset = 0; // Track the global origin index of R entry/line point
+
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+ const uint32 nextBucket = bucket + 1;
+ const uint32 nextBucketL = bucket + 2;
+
+ const uint32* devLTable = s2.devLTable[bucket & 1];
+
+ // Preload next buckets
+ if( nextBucket < BBCU_BUCKET_COUNT )
+ {
+ LoadRBucket( cx, nextBucket );
+
+ UnpackLBucket( cx, nextBucket );
+ s2.lMapIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ // Copy start of next bucket to the end of the current one
+ const uint32 copyCount = BBCU_BUCKET_COUNT * BBCU_XTRA_ENTRIES_PER_SLICE;
+ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount );
+
+ uint32* nextLTable = s2.devLTable[nextBucket & 1];
+ CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
+ }
+
+ if( nextBucketL < BBCU_BUCKET_COUNT )
+ LoadLBucket( cx, nextBucketL );
+
+
+ // Generate line points given the unpacked LMap as input and the RMap
+ const auto* rMap = (RMap*)s2.rMapIn.GetUploadedDeviceBuffer( cx.computeStream );
+ const uint32 rEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];
+
+
+ uint64* devOutLPs = (uint64*)s2.lpOut .LockDeviceBuffer( cx.computeStream );
+ uint32* devOutIndices = (uint32*)s2.indexOut.LockDeviceBuffer( cx.computeStream );
+
+ ConvertRMapToLinePoints( cx, rEntryCount, rTableOffset, devLTable, rMap, devOutLPs, devOutIndices, cx.computeStream );
+ s2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
+ rTableOffset += rEntryCount;
+
+
+ // Horizontal download (write 1 row)
+ s2.lpOut .Download2DT( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
+ s2.indexOut.Download2DT( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX*3, P3_PRUNED_SLICE_MAX, cx.computeStream );
+ }
+
+ #if _DEBUG
+ {
+ size_t tableLength = 0;
+ uint32 activeBucketCount = 0;
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ {
+ ASSERT( p3.prunedBucketCounts[(int)rTable][i] <= P3_PRUNED_BUCKET_MAX );
+ tableLength += p3.prunedBucketCounts[(int)rTable][i];
+
+ if( p3.prunedBucketCounts[(int)rTable][i] ) activeBucketCount++;
+ }
+
+ ASSERT( tableLength <= BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ ASSERT( tableLength == p3.prunedTableEntryCounts[(int)rTable] );
+ }
+ #endif
+
+ s2.lpOut.WaitForCompletion();
+ s2.lpOut.Reset();
+
+ s2.indexOut.WaitForCompletion();
+ s2.indexOut.Reset();
+
+ s2.lMapIn.Reset();
+ s2.rMapIn.Reset();
+
+ // Copy slice counts & bucket count
+ cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream();
+
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+ bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ ASSERT( s2.prunedBucketSlices[slice][bucket] <= P3_PRUNED_SLICE_MAX );
+ p3.prunedBucketCounts[(int)rTable][bucket] += s2.prunedBucketSlices[slice][bucket];
+ }
+ // //ASSERT( p3.hostBucketCounts[i] );
+ ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
+ }
+
+ // #if _DEBUG
+ // if( cx.table > TableId::Table3 )
+ // {
+ // DbgValidateStep2Output( cx );
+ // }
+ // #endif
+}
+
+//-----------------------------------------------------------
+void WritePark7( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ ASSERT( p3.prunedBucketCounts[(int)TableId::Table7][cx.bucket] > 0 );
+
+ // Load lMap
+ // Horizontal load
+ const LMap* lmap = p3.hostLMap + (size_t)bucket * P3_PRUNED_BUCKET_MAX;
+
+ const uint32* lSliceCounts = &p3.step3.prunedBucketSlices[0][bucket];
+
+ s2.lMapIn.UploadArrayT( lmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, lSliceCounts );
+ };
+
+ ASSERT( cx.table == TableId::Table7 );
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+
+ // Load initial bucket
+ LoadBucket( cx, 0 );
+
+ // Begin park 7 table in plot
+ cx.plotWriter->BeginTable( PlotTable::Table7 );
+
+ constexpr size_t parkSize = CalculatePark7Size( BBCU_K );
+ constexpr size_t parkFieldCount = parkSize / sizeof( uint64 );
+ static_assert( parkFieldCount * sizeof( uint64 ) == parkSize );
+
+
+ GpuDownloadBuffer& parkDownloader = s2.lpOut;
+
+ constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+ static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );
+
+
+ // Host stuff
+ constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
+ StackAllocator hostAllocator( p3.hostRMap, hostMetaTableSize );
+
+ const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table];
+ const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark );
+
+ byte* hostParks = hostAllocator.AllocT<byte>( totalParkCount * parkSize );
+ byte* hostParkWriter = hostParks;
+ uint32* hostLastParkEntries = hostAllocator.CAlloc<uint32>( kEntriesPerPark );
+
+ static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 );
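+ // The index buffer starts one park's worth into the L table buffer so that entries
+ // retained from the previous bucket can be prepended in front of it (see below).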
+ uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark;
+ uint32 retainedEntryCount = 0;
+
+ // Begin serialization
+ cudaStream_t downloadStream = parkDownloader.GetQueue()->GetStream();
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket+1 );
+
+ const uint32 bucketEntryCount = p3.prunedBucketCounts[(int)TableId::Table7][bucket];
+
+ // Unmap bucket
+ auto* lMap = (LMap*)s2.lMapIn.GetUploadedDeviceBuffer( cx.computeStream );
+ UnpackLMap( cx, bucketEntryCount, lMap, devIndexBuffer, bucket, cx.computeStream );
+ s2.lMapIn.ReleaseDeviceBuffer( cx.computeStream );
+
+ // Serialize indices into a park
+ uint32* indices = devIndexBuffer - retainedEntryCount;
+ uint32 indexCount = bucketEntryCount + retainedEntryCount;
+
+ const uint32 parkCount = indexCount / kEntriesPerPark;
+
+ uint64* devParkFields = (uint64*)parkDownloader.LockDeviceBuffer( cx.computeStream );
+ SerializePark7InGPU( parkCount, indices, devParkFields, parkFieldCount, cx.computeStream );
+
+
+ // Retain any entries that did not fit into a park
+ retainedEntryCount = indexCount - (parkCount * kEntriesPerPark);
+ if( retainedEntryCount > 0 )
+ {
+ const bool isLastBucket = bucket + 1 == BBCU_BUCKET_COUNT;
+
+ const uint32 serializedEntryCount = parkCount * kEntriesPerPark;
+ const uint32* copySource = indices + serializedEntryCount;
+ const size_t copySize = sizeof( uint32 ) * retainedEntryCount;
+
+ if( !isLastBucket )
+ CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
+ else
+ CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
+ }
+
+ // Download parks & write to plot
+ const size_t downloadSize = parkCount * parkSize;
+
+ parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize,
+ []( void* parksBuffer, size_t size, void* userData ) {
+
+ auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+ cx.plotWriter->WriteTableData( parksBuffer, size );
+ }, &cx, cx.computeStream );
+
+ hostParkWriter += downloadSize;
+ }
+
+ // Wait for parks to complete downloading
+ parkDownloader.WaitForCompletion();
+ parkDownloader.Reset();
+
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+
+ // Was there a left-over park?
+ if( retainedEntryCount > 0 )
+ {
+ // Submit last park to plot
+ TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
+ cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+ }
+ cx.plotWriter->EndTable();
+
+ // Cleanup
+ s2.lMapIn.Reset();
+}
+
+
+#if _DEBUG
+
+//-----------------------------------------------------------
+static void _DbgValidateOutput( CudaK32PlotContext& cx );
+void DbgValidateStep2Output( CudaK32PlotContext& cx )
+{
+ // New stack (prevent overflow)
+ auto* thread = new Thread();
+ thread->Run( []( void* p ) {
+ _DbgValidateOutput( *(CudaK32PlotContext*)p );
+ }, &cx );
+
+ thread->WaitForExit();
+ delete thread;
+}
+
+//-----------------------------------------------------------
+void _DbgValidateOutput( CudaK32PlotContext& cx )
+{
+ const TableId rTable = cx.table;
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+ // Validate line points...
+ uint64* refLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ uint32* indices = bbcvirtallocboundednuma<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+
+ uint64* writer = refLinePoints;
+ uint32* idxWriter = indices;
+
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable];
+
+ const uint32 lpBits = 63; // #TODO: Change when compressing here
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ uint64* reader = p3.hostLinePoints + bucket * P3_PRUNED_SLICE_MAX;
+ uint32* idxReader = p3.hostIndices + bucket * P3_PRUNED_SLICE_MAX*3;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice ++ )
+ {
+ const size_t count = s2.prunedBucketSlices[slice][bucket];
+ bbmemcpy_t( writer , reader , count );
+ bbmemcpy_t( idxWriter, idxReader, count );
+
+ // The line points must be in their given buckets if inlined x's
+ if( cx.table-1 == TableId::Table1 )
+ {
+ for( size_t i = 0; i < count; i++ )
+ {
+ const uint64 lp = writer[i];
+ const uint32 b = lp >> lpBucketShift;
+ ASSERT( b == bucket );
+ }
+ }
+
+ writer += count;
+ idxWriter += count;
+ reader += P3_PRUNED_BUCKET_MAX;
+ idxReader += P3_PRUNED_BUCKET_MAX*3;
+ }
+ }
+
+ const uint64 readEntries = (uint64)( (uintptr_t)writer - (uintptr_t)refLinePoints ) / sizeof( uint64 );
+ ASSERT( readEntries == prunedEntryCount );
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+ RadixSort256::Sort( pool, refLinePoints, tmpLinePoints, prunedEntryCount );
+ RadixSort256::Sort( pool, indices, (uint32*)tmpLinePoints, prunedEntryCount );
+
+ for( uint32 i = 1; i < (uint32)prunedEntryCount; i++ )
+ {
+ ASSERT( indices[i] >= indices[i-1] );
+ }
+
+ for( uint64 i = 1; i < prunedEntryCount; i++ )
+ {
+ ASSERT( refLinePoints[i] >= refLinePoints[i-1] );
+ }
+
+ // Delta test
+ // #TODO: Get correct stub bit size depending on compression
+ const uint32 stubBitSize = (BBCU_K - kStubMinusBits);
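+ // After stripping the stub bits, the remaining high-order delta should be small;
+ // the assert below checks it stays under 256, as the park delta encoding expects.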
+ for( uint32 i = 0; i < (uint32)prunedEntryCount; i+=kEntriesPerPark )
+ {
+ const uint32 parkCount = std::min( prunedEntryCount - i, (uint64)kEntriesPerPark );
+
+ const uint64* park = refLinePoints + i;
+
+ uint64 prevLp = park[0];
+
+ for( uint32 j = 1; j < parkCount; j++ )
+ {
+ uint64 lp = park[j];
+ uint64 delta = lp - prevLp;
+ uint64 smallDelta = delta >> stubBitSize;
+ ASSERT( smallDelta < 256 );
+
+ prevLp = lp;
+ }
+ }
+
+ bbvirtfreebounded( refLinePoints );
+ bbvirtfreebounded( tmpLinePoints );
+ bbvirtfreebounded( indices );
+}
+
+#endif
+
+//-----------------------------------------------------------
+void DbgDumpSortedLinePoints( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Prpaparing line ponts for writing to file." );
+ const TableId rTable = cx.table;
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+
+
+ uint64* sortedLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+ uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
+
+ uint64* writer = sortedLinePoints;
+
+ const uint64 prunedEntryCount = p3.prunedTableEntryCounts[(int)rTable];
+
+ const uint32 lpBits = 63; // #TODO: Change when compressing here
+ const uint32 lpBucketShift = lpBits - BBC_BUCKET_BITS;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ uint64* reader = p3.hostLinePoints + bucket * P3_PRUNED_SLICE_MAX;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice ++ )
+ {
+ const size_t count = s2.prunedBucketSlices[slice][bucket];
+ bbmemcpy_t( writer, reader, count );
+
+ writer += count;
+ reader += P3_PRUNED_BUCKET_MAX;
+ }
+ }
+
+ // Sort
+ ThreadPool& pool = *cx.threadPool; //DbgGetThreadPool( cx );
+ RadixSort256::Sort( pool, sortedLinePoints, tmpLinePoints, prunedEntryCount );
+
+ // Write to disk
+ {
+ char filePath[1024] = {};
+ sprintf( filePath, "%s/lp.c%u.ref", "/home/harold/plot/ref/compressed-lps", (uint32)cx.gCfg->compressionLevel );
+
+ FileStream file;
+ if( file.Open( filePath, FileMode::Open, FileAccess::Read ) )
+ {
+ Log::Line( "[DEBUG]File %s already exists. Cannot overwrite.", filePath );
+ }
+ else
+ {
+ Log::Line( "[DEBUG] Writing line points to %s", filePath );
+ file.Close();
+ file.Open( filePath, FileMode::Create, FileAccess::Write );
+
+ void* block = bbvirtalloc( file.BlockSize() );
+ int err;
+ if( !IOJob::WriteToFile( file, sortedLinePoints, prunedEntryCount * sizeof( uint64 ), block, file.BlockSize(), err ) )
+ Log::Line( "Failed to to file %s with error %d.", filePath, err );
+
+ bbvirtfree( block );
+
+ Log::Line( "[DEBUG] Wrote %llu line points", prunedEntryCount );
+ }
+
+ file.Close();
+ }
+
+ bbvirtfreebounded( sortedLinePoints );
+ bbvirtfreebounded( tmpLinePoints );
+}
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
new file mode 100644
index 00000000..3949bd8c
--- /dev/null
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -0,0 +1,573 @@
+#include "CudaPlotPhase3Internal.h"
+#include "CudaParkSerializer.h"
+#include "plotmem/ParkWriter.h"
+
+static void GenerateLMap( CudaK32PlotContext& cx, const uint32 entryCount, const uint32 finalOffset, const uint32* indices, cudaStream_t stream );
+static void DeltafyLinePoints( CudaK32PlotContext& cx, const uint32 entryCount, const uint64* linePoints, uint64* deltaLinePoints, cudaStream_t stream );
+
+#if _DEBUG
+ #include "plotdisk/jobs/IOJob.h"
+ static void DbgSaveLMap( CudaK32PlotContext& cx );
+ static void DbgValidateLMapData( CudaK32PlotContext& cx );
+ static void DbgValidateLMap( CudaK32PlotContext& cx );
+#endif
+
+//-----------------------------------------------------------
+void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
+{
+ auto LoadBucket = []( CudaK32PlotContext& cx, const uint32 bucket ) -> void {
+
+ auto& p3 = *cx.phase3;
+ auto& s2 = p3.step2;
+ auto& s3 = p3.step3;
+
+ // if( bucket == 0 )
+ // p3.pairsLoadOffset = 0;
+
+ // Load line points and their source indices
+ const TableId rTable = cx.table;
+ const uint32 entryCount = p3.prunedBucketCounts[(int)rTable][bucket];
+ ASSERT( entryCount <= P3_PRUNED_BUCKET_MAX );
+
+ if( entryCount < 1 )
+ return;
+
+ // Vertical input layout of data: Start at row 0, column according to the current bucket
+ const uint64* linePoints = p3.hostLinePoints + (size_t)bucket * P3_PRUNED_SLICE_MAX;
+ const uint32* indices = p3.hostIndices + (size_t)bucket * P3_PRUNED_SLICE_MAX * 3; // This buffer is shared with RMap ((uint32)*3) (which we're about to write to),
+ // which is why we multiply by 3
+
+ const uint32* counts = &s2.prunedBucketSlices[0][bucket];
+
+ // Load 1 column
+ s3.lpIn .UploadArrayT( linePoints, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX , BBCU_BUCKET_COUNT, counts );
+ s3.indexIn.UploadArrayT( indices , BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX*3, BBCU_BUCKET_COUNT, counts );
+ };
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ const TableId rTable = cx.table;
+ const TableId lTable = cx.table-1;
+
+ // Load CTable
+ const bool isCompressed = cx.gCfg->compressionLevel > 0 && lTable <= (TableId)cx.gCfg->numDroppedTables;
+ const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.subtSizeBits;
+ const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
+
+ const size_t cTableSize = !isCompressed ? sizeof( CTable_0 ) : cx.gCfg->cTableSize; ASSERT( cTableSize <= P3_MAX_CTABLE_SIZE );
+ const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;
+
+ // (upload must be loaded before first bucket, on the same stream)
+ CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
+ s3.lpIn.GetQueue()->GetStream() ) );
+
+ // Load initial bucket
+ LoadBucket( cx, 0 );
+
+ // Begin plot table
+ cx.plotWriter->BeginTable( (PlotTable)lTable );
+
+
+ uint32 mapOffset = 0;
+ uint32 retainedLPCount = 0; // Line points retained for the next bucket to write to park
+
+ const size_t hostParkSize = isCompressed ? cx.gCfg->compressionInfo.tableParkSize : CalculateParkSize( lTable );
+ ASSERT( DEV_MAX_PARK_SIZE >= hostParkSize );
+
+ // #TODO: Move this allocation to the beginning
+ if( s3.parkFence == nullptr )
+ s3.parkFence = new Fence();
+
+ byte* hostParksWriter = (byte*)cx.hostBackPointers[(int)rTable].left; //(byte*)cx.hostTableL;
+ uint64* hostRetainedEntries = nullptr;
+
+ // if( !isCompressed && lTable == TableId::Table1 )
+ // hostParksWriter = (byte*)cx.hostBackPointers[(int)TableId::Table2].left;
+
+ ///
+ /// Process buckets
+ ///
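+ // Sorted output starts one park's worth into devLinePoints so that line points
+ // retained from the previous bucket can be prepended in front of it.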
+ uint64* sortedLinePoints = s3.devLinePoints + kEntriesPerPark;
+ uint32* sortedIndices = s3.devIndices;
+
+ cudaStream_t sortAndMapStream = cx.computeStream;
+ cudaStream_t lpStream = cx.computeStream;//B;
+ cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream();
+
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) );
+ CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) );
+
+ // Set initial event LP stream event as set.
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
+
+ s3.parkFence->Reset( 0 );
+ s3.parkBucket = 0;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ const uint32 bucketEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];
+
+ if( bucketEntryCount == 0 )
+ break;
+
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ LoadBucket( cx, bucket + 1 );
+
+ // Wait for upload to finish
+ uint64* unsortedLinePoints = (uint64*)s3.lpIn .GetUploadedDeviceBuffer( sortAndMapStream );
+ uint32* unsortedIndices = (uint32*)s3.indexIn.GetUploadedDeviceBuffer( sortAndMapStream );
+
+ // Sort line points
+ #if _DEBUG
+ {
+ size_t sortRequiredSize = 0;
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) );
+ ASSERT( s3.sizeTmpSort >= sortRequiredSize );
+ }
+ #endif
+
+ // Wait for the previous bucket's LP work to finish, so we can re-use the device buffer
+ CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) );
+
+ // #TODO: We can use 63-7 (log2(128 buckets)), which might be faster
+ // #NOTE: I did change it and the sort failed. Investigate.
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs(
+ s3.devSortTmpData, s3.sizeTmpSort,
+ unsortedLinePoints, sortedLinePoints,
+ unsortedIndices, sortedIndices,
+ bucketEntryCount, 0, 64, sortAndMapStream ) );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) );
+
+ s3.lpIn .ReleaseDeviceBuffer( sortAndMapStream ); unsortedLinePoints = nullptr;
+ s3.indexIn.ReleaseDeviceBuffer( sortAndMapStream ); unsortedIndices = nullptr;
+
+ ///
+ /// Map
+ ///
+ // Generate map and download to it to host
+ GenerateLMap( cx, bucketEntryCount, mapOffset, sortedIndices, sortAndMapStream );
+ mapOffset += bucketEntryCount;
+
+ // Vertical download map (write 1 column)
+ s3.mapOut.Download2DT( p3.hostLMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
+ P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, sortAndMapStream );
+
+
+ ///
+ /// Line points
+ ///
+ // If we have retained entries, let's account for them in this bucket
+ uint64* parkLinePoints = sortedLinePoints - retainedLPCount;
+
+ const uint32 totalEntryCount = bucketEntryCount + retainedLPCount;
+ const uint32 parkCount = totalEntryCount / kEntriesPerPark;
+ const uint32 entryCount = parkCount * kEntriesPerPark;
+ ASSERT( parkCount <= P3_PRUNED_MAX_PARKS_PER_BUCKET );
+
+ // Wait for sort to finish
+ CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) );
+
+ // Deltafy line points
+ DeltafyLinePoints( cx, entryCount, parkLinePoints, s3.devDeltaLinePoints, lpStream );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park
+
+ // Compress line point parks
+ byte* devParks = (byte*)s3.parksOut.LockDeviceBuffer( lpStream );
+ CompressToParkInGPU( parkCount, hostParkSize, s3.devDeltaLinePoints, devParks, DEV_MAX_PARK_SIZE, stubBitSize, s3.devCTable, s3.devParkOverrunCount, lpStream );
+
+ // Retain any entries that did not make it into parks for the next bucket to process
+ retainedLPCount = totalEntryCount - (parkCount * kEntriesPerPark);
+ if( retainedLPCount > 0 )
+ {
+ // Last bucket?
+ const bool isLastBucket = bucket + 1 == BBCU_BUCKET_COUNT;
+
+ const uint64* copySource = parkLinePoints + entryCount;
+ const size_t copySize = sizeof( uint64 ) * retainedLPCount;
+
+ if( !isLastBucket )
+ {
+ // Not the last bucket, so retain entries for the next GPU compression bucket
+ CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) );
+ }
+ else
+ {
+ // No more buckets so we have to compress this last park on the CPU
+ CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
+
+ hostRetainedEntries = (uint64*)( hostParksWriter + hostParkSize * parkCount );
+ CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
+ }
+ }
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again
+
+
+ // Download parks
+ s3.parksOut.Download2DWithCallback( hostParksWriter, hostParkSize, parkCount, hostParkSize, DEV_MAX_PARK_SIZE,
+ []( void* parksBuffer, size_t size, void* userData ) {
+
+ auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+ auto& s3 = cx.phase3->step3;
+
+ cx.plotWriter->WriteTableData( parksBuffer, size );
+ cx.plotWriter->SignalFence( *s3.parkFence, ++s3.parkBucket );
+
+ }, &cx, lpStream, cx.downloadDirect );
+
+ hostParksWriter += hostParkSize * parkCount;
+ }
+
+ // Copy park overrun count
+ CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) );
+
+ // Wait for parks to complete downloading
+ s3.parksOut.WaitForCompletion();
+ s3.parksOut.Reset();
+
+ // Copy map slice counts (for the next step 2)
+ CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+
+ CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+ memcpy( &s3.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+
+ FatalIf( *s3.hostParkOverrunCount > 0, "Park buffer overrun." );
+
+ // Was there a left-over park?
+ if( retainedLPCount > 0 )
+ {
+ ASSERT( hostRetainedEntries );
+
+ uint64 lastParkEntries[kEntriesPerPark];
+ bbmemcpy_t( lastParkEntries, hostRetainedEntries, retainedLPCount );
+
+ WritePark( hostParkSize, retainedLPCount, lastParkEntries, hostParksWriter, stubBitSize, hostCTable );
+ cx.plotWriter->WriteTableData( hostParksWriter, hostParkSize );
+ }
+ cx.plotWriter->EndTable();
+
+ // Update buckets counts for L table
+ // #TODO: These should match Step 1 pruned entry count I believe, so just copy?
+
+ memset( p3.prunedBucketCounts[(int)rTable], 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT );
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ for( uint32 j = 0; j < BBCU_BUCKET_COUNT; j++ )
+ p3.prunedBucketCounts[(int)rTable][i] += s3.prunedBucketSlices[j][i];
+
+ s3.mapOut.WaitForCompletion();
+ s3.mapOut.Reset();
+
+ s3.lpIn .Reset();
+ s3.indexIn.Reset();
+
+
+ // #if _DEBUG
+ // //if( cx.table >= TableId::Table6 )
+ // //{
+ // DbgValidateLMap( cx );
+ // DbgValidateLMapData( cx );
+ // // DbgSaveLMap( cx );
+ // //}
+ // #endif
+}
+
+
+//-----------------------------------------------------------
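+// Scatter sorted source indices into LMap entries, bucketed by the destination bucket encoded
+// in the top BBC_BUCKET_BITS bits of each index. Each block first builds a shared-memory
+// histogram, then reserves space in the global per-bucket counters with one atomicAdd per
+// bucket before writing its entries out.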
+__global__ void CudaGenerateLMap( const uint32 entryCount, const uint32 finalOffset, const uint32* indices, LMap* gMap, uint32* gBucketCounts )
+{
+ const uint32 id = threadIdx.x;
+ const uint32 gid = blockIdx.x * blockDim.x + id;
+
+ __shared__ uint32 sharedBucketCounts[BBCU_BUCKET_COUNT];
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBucketCounts[id] = 0;
+
+ __syncthreads();
+
+ uint32 index;
+ uint32 bucket;
+ uint32 offset;
+
+ if( gid < entryCount )
+ {
+ index = indices[gid];
+
+ bucket = ( index >> (32 - BBC_BUCKET_BITS) );
+ offset = atomicAdd( &sharedBucketCounts[bucket], 1 );
+ }
+
+ __syncthreads();
+
+ // Global offset
+ if( id < BBCU_BUCKET_COUNT )
+ sharedBucketCounts[id] = atomicAdd( &gBucketCounts[id], sharedBucketCounts[id] );
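+ // atomicAdd returns the bucket's previous global count, which becomes this block's base offset within that bucket's slice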
+
+ __syncthreads();
+
+ if( gid >= entryCount )
+ return;
+
+ const uint32 dst = bucket * P3_PRUNED_SLICE_MAX + sharedBucketCounts[bucket] + offset;
+
+ //CUDA_ASSERT( index != 0 );
+
+ LMap map;
+ map.sortedIndex = finalOffset + gid;
+ map.sourceIndex = index;
+#if _DEBUG
+ CUDA_ASSERT( map.sortedIndex != 0 || map.sourceIndex != 0 );
+#endif
+ gMap[dst] = map;
+}
+
+//-----------------------------------------------------------
+void GenerateLMap( CudaK32PlotContext& cx, const uint32 entryCount, const uint32 finalOffset, const uint32* indices, cudaStream_t stream )
+{
+ const uint32 threads = 256;
+ const uint32 blocks = CDiv( entryCount, threads );
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ auto* devMap = (LMap*)s3.mapOut.LockDeviceBuffer( stream );
+ uint32* devSliceCounts = cx.devSliceCounts + cx.bucket * BBCU_BUCKET_COUNT;
+
+ CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, stream ) );
+
+ CudaGenerateLMap<<<blocks, threads, 0, stream>>>( entryCount, finalOffset, indices, devMap, devSliceCounts );
+}
+
+//-----------------------------------------------------------
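+// Convert line points into park deltas: the first entry of every park keeps its absolute
+// value, every other entry stores the delta to its predecessor.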
+__global__ void CudaDeltafyLinePoints( const uint32 entryCount, const uint64* linePoints, uint64* deltaLinePoints )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ const bool isFirstParkEntry = ( gid & ( kEntriesPerPark - 1 ) ) == 0;
+
+ if( isFirstParkEntry )
+ {
+ deltaLinePoints[gid] = linePoints[gid];
+ }
+ else
+ {
+ //CUDA_ASSERT( linePoints[gid] && linePoints[gid - 1] );
+ CUDA_ASSERT( linePoints[gid] >= linePoints[gid - 1] );
+ deltaLinePoints[gid] = linePoints[gid] - linePoints[gid - 1];
+ }
+}
+
+//-----------------------------------------------------------
+void DeltafyLinePoints( CudaK32PlotContext& cx, const uint32 entryCount, const uint64* linePoints, uint64* deltaLinePoints, cudaStream_t stream )
+{
+ ASSERT( entryCount / kEntriesPerPark * kEntriesPerPark == entryCount );
+
+ const uint32 threadsPerBlock = 256;
+ const uint32 blockCount = CDivT( entryCount, threadsPerBlock );
+ CudaDeltafyLinePoints<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, linePoints, deltaLinePoints );
+}
+
+
+
+#if _DEBUG
+
+//-----------------------------------------------------------
+void DbgSaveLMap( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Saving table %u LMap", (uint)cx.table+1 );
+ auto& p3 = *cx.phase3;
+
+ char path[512];
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
+
+ const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
+ int err;
+ FatalIf( !IOJob::WriteToFile( path, p3.hostLMap, writeSize, err ),
+ "[DEBUG] Failed to write LMap with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.slices.tmp", (uint)cx.table+1 );
+ FatalIf( !IOJob::WriteToFileUnaligned( path, p3.step3.prunedBucketSlices, sizeof( p3.step3.prunedBucketSlices ), err ),
+ "[DEBUG] Failed to write LMap slices with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
+ FatalIf( !IOJob::WriteToFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
+ "[DEBUG] Failed to write LMap buckets with error: %d", err );
+
+ Log::Line( " [DEBUG] OK" );
+}
+
+//-----------------------------------------------------------
+void DbgLoadLMap( CudaK32PlotContext& cx )
+{
+ auto& p3 = *cx.phase3;
+
+ char path[512];
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );
+
+ const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
+ int err;
+ FatalIf( !IOJob::ReadFromFile( path, p3.hostLMap, writeSize, err ),
+ "[DEBUG] Failed to read LMap with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.slices.tmp", (uint)cx.table+1 );
+ FatalIf( !IOJob::ReadFromFileUnaligned( path, p3.step3.prunedBucketSlices, sizeof( p3.step3.prunedBucketSlices ), err ),
+ "[DEBUG] Failed to read LMap slices with error: %d", err );
+
+ sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
+
+ FatalIf( !IOJob::ReadFromFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
+ "[DEBUG] Failed to read LMap buckets with error: %d", err );
+
+ //DbgValidateLMapData( cx );
+}
+
+//-----------------------------------------------------------
+void DbgValidateLMap( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Validating LMap..." );
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_TABLE_ENTRY_COUNT );
+
+
+ {
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
+
+ uint64 entryCount = 0;
+ LMap* writer = lMap;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ // Read counts vertically, but read data horizontally
+ const uint32 copyCount = s3.prunedBucketSlices[slice][bucket];
+
+ bbmemcpy_t( writer, reader, copyCount );
+
+ writer += copyCount;
+ entryCount += copyCount;
+ reader += P3_PRUNED_SLICE_MAX;
+ }
+
+ // All source entries should belong to the same bucket
+ ASSERT( entryCount == p3.prunedBucketCounts[(int)cx.table][bucket] );
+
+ for( uint64 i = 0; i < entryCount; i++ )
+ {
+ const LMap map = lMap[i];
+
+ ASSERT( map.sourceIndex || map.sortedIndex );
+ ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
+ }
+ }
+
+
+ }
+
+ bbvirtfreebounded( lMap );
+
+ Log::Line( "[DEBUG] OK" );
+}
+
+//-----------------------------------------------------------
+static void _DbgValidateLMapData( CudaK32PlotContext& cx );
+void DbgValidateLMapData( CudaK32PlotContext& cx )
+{
+ // New stack (prevent overflow)
+ auto* thread = new Thread();
+ thread->Run( []( void* p ) {
+ _DbgValidateLMapData( *(CudaK32PlotContext*)p );
+ }, &cx );
+
+ thread->WaitForExit();
+ delete thread;
+}
+
+void _DbgValidateLMapData( CudaK32PlotContext& cx )
+{
+ Log::Line( "[DEBUG] Validating LMap uniqueness..." );
+
+ ThreadPool& pool = DbgGetThreadPool( cx );
+
+ auto& p3 = *cx.phase3;
+ auto& s3 = p3.step3;
+
+ uint32* srcIndices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* dstIndices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+ uint32* tmpIndices = bbcvirtallocbounded<uint32>( BBCU_TABLE_ENTRY_COUNT );
+
+ uint64 entryCount = 0;
+ uint32 twoCount = 0;
+ {
+ uint32* srcWriter = srcIndices;
+ uint32* dstWriter = dstIndices;
+
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
+
+ for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+ {
+ // Read counts vertically, but read data horizontally
+ const uint32 copyCount = s3.prunedBucketSlices[slice][bucket];
+
+ for( uint32 i = 0; i < copyCount; i++ )
+ {
+ if( reader[i].sourceIndex == 2 )
+ twoCount++;
+ if( reader[i].sourceIndex == 0 && reader[i].sortedIndex == 0 )
+ {
+ ASSERT( 0 );
+ }
+
+ srcWriter[i] = reader[i].sourceIndex;
+ dstWriter[i] = reader[i].sortedIndex;
+ }
+
+ srcWriter += copyCount;
+ dstWriter += copyCount;
+ entryCount += copyCount;
+ reader += P3_PRUNED_SLICE_MAX;
+ }
+ }
+
+ ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
+ }
+
+ RadixSort256::Sort( pool, srcIndices, tmpIndices, entryCount );
+ RadixSort256::Sort( pool, dstIndices, tmpIndices, entryCount );
+
+ // Indices must not repeat:
+ for( uint64 i = 1; i < entryCount; i++ )
+ {
+ ASSERT( srcIndices[i] > srcIndices[i-1] );
+ }
+
+ Log::Line( "Maximum source index: %u", srcIndices[entryCount-1] );
+
+ for( uint64 i = 0; i < entryCount; i++ )
+ {
+ ASSERT( dstIndices[i] == i );
+ }
+
+ bbvirtfreebounded( srcIndices );
+ bbvirtfreebounded( dstIndices );
+ bbvirtfreebounded( tmpIndices );
+
+ Log::Line( "[DEBUG] OK" );
+}
+
+#endif
+
diff --git a/cuda/CudaPlotUtil.cu b/cuda/CudaPlotUtil.cu
new file mode 100644
index 00000000..4f7f18b3
--- /dev/null
+++ b/cuda/CudaPlotUtil.cu
@@ -0,0 +1,124 @@
+#include "CudaPlotContext.h"
+
+//-----------------------------------------------------------
+__global__ void GenSortKey( const uint32 entryCount, uint32* key )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ key[gid] = gid;
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_t stream, bool synchronize )
+{
+ const uint32 threadsPerBlock = 128;
+ const uint32 blockCount = CDiv( entryCount, threadsPerBlock );
+
+ if( stream == nullptr )
+ stream = CU_STREAM_LEGACY;
+
+ GenSortKey<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, devKey );
+ if( synchronize )
+ CudaErrCheck( cudaStreamSynchronize( stream ) );
+
+}
+
+//-----------------------------------------------------------
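+// Gather-style sort: output[i] = input[key[i]], where key is the permutation produced by the
+// radix sort of (y, key) pairs.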
+template<typename T>
+__global__ void SortByKey( const uint32 entryCount, const uint32* key, const T* input, T* output )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ output[gid] = input[key[gid]];
+}
+
+//-----------------------------------------------------------
+template<typename T>
+void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const T* devInput, T* devOutput, cudaStream_t stream, bool synchronize )
+{
+ const uint32 threadsPerBlock = 128;
+ const uint32 blockCount = CDiv( entryCount, threadsPerBlock );
+
+ if( stream == nullptr )
+ stream = CU_STREAM_LEGACY;
+
+ SortByKey<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, devKey, devInput, devOutput );
+ if( synchronize )
+ CudaErrCheck( cudaStreamSynchronize( stream ) );
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotSortMeta( const uint32 entryCount, const uint32* devKey, const uint32* devMetaIn, uint32* devMetaOutput, cudaStream_t stream )
+{
+
+}
+
+
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const uint16* devInput, uint16* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const uint32* devInput, uint32* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const uint64* devInput, uint64* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const K32Meta3* devInput, K32Meta3* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const K32Meta4* devInput, K32Meta4* devOutput, cudaStream_t stream, bool synchronize );
+template void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const Pair* devInput, Pair* devOutput, cudaStream_t stream, bool synchronize );
+
+
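+// Substitute the x values referenced by each pair's left/right indices directly into the pair,
+// so the first table's x buffer never has to be stored separately.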
+__global__ void K32InlineXsIntoPairsKernel( const uint32 entryCount, Pair* outPairs, const Pair* inPairs, const uint32* xs )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ const Pair pair = inPairs[gid];
+
+ Pair inlined;
+ inlined.left = xs[pair.left ];
+ inlined.right = xs[pair.right];
+ CUDA_ASSERT( inlined.left || inlined.right );
+
+ outPairs[gid] = inlined;
+}
+
+void CudaK32InlineXsIntoPairs(
+ const uint32 entryCount,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ const uint32* devXs,
+ cudaStream_t stream )
+{
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDivT( entryCount, kthreads );
+
+ K32InlineXsIntoPairsKernel<<<kblocks, kthreads, 0, stream>>>(
+ entryCount, devOutPairs, devInPairs, devXs );
+}
+
+
+__global__ void K3ApplyPairOffsetKernel( const uint32 entryCount, const uint32 offset, Pair* outPairs, const Pair* inPairs )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+ if( gid >= entryCount )
+ return;
+
+ Pair pair = inPairs[gid];
+ pair.left += offset;
+ pair.right += offset;
+
+ outPairs[gid] = pair;
+}
+void CudaK32ApplyPairOffset(
+ const uint32 entryCount,
+ const uint32 offset,
+ Pair* devOutPairs,
+ const Pair* devInPairs,
+ cudaStream_t stream )
+{
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDivT( entryCount, kthreads );
+
+ K3ApplyPairOffsetKernel<<<kblocks, kthreads, 0, stream>>>(
+ entryCount, offset, devOutPairs, devInPairs );
+}
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
new file mode 100644
index 00000000..8e0458dd
--- /dev/null
+++ b/cuda/CudaPlotter.cu
@@ -0,0 +1,1570 @@
+#include "CudaPlotter.h"
+#include "CudaPlotContext.h"
+#include "pos/chacha8.h"
+#include "b3/blake3.h"
+#include "threading/MTJob.h"
+#include "util/jobs/MemJobs.h"
+#include "util/StackAllocator.h"
+#include "CudaParkSerializer.h"
+#include "plotting/CTables.h"
+#include "plotting/TableWriter.h"
+#include "plotting/PlotTools.h"
+
+// TEST/DEBUG
+#if _DEBUG
+ #include "algorithm/RadixSort.h"
+ #include "plotdisk/jobs/IOJob.h"
+ #include "io/FileStream.h"
+
+ ThreadPool* _dbgThreadPool = nullptr;
+
+ static void DbgPruneTableBuckets( CudaK32PlotContext& cx, const TableId rTable );
+ static void DbgPruneTable( CudaK32PlotContext& cx, const TableId rTable );
+#endif
+
+static void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext );
+static void CudaInit( CudaK32PlotContext& cx );
+
+void GenF1Cuda( CudaK32PlotContext& cx );
+
+static void MakePlot( CudaK32PlotContext& cx );
+static void FpTable( CudaK32PlotContext& cx );
+static void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket );
+static void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket );
+static void FinalizeTable7( CudaK32PlotContext& cx );
+static void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t stream );
+
+static void AllocBuffers( CudaK32PlotContext& cx );
+static void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx );
+
+template<typename T>
+static void UploadBucketToGpu( CudaK32PlotContext& context, TableId table, const uint32* hostPtr, T* devPtr, uint64 bucket, uint64 stride );
+static void LoadAndSortBucket( CudaK32PlotContext& cx, const uint32 bucket );
+
+void CudaMatchBucketizedK32( CudaK32PlotContext& cx, const uint32* devY, cudaStream_t stream, cudaEvent_t event );
+
+// Defined in FxCuda.cu
+void GenFx( CudaK32PlotContext& cx, const uint32* devYIn, const uint32* devMetaIn, cudaStream_t stream );
+
+static const char* USAGE = "bladebit_cuda ... cudaplot <out_dir>\n"
+R"(
+GPU-based (CUDA) plotter
+
+[OPTIONS]:
+ -h, --help : Shows this help message and exits.
+ -d, --device : Select the CUDA device index. (default=0)
+)";
+
+///
+/// CLI
+///
+//-----------------------------------------------------------
+void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
+{
+ CudaK32PlotConfig& cfg = _cfg;
+ cfg.gCfg = &gCfg;
+
+ while( cli.HasArgs() )
+ {
+ if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
+ continue;
+ if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
+ continue;
+ if( cli.ArgMatch( "--help", "-h" ) )
+ {
+ Log::Line( USAGE );
+ exit( 0 );
+ }
+ else
+ break; // Let the caller handle it
+ }
+
+ // The rest should be output directories, parsed by the global config parser.
+}
+
+//-----------------------------------------------------------
+void CudaK32Plotter::Init()
+{
+ if( _cx )
+ return;
+
+ InitContext( _cfg, _cx );
+}
+
+//-----------------------------------------------------------
+void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
+{
+ auto& cx = *new CudaK32PlotContext{};
+ outContext = &cx;
+
+ cx.cfg = cfg;
+ cx.gCfg = cfg.gCfg;
+
+ Log::Line( "[Bladebit CUDA Plotter]" );
+ CudaInit( cx );
+
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) );
+ CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) );
+
+ cudaEventCreateWithFlags( &cx.computeEventA, cudaEventDisableTiming );
+ cudaEventCreateWithFlags( &cx.computeEventB, cudaEventDisableTiming );
+ cudaEventCreateWithFlags( &cx.computeEventC, cudaEventDisableTiming );
+
+ for( int32 i = 0; i < BBCU_GPU_STREAM_COUNT; i++ )
+ {
+ cx.gpuDownloadStream[i] = new GpuQueue( GpuQueue::Downloader );
+ cx.gpuUploadStream [i] = new GpuQueue( GpuQueue::Uploader );
+ }
+
+ cx.threadPool = new ThreadPool( SysHost::GetLogicalCPUCount() );
+
+ #if __linux__
+ cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+ #else
+ // #TODO: On Windows, check if we have enough memory; if so, default to true.
+ cx.downloadDirect = true ;//false;
+ #endif
+
+ // cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
+ // if( cx.gCfg->benchmarkMode )
+ // cx.plotWriter->EnableDummyMode();
+
+ cx.plotFence = new Fence();
+
+ cx.phase2 = new CudaK32Phase2{};
+ cx.phase3 = new CudaK32Phase3{};
+
+ // #TODO: Support non-warm starting
+ Log::Line( "Allocating buffers (this may take a few seconds)..." );
+ AllocBuffers( cx );
+ InitFSEBitMask( cx );
+}
+
+//-----------------------------------------------------------
+void CudaInit( CudaK32PlotContext& cx )
+{
+ ASSERT( cx.cudaDevice == -1 );
+
+ // CUDA init
+ int deviceCount = 0;
+ CudaFatalCheckMsg( cudaGetDeviceCount( &deviceCount ), "Failed to fetch CUDA devices." );
+ FatalIf( deviceCount < 1, "No CUDA-capable devices found." );
+ FatalIf( cx.cfg.deviceIndex >= deviceCount, "CUDA device %u is out of range out of %d CUDA devices",
+ cx.cfg.deviceIndex, deviceCount );
+
+ CudaFatalCheckMsg( cudaSetDevice( (int)cx.cfg.deviceIndex ), "Failed to set cuda device at index %u", cx.cfg.deviceIndex );
+ cx.cudaDevice = (int32)cx.cfg.deviceIndex;
+
+ cudaDeviceProp* cudaDevProps = new cudaDeviceProp{};
+ CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) );
+ cx.cudaDevProps = cudaDevProps;
+
+ Log::Line( "Selected cuda device %u : %s", cx.cudaDevice, cudaDevProps->name );
+
+ // Get info & limits
+ size_t stack = 0, memFree = 0, memTotal = 0;
+ cudaMemGetInfo( &memFree, &memTotal );
+ cudaDeviceGetLimit( &stack, cudaLimitStackSize );
+
+ Log::Line( " CUDA Compute Capability : %u.%u", cudaDevProps->major, cudaDevProps->minor );
+ Log::Line( " SM count : %d", cudaDevProps->multiProcessorCount );
+ Log::Line( " Max blocks per SM : %d", cudaDevProps->maxBlocksPerMultiProcessor );
+ Log::Line( " Max threads per SM : %d", cudaDevProps->maxThreadsPerMultiProcessor );
+ Log::Line( " Async Engine Count : %d", cudaDevProps->asyncEngineCount );
+ Log::Line( " L2 cache size : %.2lf MB", (double)cudaDevProps->l2CacheSize BtoMB );
+ Log::Line( " L2 persist cache max size : %.2lf MB", (double)cudaDevProps->persistingL2CacheMaxSize BtoMB );
+ Log::Line( " Stack Size : %.2lf KB", (double)stack BtoKB );
+ Log::Line( " Memory:" );
+ Log::Line( " Total : %.2lf GB", (double)memTotal BtoGB );
+ Log::Line( " Free : %.2lf GB", (double)memFree BtoGB );
+ Log::Line( "" );
+
+ // Ensure we have the correct capabilities
+ //int supportsCoopLaunch = 0;
+ //cudaDeviceGetAttribute( &supportsCoopLaunch, cudaDevAttrCooperativeLaunch, cx.cudaDevice );
+ //FatalIf( supportsCoopLaunch != 1, "This CUDA device does not support cooperative kernel launches." );
+}
+
+
+///
+/// Plotting entry point
+///
+//-----------------------------------------------------------
+void CudaK32Plotter::Run( const PlotRequest& req )
+{
+ SysHost::InstallCrashHandler();
+
+ // Initialize if needed
+ if( _cx == nullptr )
+ Init();
+
+ auto& cx = *_cx;
+ const auto& cfg = _cfg;
+
+ // Only start profiling from here (don't profile allocations)
+ CudaErrCheck( cudaProfilerStart() );
+
+ ASSERT( cx.plotWriter == nullptr );
+ cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
+ if( cx.gCfg->benchmarkMode )
+ cx.plotWriter->EnableDummyMode();
+
+ FatalIf( !cx.plotWriter->BeginPlot( cfg.gCfg->compressionLevel > 0 ? PlotVersion::v2_0 : PlotVersion::v1_0,
+ req.outDir, req.plotFileName, req.plotId, req.memo, req.memoSize, cfg.gCfg->compressionLevel ),
+ "Failed to open plot file with error: %d", cx.plotWriter->GetError() );
+
+ cx.plotRequest = req;
+ MakePlot( cx );
+
+ cx.plotWriter->EndPlot( true );
+
+ // #TODO: Ensure the last plot ended here for now
+ {
+ const auto plotCompleteTimer = TimerBegin();
+ cx.plotWriter->WaitForPlotToComplete();
+ const double plotIOTime = TimerEnd( plotCompleteTimer );
+ Log::Line( "Completed writing plot in %.2lf seconds", plotIOTime );
+
+ cx.plotWriter->DumpTables();
+ }
+ Log::Line( "" );
+
+ delete cx.plotWriter;
+ cx.plotWriter = nullptr;
+}
+
+//-----------------------------------------------------------
+void MakePlot( CudaK32PlotContext& cx )
+{
+ memset( cx.bucketCounts , 0, sizeof( cx.bucketCounts ) );
+ memset( cx.bucketSlices , 0, sizeof( cx.bucketSlices ) );
+ memset( cx.tableEntryCounts, 0, sizeof( cx.tableEntryCounts ) );
+
+ cx.table = TableId::Table1;
+ const auto plotTimer = TimerBegin();
+ const auto p1Timer = plotTimer;
+
+ #if BBCU_DBG_SKIP_PHASE_1
+ DbgLoadContextAndPairs( cx );
+ #else
+ // F1
+ Log::Line( "Generating F1" );
+ const auto timer = TimerBegin();
+ GenF1Cuda( cx );
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Finished F1 in %.2lf seconds.", elapsed );
+
+ // Time for FP
+ for( TableId table = TableId::Table2; table <= TableId::Table7; table++ )
+ {
+ cx.table = table;
+ cx.bucket = 0;
+ FpTable( cx );
+ }
+ const auto p1Elapsed = TimerEnd( p1Timer );
+ Log::Line( "Completed Phase 1 in %.2lf seconds", p1Elapsed );
+ #endif
+
+ // Prune
+ #if !BBCU_DBG_SKIP_PHASE_2
+ const auto p2Timer = TimerBegin();
+ CudaK32PlotPhase2( cx );
+ const auto p2Elapsed = TimerEnd( p2Timer );
+ Log::Line( "Completed Phase 2 in %.2lf seconds", p2Elapsed );
+ #endif
+
+ // Compress & write plot tables
+ const auto p3Timer = TimerBegin();
+ CudaK32PlotPhase3( cx );
+ const auto p3Elapsed = TimerEnd( p3Timer );
+ Log::Line( "Completed Phase 3 in %.2lf seconds", p3Elapsed );
+
+ auto plotElapsed = TimerEnd( plotTimer );
+ Log::Line( "Completed Plot 1 in %.2lf seconds ( %.2lf minutes )", plotElapsed, plotElapsed / 60.0 );
+ Log::Line( "" );
+}
+
+//-----------------------------------------------------------
+void FpTable( CudaK32PlotContext& cx )
+{
+ memset( &cx.timings, 0, sizeof( cx.timings ) );
+ const TableId inTable = cx.table - 1;
+
+ cx.prevTablePairOffset = 0;
+
+ // Clear slice counts
+ CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+
+ // Load initial buckets
+ UploadBucketForTable( cx, 0 );
+
+ const auto timer = TimerBegin();
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ FpTableBucket( cx, bucket );
+ }
+
+ CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+
+ // Copy bucket slices to host
+ cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+ cudaMemcpyDeviceToHost, cx.gpuDownloadStream[0]->GetStream() );
+ CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) );
+
+ // #TODO: Don't do this copy and instead just use the hostBucketSlices one
+ const uint32 outIdx = CudaK32PlotGetOutputIndex( cx );
+ memcpy( &cx.bucketSlices[outIdx], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
+
+ // #TODO: Do this on the GPU and simply copy it over
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ for( uint32 j = 0; j < BBCU_BUCKET_COUNT; j++ )
+ cx.bucketCounts[(int)cx.table][i] += cx.bucketSlices[outIdx][j][i];
+
+ cx.tableEntryCounts[(int)cx.table] = 0;
+ for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
+ cx.tableEntryCounts[(int)cx.table] += cx.bucketCounts[(int)cx.table][i];
+
+ // Cap entry counts to 2^k
+ if( cx.tableEntryCounts[(int)cx.table] > BBCU_TABLE_ENTRY_COUNT )
+ {
+ const uint32 overflow = (uint32)( cx.tableEntryCounts[(int)cx.table] - BBCU_TABLE_ENTRY_COUNT );
+ cx.tableEntryCounts[(int)cx.table] = BBCU_TABLE_ENTRY_COUNT;
+ cx.bucketCounts[(int)cx.table][BBCU_BUCKET_COUNT-1] -= overflow;
+ }
+
+ cx.yOut.WaitForCompletion();
+ cx.yOut.Reset();
+
+ cx.xPairsOut.WaitForCompletion();
+ cx.xPairsOut.Reset();
+
+ cx.xPairsIn.Reset();
+
+ cx.pairsLOut.WaitForCompletion();
+ cx.pairsLOut.Reset();
+ cx.pairsROut.WaitForCompletion();
+ cx.pairsROut.Reset();
+
+ // #NOTE: Must do this to ensure the buffers are
+ // free for the next round, which uses the same underlying buffers
+ // but a different downloader object.
+ cx.sortedXPairsOut.WaitForCompletion();
+ cx.sortedXPairsOut.Reset();
+
+ cx.sortedPairsLOut.WaitForCompletion();//cx.sortedPairsLOut.WaitForCopyCompletion();
+ cx.sortedPairsLOut.Reset();
+ cx.sortedPairsROut.WaitForCompletion();//cx.sortedPairsROut.WaitForCopyCompletion();
+ cx.sortedPairsROut.Reset();
+
+
+ if( cx.table < TableId::Table7 )
+ {
+ cx.metaOut.WaitForCompletion(); cx.metaOut.Reset();
+ }
+
+ cx.yIn .Reset();
+ cx.pairsLIn.Reset();
+ cx.pairsRIn.Reset();
+ cx.metaIn .Reset();
+
+ const auto elapsed = TimerEnd( timer );
+ Log::Line( "Table %u completed in %.2lf seconds with %llu entries.",
+ (uint32)cx.table+1, elapsed, cx.tableEntryCounts[(int)cx.table] );
+
+ #if DBG_BBCU_P1_WRITE_PAIRS
+ // Write them sorted, so have to wait until table 3 completes
+ if( cx.table > TableId::Table2 )
+ DbgWritePairs( cx, cx.table - 1 );
+ #endif
+
+ if( cx.table == TableId::Table7 )
+ {
+ FinalizeTable7( cx );
+
+ #if DBG_BBCU_P1_WRITE_PAIRS
+ DbgWritePairs( cx, TableId::Table7 );
+ #endif
+
+ #if DBG_BBCU_P1_WRITE_CONTEXT
+ DbgWriteContext( cx );
+ #endif
+ }
+}
+
+//-----------------------------------------------------------
+void FpTableBucket( CudaK32PlotContext& cx, const uint32 bucket )
+{
+ cx.bucket = bucket;
+
+ // Load next bucket in the background
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ UploadBucketForTable( cx, bucket + 1 );
+
+ const TableId inTable = cx.table - 1;
+ const uint32 entryCount = cx.bucketCounts[(int)inTable][bucket];
+
+ // #NOTE: Ensure these match the ones in UploadBucketForTable()
+ cudaStream_t mainStream = cx.computeStream;
+ cudaStream_t metaStream = cx.computeStream;//B;
+ cudaStream_t pairsStream = cx.computeStream;//C;
+
+ uint32* sortKeyIn = (uint32*)cx.devMatches;
+ uint32* sortKeyOut = cx.devSortKey;
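+ // The sort key starts out as the identity permutation; after sorting (y, key) together,
+ // the key records where each original entry went, so pairs and metadata can later be
+ // permuted to match the sorted y's.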
+ if( cx.table > TableId::Table2 )
+ {
+ // Generate a sorting key
+ CudaK32PlotGenSortKey( entryCount, sortKeyIn, mainStream );
+ }
+
+ uint32* devYUnsorted = (uint32*)cx.yIn.GetUploadedDeviceBuffer( mainStream );
+ uint32* devMetaUnsorted = nullptr;
+
+ uint32* devYSorted = cx.devYWork;
+ uint32* devMetaSorted = cx.devMetaWork;
+
+ if( cx.table == TableId::Table2 )
+ {
+ devMetaUnsorted = (uint32*)cx.metaIn.GetUploadedDeviceBuffer( mainStream );
+ sortKeyIn = devMetaUnsorted;
+ sortKeyOut = devMetaSorted;
+ }
+
+ // Sort y w/ key
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs(
+ cx.devSortTmp, cx.devSortTmpAllocSize,
+ devYUnsorted, devYSorted,
+ sortKeyIn, sortKeyOut,
+ entryCount, 0, 32, mainStream ) );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventC, mainStream ) );
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, mainStream ) );
+
+ cx.yIn.ReleaseDeviceBuffer( mainStream );
+ if( cx.table == TableId::Table2 )
+ cx.metaIn.ReleaseDeviceBuffer( mainStream );
+
+ // Sort and download prev table's pairs
+ const bool isLTableInlineable = cx.table == TableId::Table2 || (uint32)cx.table <= cx.gCfg->numDroppedTables+1;
+
+ if( !isLTableInlineable )
+ {
+ CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventC ) ); // Ensure sort key is ready
+
+ const bool isLTableInlinedPairs = (uint32)cx.table == cx.gCfg->numDroppedTables + 2;
+
+ if( isLTableInlinedPairs )
+ {
+ // Table 2's pairs are inlined x's. Treat as Pairs
+ Pair* pairsIn = (Pair*)cx.xPairsIn.GetUploadedDeviceBuffer( pairsStream );
+ Pair* sortedPairs = (Pair*)cx.sortedXPairsOut.LockDeviceBuffer( pairsStream );
+
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsIn, sortedPairs, pairsStream );
+ cx.xPairsIn.ReleaseDeviceBuffer( pairsStream );
+
+ Pair* hostPairs = ((Pair*)cx.hostBackPointers[(int)cx.table-1].left) + cx.prevTablePairOffset;
+
+ // Write sorted pairs back to host
+ cx.sortedXPairsOut.DownloadT( hostPairs, entryCount, pairsStream, cx.downloadDirect );
+ }
+ else
+ {
+ uint32* hostPairsL, *hostPairsLFinal;
+ uint16* hostPairsR, *hostPairsRFinal;
+
+ // Wait for pairs to complete loading and sort on Y (or do this before match? Giving us time to write to disk while matching?)
+ uint32* pairsLIn = (uint32*)cx.pairsLIn .GetUploadedDeviceBuffer( pairsStream );
+ uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream );
+ cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
+ hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
+ hostPairsLFinal = cx.hostBackPointers[(int)cx.table-1].left + cx.prevTablePairOffset;
+
+ cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, cx.downloadDirect );
+ // cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream );
+
+ // if( !isOutputCompressed )
+ {
+ uint16* pairsRIn = (uint16*)cx.pairsRIn .GetUploadedDeviceBuffer( pairsStream );
+ uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream );
+ cx.pairsRIn.ReleaseDeviceBuffer( pairsStream );
+ hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
+ hostPairsRFinal = cx.hostBackPointers[(int)cx.table-1].right + cx.prevTablePairOffset;
+
+ cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, cx.downloadDirect );
+ // cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream );
+ }
+ }
+ }
+
+ // Match pairs
+ CudaMatchBucketizedK32( cx, devYSorted, mainStream, nullptr );
+
+ // Inline input x's or compressed x's
+ if( isLTableInlineable )
+ {
+ uint32* inlineInput = devMetaSorted;
+
+ if( cx.table > TableId::Table2 )
+ {
+ uint32* pairsLIn = (uint32*)cx.pairsLIn.GetUploadedDeviceBuffer( pairsStream );
+ inlineInput = cx.devXInlineInput;
+
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, inlineInput, pairsStream );
+ cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
+ }
+
+ // Inline x values into our new pairs (merge L table into R table)
+ InlineTable( cx, inlineInput, mainStream );
+ }
+
+ // Upload and sort metadata
+ if( cx.table > TableId::Table2 )
+ {
+ const uint32 metaMultiplier = GetTableMetaMultiplier( cx.table - 1 );
+
+ // Wait for meta to complete loading, and sort on Y
+ devMetaUnsorted = (uint32*)cx.metaIn.GetUploadedDeviceBuffer( metaStream );
+
+ // Ensure the sort key is ready
+ CudaErrCheck( cudaStreamWaitEvent( metaStream, cx.computeEventA ) );
+
+ switch( metaMultiplier )
+ {
+ case 2: CudaK32PlotSortByKey( entryCount, sortKeyOut, (K32Meta2*)devMetaUnsorted, (K32Meta2*)devMetaSorted, metaStream ); break;
+ case 3: CudaK32PlotSortByKey( entryCount, sortKeyOut, (K32Meta3*)devMetaUnsorted, (K32Meta3*)devMetaSorted, metaStream ); break;
+ case 4: CudaK32PlotSortByKey( entryCount, sortKeyOut, (K32Meta4*)devMetaUnsorted, (K32Meta4*)devMetaSorted, metaStream ); break;
+ default: ASSERT( 0 ); break;
+ }
+ cx.metaIn.ReleaseDeviceBuffer( metaStream );
+ CudaErrCheck( cudaEventRecord( cx.computeEventB, metaStream ) );
+ }
+
+ // Ensure metadata is sorted
+ CudaErrCheck( cudaStreamWaitEvent( mainStream, cx.computeEventB ) );
+
+ // Compute Fx
+ GenFx( cx, devYSorted, devMetaSorted, mainStream );
+
+ CudaK32PlotDownloadBucket( cx );
+
+ cx.prevTablePairOffset += entryCount;
+}
+
+//-----------------------------------------------------------
+void FinalizeTable7( CudaK32PlotContext& cx )
+{
+ Log::Line( "Finalizing Table 7" );
+
+ const auto timer = TimerBegin();
+
+ cx.table = TableId::Table7+1; // Set a false table
+ cx.prevTablePairOffset = 0;
+
+ // Upload initial bucket
+ UploadBucketForTable( cx, 0 );
+
+
+ // Prepare C1 & 2 tables
+ const uint32 c1Interval = kCheckpoint1Interval;
+ const uint32 c2Interval = kCheckpoint1Interval * kCheckpoint2Interval;
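+ // C1 stores every c1Interval-th f7 (the first f7 of each C3 park) and C2 stores every
+ // c2Interval-th f7, giving a two-level index over the C3 parks.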
+
+ const uint64 tableLength = cx.tableEntryCounts[(int)TableId::Table7];
+ const uint32 c1TotalEntries = (uint32)CDiv( tableLength, (int)c1Interval ) + 1; // +1 because chiapos adds an extra '0' entry at the end
+ const uint32 c2TotalEntries = (uint32)CDiv( tableLength, (int)c2Interval ) + 1; // +1 because we add a short-circuit entry to prevent C2 lookup overflows
+
+ const size_t c1TableSizeBytes = c1TotalEntries * sizeof( uint32 );
+ const size_t c2TableSizeBytes = c2TotalEntries * sizeof( uint32 );
+
+
+ // Prepare host allocations
+ constexpr size_t c3ParkSize = CalculateC3Size();
+
+ const uint64 totalParkSize = CDivT( tableLength, (uint64)kCheckpoint1Interval ) * c3ParkSize;
+
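+ // Table 7 produces no metadata, so the host meta buffer is free at this point and is
+ // reused here as scratch for the C tables and compressed parks.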
+ StackAllocator hostAlloc( cx.hostMeta, BBCU_TABLE_ALLOC_ENTRY_COUNT * sizeof( uint32 ) * 4 );
+ uint32* hostC1Buffer = hostAlloc.CAlloc<uint32>( c1TotalEntries );
+ uint32* hostC2Buffer = hostAlloc.CAlloc<uint32>( c2TotalEntries );
+ uint32* hostLastParkEntries = hostAlloc.CAlloc<uint32>( kCheckpoint1Interval );
+ byte* hostLastParkBuffer = (byte*)hostAlloc.CAlloc<uint32>( kCheckpoint1Interval );
+ byte* hostCompressedParks = hostAlloc.AllocT<byte>( totalParkSize );
+
+ byte* hostParkWriter = hostCompressedParks;
+ uint32* hostC1Writer = hostC1Buffer;
+
+ // Prepare device allocations
+ constexpr size_t devAllocatorSize = BBCU_BUCKET_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER * sizeof( uint32 );
+ StackAllocator devAlloc( cx.devMetaWork, devAllocatorSize );
+
+ constexpr uint32 maxParksPerBucket = CuCDiv( BBCU_BUCKET_ENTRY_COUNT, kCheckpoint1Interval ) + 1;
+ static_assert( maxParksPerBucket * c3ParkSize < devAllocatorSize );
+
+ uint32* devC1Buffer = devAlloc.CAlloc<uint32>( c1TotalEntries );
+ uint32* devC1Writer = devC1Buffer;
+
+ const size_t parkBufferSize = kCheckpoint1Interval * sizeof( uint32 );
+
+ GpuDownloadBuffer& parkDownloader = cx.metaOut;
+
+ cudaStream_t mainStream = cx.computeStream;
+ cudaStream_t metaStream = cx.computeStream;//B;
+ cudaStream_t pairsStream = cx.computeStream;//C;
+ cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream();
+
+ // Load CTable
+ FSE_CTable* devCTable = devAlloc.AllocT<FSE_CTable>( sizeof( CTable_C3 ), sizeof( uint64 ) );
+ CudaErrCheck( cudaMemcpyAsync( devCTable, CTable_C3, sizeof( CTable_C3 ), cudaMemcpyHostToDevice, cx.computeStream ) );
+
+
+ // Prepare plot tables
+ cx.plotWriter->ReserveTableSize( PlotTable::C1, c1TableSizeBytes );
+ cx.plotWriter->ReserveTableSize( PlotTable::C2, c2TableSizeBytes );
+ cx.plotWriter->BeginTable( PlotTable::C3 );
+
+ // Reserve space before the start of the f7 buffer so entries retained from the previous bucket can be prepended for the next park.
+ uint32 retainedC3EntryCount = 0;
+ uint32* devYSorted = cx.devYWork + kCheckpoint1Interval;
+
+
+ uint32* sortKeyIn = (uint32*)cx.devMatches;
+ uint32* sortKeyOut = cx.devSortKey;
+
+ // Compress parks
+ for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
+ {
+ cx.bucket = bucket;
+
+ // Upload next bucket
+ if( bucket + 1 < BBCU_BUCKET_COUNT )
+ UploadBucketForTable( cx, bucket+1 );
+
+ const uint32 entryCount = cx.bucketCounts[(int)TableId::Table7][bucket];
+ ASSERT( entryCount > kCheckpoint1Interval );
+
+
+ // Generate a sorting key
+ CudaK32PlotGenSortKey( entryCount, sortKeyIn, mainStream );
+
+ // Sort y w/ key
+ uint32* devYUnsorted = (uint32*)cx.yIn.GetUploadedDeviceBuffer( mainStream );
+
+ CudaErrCheck( cub::DeviceRadixSort::SortPairs(
+ cx.devSortTmp, cx.devSortTmpAllocSize,
+ devYUnsorted, devYSorted,
+ sortKeyIn, sortKeyOut,
+ entryCount, 0, 32, mainStream ) );
+
+ CudaErrCheck( cudaEventRecord( cx.computeEventA, mainStream ) );
+ cx.yIn.ReleaseDeviceBuffer( mainStream ); devYUnsorted = nullptr;
+
+ // Sort pairs
+ {
+ CudaErrCheck( cudaStreamWaitEvent( pairsStream, cx.computeEventA ) ); // Wait for the sort key to be ready
+
+ uint32* sortedPairsL = (uint32*)cx.sortedPairsLOut.LockDeviceBuffer( pairsStream );
+ uint32* pairsLIn = (uint32*)cx.pairsLIn.GetUploadedDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsLIn, sortedPairsL, pairsStream );
+ cx.pairsLIn.ReleaseDeviceBuffer( pairsStream );
+
+ uint16* sortedPairsR = (uint16*)cx.sortedPairsROut.LockDeviceBuffer( pairsStream );
+ uint16* pairsRIn = (uint16*)cx.pairsRIn.GetUploadedDeviceBuffer( pairsStream );
+ CudaK32PlotSortByKey( entryCount, sortKeyOut, pairsRIn, sortedPairsR, pairsStream );
+ cx.pairsRIn.ReleaseDeviceBuffer( pairsStream );
+
+
+ // Download sorted pairs back to host
+ // uint32* hostPairsL = cx.hostTableSortedL + cx.prevTablePairOffset;
+ // uint16* hostPairsR = cx.hostTableSortedR + cx.prevTablePairOffset;
+ uint32* hostPairsLFinal = cx.hostBackPointers[(int)TableId::Table7].left + cx.prevTablePairOffset;
+ uint16* hostPairsRFinal = cx.hostBackPointers[(int)TableId::Table7].right + cx.prevTablePairOffset;
+
+ // cx.sortedPairsLOut.DownloadAndCopyT( hostPairsL, hostPairsLFinal, entryCount, pairsStream );
+ // cx.sortedPairsROut.DownloadAndCopyT( hostPairsR, hostPairsRFinal, entryCount, pairsStream );
+ cx.sortedPairsLOut.DownloadT( hostPairsLFinal, entryCount, pairsStream, true );
+ cx.sortedPairsROut.DownloadT( hostPairsRFinal, entryCount, pairsStream, true );
+
+ cx.prevTablePairOffset += entryCount;
+ }
+
+
+ // If we previously had entries retained, adjust our buffer and counts accordingly
+ uint32* devF7Entries = devYSorted - retainedC3EntryCount;
+ uint32 f7EntryCount = entryCount + retainedC3EntryCount;
+
+ const uint32 parkCount = f7EntryCount / kCheckpoint1Interval;
+
+ // Copy C1 entries
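+ // (A strided 2D copy with a 4-byte row width and a source pitch of c1Interval entries grabs just the first f7 of every park.)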
+ CudaErrCheck( cudaMemcpy2DAsync( devC1Writer, sizeof( uint32 ), devF7Entries, sizeof( uint32 ) * c1Interval,
+ sizeof( uint32 ), parkCount, cudaMemcpyDeviceToDevice, mainStream ) );
+ devC1Writer += parkCount;
+
+ // Compress C tables
+ // This action mutates the f7 buffer in-place, so ensure the C1 copies happen before this call
+ byte* devParkBuffer = (byte*)parkDownloader.LockDeviceBuffer( mainStream );
+ CompressC3ParksInGPU( parkCount, devF7Entries, devParkBuffer, c3ParkSize, devCTable, mainStream );
+
+ // Retain any new f7 entries for the next bucket, if needed
+ retainedC3EntryCount = f7EntryCount - (parkCount * kCheckpoint1Interval);
+ if( retainedC3EntryCount > 0 )
+ {
+ // Last bucket?
+ const bool isLastBucket = bucket + 1 == BBCU_BUCKET_COUNT;
+
+ const uint32 compressedEntryCount = parkCount * kCheckpoint1Interval;
+ const uint32* copySource = devF7Entries + compressedEntryCount;
+ const size_t copySize = sizeof( uint32 ) * retainedC3EntryCount;
+
+ if( !isLastBucket )
+ {
+ // Not the last bucket, so retain entries for the next GPU compression bucket
+ CudaErrCheck( cudaMemcpyAsync( devYSorted - retainedC3EntryCount, copySource, copySize,
+ cudaMemcpyDeviceToDevice, mainStream ) );
+ }
+ else
+ {
+ // No more buckets so we have to compress this last park on the CPU
+ CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize,
+ cudaMemcpyDeviceToHost, downloadStream ) );
+ }
+ }
+
+ // Download compressed parks to host
+ const size_t parkDownloadSize = c3ParkSize * parkCount;
+ parkDownloader.DownloadWithCallback( hostParkWriter, parkDownloadSize,
+ []( void* parksBuffer, size_t size, void* userData ) {
+
+ auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
+ cx.plotWriter->WriteTableData( parksBuffer, size );
+ }, &cx, mainStream );
+ hostParkWriter += parkDownloadSize;
+ }
+
+ // Download c1 entries
+ const size_t devC1EntryCount = (size_t)(uintptr_t)(devC1Writer - devC1Buffer);
+ CudaErrCheck( cudaMemcpyAsync( hostC1Buffer, devC1Buffer, sizeof( uint32 ) * devC1EntryCount, cudaMemcpyDeviceToHost, downloadStream ) );
+ hostC1Writer += devC1EntryCount;
+
+ // Wait for parks to finish downloading
+ parkDownloader.WaitForCompletion();
+ parkDownloader.Reset();
+
+ // Was there a left-over park?
+ if( retainedC3EntryCount > 0 )
+ {
+ // Copy c1 entry
+ *hostC1Writer++ = hostLastParkEntries[0];
+ ASSERT( hostC1Writer - hostC1Buffer == c1TotalEntries - 1 );
+
+ // Serialize the trailing park and submit it to the plot
+ if( retainedC3EntryCount > 1 )
+ {
+ TableWriter::WriteC3Park( retainedC3EntryCount - 1, hostLastParkEntries, hostLastParkBuffer );
+ cx.plotWriter->WriteTableData( hostLastParkBuffer, c3ParkSize );
+ }
+ }
+
+ // Write final empty C entries
+ hostC1Buffer[c1TotalEntries-1] = 0;
+ hostC2Buffer[c2TotalEntries-1] = 0;
+
+ // Byte-swap C1
+ for( uint32 i = 0; i < c1TotalEntries-1; i++ )
+ hostC1Buffer[i] = Swap32( hostC1Buffer[i] );
+
+ // Calculate C2 entries
+ for( uint32 i = 0; i < c2TotalEntries-1; i++ )
+ {
+ ASSERT( i * kCheckpoint2Interval < c1TotalEntries - 1 );
+ hostC2Buffer[i] = hostC1Buffer[i * kCheckpoint2Interval];
+ }
+
+ // End C3 table & write C1 & C2 tables
+ cx.plotWriter->EndTable();
+ cx.plotWriter->WriteReservedTable( PlotTable::C1, hostC1Buffer );
+ cx.plotWriter->WriteReservedTable( PlotTable::C2, hostC2Buffer );
+ cx.plotWriter->SignalFence( *cx.plotFence ); // Signal the fence for the start of Phase 3 when we have to use our tmp2 host buffer again
+
+
+ // Cleanup
+ // cx.sortedPairsLOut.WaitForCopyCompletion();
+ // cx.sortedPairsROut.WaitForCopyCompletion();
+ cx.sortedPairsLOut.WaitForCompletion();
+ cx.sortedPairsROut.WaitForCompletion();
+ cx.sortedPairsLOut.Reset();
+ cx.sortedPairsROut.Reset();
+
+ cx.prevTablePairOffset = 0;
+
+ auto elapsed = TimerEnd( timer );
+ Log::Line( "Finalized Table 7 in %.2lf seconds.", elapsed );
+}
+
+//-----------------------------------------------------------
+__global__ void CudaInlineTable( const uint32* entryCount, const uint32* inX, const Pair* matches, Pair* inlinedPairs, uint32 entryBits = 0 )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if( gid >= *entryCount )
+ return;
+
+ const Pair pair = matches[gid];
+
+ Pair inlined;
+ inlined.left = inX[pair.left ];
+ inlined.right = inX[pair.right];
+
+ CUDA_ASSERT( inlined.left || inlined.right );
+
+ inlinedPairs[gid] = inlined;
+}
+
+//-----------------------------------------------------------
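+// For compressed plots the dropped table's x values are bit-reduced and either packed
+// side-by-side into a single 32-bit word, or converted to a (truncated) line point,
+// depending on the UseLP template parameter.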
+template<bool UseLP>
+__global__ void CudaCompressTable( const uint32* entryCount, const uint32* inLEntries, const Pair* matches, uint32* outREntries, const uint32 bitShift )
+{
+ const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if( gid >= *entryCount )
+ return;
+
+ const Pair pair = matches[gid];
+
+ const uint32 x0 = inLEntries[pair.left ];
+ const uint32 x1 = inLEntries[pair.right];
+
+ // Convert to linepoint
+ if constexpr ( UseLP )
+ outREntries[gid] = (uint32)CudaSquareToLinePoint64( x1 >> bitShift, x0 >> bitShift );
+ else
+ outREntries[gid] = ((x1 >> bitShift) << (32-bitShift) ) | (x0 >> bitShift);
+}
+
+//-----------------------------------------------------------
+void InlineTable( CudaK32PlotContext& cx, const uint32* devInX, cudaStream_t stream )
+{
+ static_assert( alignof( Pair ) == sizeof( uint32 ) );
+
+ const bool isCompressedInput = cx.gCfg->compressionLevel > 0 && (uint32)cx.table <= cx.gCfg->numDroppedTables;
+
+ const uint32 kthreads = 256;
+ const uint32 kblocks = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, (int)kthreads );
+
+ if( isCompressedInput )
+ {
+ const bool isFinalTable = cx.table == TableId::Table1 + (TableId)cx.gCfg->numDroppedTables;
+ const uint32 bitShift = ( isFinalTable && cx.gCfg->numDroppedTables > 1 ) ? 0 : BBCU_K - cx.gCfg->compressedEntryBits;
+
+ if( isFinalTable )
+ CudaCompressTable<true><<<kblocks, kthreads, 0, stream>>>( cx.devMatchCount, devInX, cx.devMatches, cx.devCompressedXs, bitShift );
+ else
+ CudaCompressTable<false><<<kblocks, kthreads, 0, stream>>>( cx.devMatchCount, devInX, cx.devMatches, cx.devCompressedXs, bitShift );
+ }
+ else
+ {
+ CudaInlineTable<<<kblocks, kthreads, 0, stream>>>( cx.devMatchCount, devInX, cx.devMatches, cx.devInlinedXs );
+ }
+}
+
+//-----------------------------------------------------------
+void CudaK32PlotDownloadBucket( CudaK32PlotContext& cx )
+{
+ const bool writeVertical = CudaK32PlotIsOutputInterleaved( cx );
+ const size_t metaMultiplier = GetTableMetaMultiplier( cx.table );
+
+ const bool downloadCompressed = cx.table > TableId::Table1 && (uint32)cx.table <= cx.gCfg->numDroppedTables;
+ const bool downloadInlinedPairs = !downloadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+1;
+
+ uint32* hostY = cx.hostY;
+ uint32* hostMeta = cx.hostMeta;
+
+ uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
+ uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+ Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left;
+
+ const size_t startOffset = cx.bucket * ( writeVertical ? BBCU_MAX_SLICE_ENTRY_COUNT : BBCU_BUCKET_ALLOC_ENTRY_COUNT ); // vertical: offset to starting col. horizontal: to starting row
+ const size_t width = BBCU_MAX_SLICE_ENTRY_COUNT;
+ const size_t height = BBCU_BUCKET_COUNT;
+ const size_t dstStride = writeVertical ? BBCU_BUCKET_ALLOC_ENTRY_COUNT : BBCU_MAX_SLICE_ENTRY_COUNT;
+ const size_t srcStride = BBCU_MAX_SLICE_ENTRY_COUNT;
+
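+ // When writing vertically, each of this bucket's slices goes to a different destination
+ // bucket's row at column offset bucket * BBCU_MAX_SLICE_ENTRY_COUNT; when writing
+ // horizontally, the slices are packed contiguously into this bucket's own row.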
+ cx.yOut.Download2DT( hostY + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+
+ // Metadata
+ if( metaMultiplier > 0 )
+ {
+ const size_t metaSizeMultiplier = metaMultiplier == 3 ? 4 : metaMultiplier;
+ const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
+
+ const size_t metaSrcStride = srcStride * metaSize;
+ const size_t metaDstStride = dstStride * sizeof( K32Meta4 );
+ const size_t metaWidth = width * metaSize;
+ uint32* meta = hostMeta + startOffset * 4;
+
+ cx.metaOut.Download2D( meta, metaWidth, height, metaDstStride, metaSrcStride, cx.computeStream );
+ }
+
+ if( cx.table > TableId::Table1 )
+ {
+ if( downloadInlinedPairs )
+ {
+ cx.xPairsOut.Download2DT( t2HostPairs + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+ }
+ else
+ {
+ cx.pairsLOut.Download2DT( hostPairsL + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+
+ if( !downloadCompressed )
+ cx.pairsROut.Download2DT( hostPairsR + startOffset, width, height, dstStride, srcStride, cx.computeStream );
+ }
+ }
+}
+
+//-----------------------------------------------------------
+void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
+{
+ const TableId rTable = cx.table;
+ const TableId inTable = rTable - 1;
+
+ uint32 metaMultiplier = GetTableMetaMultiplier( inTable );
+
+ const uint32 inIdx = CudaK32PlotGetInputIndex( cx );
+ const bool readVertical = CudaK32PlotIsOutputInterleaved( cx );
+
+ const uint32* hostY = cx.hostY;
+ const uint32* hostMeta = cx.hostMeta;
+ const uint32* hostPairsL = cx.hostTableL; //cx.hostBackPointers[6].left;
+ const uint16* hostPairsR = cx.hostTableR; //cx.hostBackPointers[6].right;
+
+ const bool uploadCompressed = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
+ const bool uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2;
+ const Pair* t2HostPairs = (Pair*)cx.hostBackPointers[4].left; // Table 2 will use table 5, and overflow onto 6
+
+ uint32 stride = BBCU_BUCKET_ALLOC_ENTRY_COUNT; // Start as vertical
+ size_t offset = (size_t)bucket * BBCU_MAX_SLICE_ENTRY_COUNT;
+
+ if( !readVertical )
+ {
+ // Adjust to starting row
+ stride = BBCU_MAX_SLICE_ENTRY_COUNT;
+ offset = (size_t)bucket * BBCU_BUCKET_ALLOC_ENTRY_COUNT;
+ }
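+ // The upload gathers this bucket's BBCU_BUCKET_COUNT slices (one per source bucket) using
+ // the per-slice counts below; stride and offset select column-major or row-major access
+ // depending on how the previous table was written.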
+
+ cudaStream_t mainStream = cx.computeStream;
+ cudaStream_t metaStream = cx.computeStream;//B;
+ cudaStream_t pairsStream = cx.computeStream;//C;
+
+ const uint32* counts = &cx.bucketSlices[inIdx][0][bucket];
+
+ cx.yIn.UploadArrayT( hostY + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, cx.computeStream );
+
+ // Upload pairs, also
+ if( cx.table > TableId::Table2 )
+ {
+ if( uploadInlinedPairs )
+ {
+ cx.xPairsIn.UploadArrayT( t2HostPairs + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
+ }
+ else
+ {
+ cx.pairsLIn.UploadArrayT( hostPairsL + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
+
+ if( !uploadCompressed )
+ cx.pairsRIn.UploadArrayT( hostPairsR + offset, BBCU_BUCKET_COUNT, stride, BBCU_BUCKET_COUNT, counts, pairsStream );
+ }
+ }
+
+ // Meta
+ if( metaMultiplier > 0 )
+ {
+ const size_t metaSizeMultiplier = metaMultiplier == 3 ? 4 : metaMultiplier;
+ const size_t metaSize = sizeof( uint32 ) * metaSizeMultiplier;
+
+ auto actualMetaStream = inTable == TableId::Table1 ? cx.computeStream : metaStream;
+ cx.metaIn.UploadArray( hostMeta + offset * 4, BBCU_BUCKET_COUNT, metaSize, stride * sizeof( K32Meta4 ), BBCU_BUCKET_COUNT, counts, actualMetaStream );
+ }
+}
+
+
+///
+/// Allocations
+///
+//-----------------------------------------------------------
+void AllocBuffers( CudaK32PlotContext& cx )
+{
+ // Determine initially the largest required size
+
+ const size_t alignment = bbclamp<size_t>( SysHost::GetPageSize(), sizeof( K32Meta4 ), 4096 );
+ cx.allocAlignment = alignment;
+ cx.pinnedAllocSize = 0;
+ cx.hostTableAllocSize = 0;
+ cx.hostTempAllocSize = 0;
+ cx.devAllocSize = 0;
+
+ // Gather the size needed first
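+ // Dry-run each phase's allocations with dummy allocators to find the peak size per memory
+ // kind, then allocate once below and replay the allocations with real stack allocators.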
+ {
+ CudaK32AllocContext acx = {};
+
+ acx.alignment = alignment;
+ acx.dryRun = true;
+
+ DummyAllocator pinnedAllocator;
+ DummyAllocator hostTableAllocator;
+ DummyAllocator hostTempAllocator;
+ DummyAllocator devAllocator;
+
+ acx.pinnedAllocator = &pinnedAllocator;
+ acx.hostTableAllocator = &hostTableAllocator;
+ acx.hostTempAllocator = &hostTempAllocator;
+ acx.devAllocator = &devAllocator;
+
+ AllocateP1Buffers( cx, acx );
+
+ cx.pinnedAllocSize = pinnedAllocator .Size();
+ cx.hostTableAllocSize = hostTableAllocator.Size();
+ cx.hostTempAllocSize = hostTempAllocator .Size();
+ cx.devAllocSize = devAllocator .Size();
+
+ /// Phase 2
+ pinnedAllocator = {};
+ hostTableAllocator = {};
+ hostTempAllocator = {};
+ devAllocator = {};
+
+ CudaK32PlotPhase2AllocateBuffers( cx, acx );
+
+ cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
+ cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
+ cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
+ cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() );
+
+ /// Phase 3
+ pinnedAllocator = {};
+ hostTableAllocator = {};
+ hostTempAllocator = {};
+ devAllocator = {};
+
+ CudaK32PlotPhase3AllocateBuffers( cx, acx );
+
+ cx.pinnedAllocSize = std::max( cx.pinnedAllocSize , pinnedAllocator .Size() );
+ cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
+ cx.hostTempAllocSize = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
+ cx.devAllocSize = std::max( cx.devAllocSize , devAllocator .Size() );
+ }
+
+ size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize;
+ size_t totalHostSize = cx.hostTableAllocSize + totalPinnedSize;
+ Log::Line( "Kernel RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
+ (double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
+
+ Log::Line( "Intermediate RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.pinnedAllocSize,
+ (double)cx.pinnedAllocSize BtoMB, (double)cx.pinnedAllocSize BtoGB );
+
+ Log::Line( "Host RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.hostTableAllocSize,
+ (double)cx.hostTableAllocSize BtoMB, (double)cx.hostTableAllocSize BtoGB );
+
+ Log::Line( "Total Host RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalHostSize,
+ (double)totalHostSize BtoMB, (double)totalHostSize BtoGB );
+
+ Log::Line( "GPU RAM required : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
+ (double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );
+
+ Log::Line( "Allocating buffers" );
+ // Now actually allocate the buffers
+ CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );
+
+ #if _DEBUG
+ cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
+ #else
+ #if !_WIN32
+ // if( cx.downloadDirect )
+ CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
+ // else
+ // {
+ // // #TODO: On windows, first check if we have enough shared memory (512G)?
+ // // and attempt to alloc that way first. Otherwise, use intermediate pinned buffers.
+ #else
+ cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
+ #endif
+ // }
+ #endif
+
+ //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
+
+ cx.hostBufferTemp = nullptr;
+#if _DEBUG
+ cx.hostBufferTemp = bbvirtallocboundednuma<byte>( cx.hostTempAllocSize );
+#endif
+ if( cx.hostBufferTemp == nullptr )
+ CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) );
+
+ CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) );
+
+ // Warm start
+ if( true )
+ {
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer, cx.pinnedAllocSize );
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize );
+ FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTemp, cx.hostTempAllocSize );
+ }
+
+ {
+ CudaK32AllocContext acx = {};
+
+ acx.alignment = alignment;
+ acx.dryRun = false;
+
+ StackAllocator pinnedAllocator ( cx.pinnedBuffer , cx.pinnedAllocSize );
+ StackAllocator hostTableAllocator( cx.hostBufferTables, cx.hostTableAllocSize );
+ StackAllocator hostTempAllocator ( cx.hostBufferTemp , cx.hostTempAllocSize );
+ StackAllocator devAllocator ( cx.deviceBuffer , cx.devAllocSize );
+
+ acx.pinnedAllocator = &pinnedAllocator;
+ acx.hostTableAllocator = &hostTableAllocator;
+ acx.hostTempAllocator = &hostTempAllocator;
+ acx.devAllocator = &devAllocator;
+ AllocateP1Buffers( cx, acx );
+
+ pinnedAllocator .PopToMarker( 0 );
+ hostTableAllocator.PopToMarker( 0 );
+ hostTempAllocator .PopToMarker( 0 );
+ devAllocator .PopToMarker( 0 );
+ CudaK32PlotPhase2AllocateBuffers( cx, acx );
+
+ pinnedAllocator .PopToMarker( 0 );
+ hostTableAllocator.PopToMarker( 0 );
+ hostTempAllocator .PopToMarker( 0 );
+ devAllocator .PopToMarker( 0 );
+ CudaK32PlotPhase3AllocateBuffers( cx, acx );
+ }
+}
+
+//-----------------------------------------------------------
+void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
+{
+ const size_t alignment = acx.alignment;
+
+ const bool isCompressed = cx.gCfg->compressionLevel > 0;
+
+ // #TODO: Re-optimize usage here again for windows running 256G
+ /// Host allocations
+ {
+ // Temp allocations are pinned host buffers that can be re-used for other purposes across phases.
+ // They are roughly equivalent to the temp2 dir during disk plotting.
+ cx.hostY = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostMeta = acx.hostTempAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
+
+ const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();
+
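+ // Marking tables are bitfields that flag which entries survive pruning. The first slot is
+ // never needed, and compressed plots drop one more table, so its slot is skipped as well.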
+ cx.hostMarkingTables[0] = nullptr;
+ cx.hostMarkingTables[1] = isCompressed ? nullptr : acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[2] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[3] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[4] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+ cx.hostMarkingTables[5] = acx.hostTableAllocator->AllocT( markingTableBitFieldSize, alignment );
+
+
+ // #NOTE: The first table has its values inlined into the backpointers of the next table.
+ cx.hostBackPointers[0] = {};
+
+ const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
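+ // Compressed plots drop their first table(s), so backpointer storage starts at a later
+ // table. The first stored table holds inlined x pairs rather than L/R backpointers, which
+ // is why it is allocated as a single Pair buffer below.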
+
+ Pair* firstTablePairs = acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+ cx.hostBackPointers[(int)firstTable] = { (uint32*)firstTablePairs, nullptr };
+
+ for( TableId table = firstTable + 1; table <= TableId::Table7; table++ )
+ cx.hostBackPointers[(int)table] = { acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ), acx.hostTableAllocator->CAlloc( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment ) };
+
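+ // Convenience aliases into the last two tables' backpointer buffers; the sorted variants
+ // re-use the previous table's storage rather than getting separate allocations.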
+ cx.hostTableL = cx.hostBackPointers[6].left; // Also used for Table 7
+ cx.hostTableR = cx.hostBackPointers[6].right;
+ cx.hostTableSortedL = cx.hostBackPointers[5].left;
+ cx.hostTableSortedR = cx.hostBackPointers[5].right;
+ }
+
+ /// Device & Pinned allocations
+ {
+ // #NOTE: The R pairs are allocated as uint32 because, for table 2, we want to download
+ //        them as inlined x's, so we need 2 separate uint32 buffers.
+ cx.yOut    = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ cx.metaOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
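+ // Direct download buffers take no pinned allocator: they appear to download straight into
+ // their final host destination instead of staging through an intermediate pinned buffer.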
+
+ // These download buffers share the same backing buffers
+ {
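+ // xPairsOut overlaps pairsLOut/pairsROut: save the allocator watermarks, allocate one set,
+ // pop back to the marks, then allocate the other set over the same region.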
+ const size_t devMarker = acx.devAllocator->Size();
+ const size_t pinnedMarker = acx.pinnedAllocator->Size();
+
+ cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+ cx.pairsROut = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
+
+ acx.devAllocator->PopToMarker( devMarker );
+ acx.pinnedAllocator->PopToMarker( pinnedMarker );
+
+ // Allocate the Pair buffer last, to ensure the allocator's final offset covers the larger of the overlapping allocations
+ cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ }
+
+ // These download buffers share the same backing buffers
+ {
+ const size_t devMarker = acx.devAllocator->Size();
+ const size_t pinnedMarker = acx.pinnedAllocator->Size();
+
+ cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+ cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT( BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
+
+ acx.devAllocator->PopToMarker( devMarker );
+ acx.pinnedAllocator->PopToMarker( pinnedMarker );
+
+ // Allocate the Pair buffer last, to ensure the allocator's final offset covers the larger of the overlapping allocations
+ cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT