diff --git a/CMakeLists.txt b/CMakeLists.txt index c260e98..6b7dd8f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,19 +1,79 @@ -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) + # Modifications Copyright (C) 2023 Intel Corporation + # + # This Program is subject to the terms of The Unlicense.​ + # If a copy of the license was not distributed with this file, ​ + # you can obtain one at https://spdx.org/licenses/Unlicense.html​ + #​ + # + # SPDX-License-Identifier: Unlicense + # -project(SimpleConcurrentGPUHashTable LANGUAGES CXX CUDA) +cmake_minimum_required(VERSION 3.10) -# put predefined cmake projects in their own solution folder -set_property(GLOBAL PROPERTY USE_FOLDERS ON) +project(hashtable_sycl LANGUAGES CXX) -string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=sm_35") +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) -set(SOURCE_FILES src/main.cpp src/test.cpp src/linearprobing.h src/linearprobing.cu) +option(GPU_AOT "Build AOT for Intel GPU" OFF) +option(USE_NVIDIA_BACKEND "Build for NVIDIA backend" OFF) +option(USE_AMDHIP_BACKEND "Build for AMD HIP backend" OFF) +option(USE_SM "Build for specific SM" OFF) + +set(INTEL_GPU_CXX_FLAGS " -O2 -fsycl -Wall -Wextra -Wno-unused-parameter ") +set(NVIDIA_GPU_CXX_FLAGS " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ") +set(AMD_GPU_CXX_FLAGS " -O3 -fsycl -Wall -Wextra -Wno-unused-parameter ") + +set(USE_DEFAULT_FLAGS ON) +if("${CMAKE_CXX_FLAGS}" STREQUAL "") + message(STATUS "Using DEFAULT compilation flags") +else() + message(STATUS "Overriding DEFAULT compilation flags") + set(USE_DEFAULT_FLAGS OFF) +endif() + +# AOT compilation +if(GPU_AOT) + message(STATUS "Enabling INTEL backend") + if(USE_DEFAULT_FLAGS) + set(CMAKE_CXX_FLAGS "${INTEL_GPU_CXX_FLAGS}") # Default flags for Intel backend + endif() + if( (${GPU_AOT} STREQUAL "pvc") OR (${GPU_AOT} STREQUAL "PVC") ) + message(STATUS "Enabling Intel GPU AOT compilation for ${GPU_AOT}") + string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=spir64_gen -Xs \"-device 0x0bd5 -revision_id 0x2f\" ") + else() + message(STATUS "Using custom AOT compilation flag ${GPU_AOT}") + string(APPEND CMAKE_CXX_FLAGS " ${GPU_AOT} ") + endif() +elseif(USE_NVIDIA_BACKEND) + message(STATUS "Enabling NVIDIA backend") + if(USE_DEFAULT_FLAGS) + set(CMAKE_CXX_FLAGS "${NVIDIA_GPU_CXX_FLAGS}") # Default flags for NV backend + endif() + if(USE_SM) + message("-- Building for SM_${USE_SM} compatibility") + string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_${USE_SM} ") + else() + message("-- Building for SM_80 compatibility (DEFAULT)") + string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 ") + endif() +elseif(USE_AMDHIP_BACKEND) + message(STATUS "Enabling AMD HIP backend for ${USE_AMDHIP_BACKEND} AMD architecture") + if(USE_DEFAULT_FLAGS) + set(CMAKE_CXX_FLAGS "${AMD_GPU_CXX_FLAGS}") # Default flags for AMD backend (gfx90a for MI250) + endif() + string(APPEND CMAKE_CXX_FLAGS " -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${USE_AMDHIP_BACKEND} ") +endif() + +set(SOURCES + ${CMAKE_SOURCE_DIR}/src/main.cpp + ${CMAKE_SOURCE_DIR}/src/test.cpp + ${CMAKE_SOURCE_DIR}/src/linearprobing.cpp +) include_directories(${CMAKE_SOURCE_DIR}/src) -add_executable(test ${SOURCE_FILES}) +add_executable(${PROJECT_NAME} ${SOURCES}) -# visual studio project should mimic directory structure -# this isn't working for me; I think because -# https://developercommunity.visualstudio.com/content/problem/777578/source-grouptree-no-longer-works.html -source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCE_FILES}) +target_link_libraries(${PROJECT_NAME} sycl OpenCL stdc++fs) diff --git a/LICENSE b/LICENSE deleted file mode 100644 index fdddb29..0000000 --- a/LICENSE +++ /dev/null @@ -1,24 +0,0 @@ -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. - -For more information, please refer to diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2977253 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,8 @@ +Modifications Copyright (C) 2023 Intel Corporation + +This Program is subject to the terms of The Unlicense.​ +If a copy of the license was not distributed with this file, ​ +you can obtain one at https://spdx.org/licenses/Unlicense.html​ + + +SPDX-License-Identifier: Unlicense diff --git a/README.md b/README.md index ce2a8c1..46028ed 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,64 @@ -![](screenshot.png) +# hashtable -# About +hashtable implements a simple hash table in GPU (original CUDA source code is from [here](https://github.com/nosferalatu/SimpleGPUHashTable)). -This project shows how to implement a simple GPU hash table. Thanks to the high bandwidth and massive parallelism of -GPU's, the result is a high performance hash table capable of hundreds of millions of operations per second. -The code achieves an average insertion rate of 326 million key/second on my development laptop with an NVIDIA GTX 1060, -measured by inserting 64 million elements. +## SYCL version -[Read my blog post about the code here](http://nosferalatu.com/SimpleGPUHashTable.html) for more information about the -implementation. +- The CUDA code was migrated using Intel DPCT, and then the resulting code was modified to remove the DPCT headers. +- Timing code was later added for performance measurement purpose. +- The same SYCL code runs on Intel GPUs & CPUs as well as NVIDIA (tested on A100 and H100) and AMD (tested on MI100 and MI250) GPUs. -The code implements a lock free hash table using linear probing. Concurrent inserts, deletes, and lookups are supported by -this hash table. The hash table works on 32 bit keys and 32 bit values (although 0xffffffff is reserved for both keys -and values). The load factor of the table is set to 50% in the code, and the table size must be a power of two. +# Build Instructions +- icpx compiler mentioned below is included in oneAPI Base Toolkit available [here](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html). +- clang++ compiler mentioned below is available [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md). +## To build for SYCL -Atomic operations are used to insert key/value pairs into the hash table on multiple GPU threads. It uses CUDA for ease -of development, but could easily be ported to HLSL or GLSL. 64 bit keys and/or values could be supported using 64 bit -atomics. - -Resizing the hash table is not implemented (it's a *simple* hash table!) although this can be achieved by inserting the -contents of a table into another, larger table. - -The code was kept simple for readability. There are many optimizations that can be done, but they muddy the waters. I -wanted to illustrate the basic design of the lock free hash table and how it can be implemented on a GPU. - -# How To Use - -If you build and run the executable, it enters an infinite loop of inserting and deleting random numbers into the -GPU hash table and verifying that the results are correct. The seed used to generate random numbers changes every time -you run the executable, but you can set the seed to a specific value in code if you'd like to reproduce results across -runs. - -This is how you insert a vector of `KeyValue` pairs into the hash table and then retrieve all the `KeyValue` pairs back: - -```cpp - std::vector things_to_insert = { {0,1}, {1,2}, {2,3}, {3,4} }; - - KeyValue* pHashTable = create_hashtable(); - insert_hashtable(pHashTable, things_to_insert.data(), (uint32_t)things_to_insert.size()); - std::vector result = iterate_hashtable(pHashTable); - destroy_hashtable(pHashTable); -``` - -After that runs, the vectors `things_to_insert` and `result` should be the same, but possibly in a different order. - -# Prerequisites - -* CMake -* CUDA - -This has been tested on Windows with Visual Studio Community 2019 on a machine with an NVIDIA GTX 1060. -An easy way to get CMake is to open a Visual Studio command prompt (in Windows, run "x64 Native Tools Command Prompt for -VS 2019"; that will put CMake in your path). - -This should work on other CUDA-supported platforms, but I have not tested this. - -# Cloning +For Intel GPU - +First, source icpx compiler. Then, ``` -git clone https://github.com/nosferalatu/SimpleConcurrentGPUHashTable.git SimpleConcurrentGPUHashTable +mkdir build +cd build +CXX=icpx cmake -DGPU_AOT=pvc .. +make -sj ``` +Note: +- To enable AOT compilation, please use the flag `-DGPU_AOT=pvc` for PVC. -# Generating Build Files - -Run the following commands to generate .sln and .vcxproj's that can be opened in Visual Studio: - +For AMD GPU - +First source clang++ compiler. Then, ``` -cd ConcurrentHashTables -md build +mkdir build cd build -cmake .. +CXX=clang++ cmake -DUSE_AMDHIP_BACKEND=gfx90a .. +make -sj ``` +Note: +- We use the flag `-DUSE_AMDHIP_BACKEND=gfx90a` for MI250. Use the correct value for your GPU. -You can now open `SimpleConcurrentGPUHashTable.sln` in Visual Studio. - -If CMake fails to find CUDA above, then run a CMake generator for 64 bit builds: +For NVIDIA GPU - +First source clang++ compiler. Then, ``` -cmake -G "Visual Studio 16 2019 Win64" .. +mkdir build +cd build +CXX=clang++ cmake -DUSE_NVIDIA_BACKEND=YES -DUSE_SM=80 .. +make -sj ``` +Note: +- We use the flag `-DUSE_SM=80` for A100 or `-DUSE_SM=90` for H100. -# Building +# Run instructions -You can build within Visual Studio, or from the command line with: +After building, to run the workload, cd into the build folder. Then ``` -cmake --build . --config Release +./hashtable_sycl +``` +By default a verification is done and that takes some time. To skip verification: ``` +./hashtable_sycl --no-verify +``` +# Output + +Output gives number of keys per second. diff --git a/screenshot.png b/screenshot.png deleted file mode 100644 index b8c09fd..0000000 Binary files a/screenshot.png and /dev/null differ diff --git a/src/acas.h b/src/acas.h new file mode 100644 index 0000000..1a755bd --- /dev/null +++ b/src/acas.h @@ -0,0 +1,66 @@ +/* Modifications Copyright (C) 2023 Intel Corporation + * + * This Program is subject to the terms of The Unlicense.​ + * If a copy of the license was not distributed with this file, ​ + * you can obtain one at https://spdx.org/licenses/Unlicense.html​ + *​ + * + * SPDX-License-Identifier: Unlicense + */ + +//===----------------------------------------------------------------------===// +// +// Following code is copied from atomic.hpp of dpct +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace acas { + +/// Atomically compare the value at \p addr to the value expected and exchange +/// with the value desired if the value at \p addr is equal to the value expected. +/// Returns the value at the \p addr before the call. +/// \param [in, out] addr Multi_ptr. +/// \param expected The value to compare against the value at \p addr. +/// \param desired The value to assign to \p addr if the value at \p addr is expected. +/// \param success The memory ordering used when comparison succeeds. +/// \param fail The memory ordering used when comparison fails. +/// \returns The value at the \p addr before the call. +template +T atomic_compare_exchange_strong( + sycl::multi_ptr addr, + T expected, + T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed +) { + // sycl::atomic_ref obj(addr); + sycl::atomic_ref obj(addr[0]); + obj.compare_exchange_strong(expected, desired, success, fail); + return expected; +} + +/// Atomically compare the value at \p addr to the value expected and exchange +/// with the value desired if the value at \p addr is equal to the value expected. +/// Returns the value at the \p addr before the call. +/// \param [in] addr The pointer to the data. +/// \param expected The value to compare against the value at \p addr. +/// \param desired The value to assign to \p addr if the value at \p addr is expected. +/// \param success The memory ordering used when comparison succeeds. +/// \param fail The memory ordering used when comparison fails. +/// \returns The value at the \p addr before the call. +template +T atomic_compare_exchange_strong( + T* addr, + T expected, + T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed +) { + return atomic_compare_exchange_strong(sycl::multi_ptr(addr), expected, desired, success, fail); +} + +} // namespace acas diff --git a/src/linearprobing.cpp b/src/linearprobing.cpp new file mode 100644 index 0000000..0642443 --- /dev/null +++ b/src/linearprobing.cpp @@ -0,0 +1,313 @@ +/* Modifications Copyright (C) 2023 Intel Corporation + * + * This Program is subject to the terms of The Unlicense.​ + * If a copy of the license was not distributed with this file, ​ + * you can obtain one at https://spdx.org/licenses/Unlicense.html​ + *​ + * + * SPDX-License-Identifier: Unlicense + */ + +#include "stdio.h" +#include "stdint.h" +#include "vector" + +#define CPP_MODULE "KERNEL" +#include "linearprobing.h" + +#include +#include +#include "acas.h" + +// 32 bit Murmur3 hash +uint32_t hash(uint32_t k) +{ + k ^= k >> 16; + k *= 0x85ebca6b; + k ^= k >> 13; + k *= 0xc2b2ae35; + k ^= k >> 16; + return k & (kHashTableCapacity - 1); +} + +// Create a hash table. For linear probing, this is just an array of KeyValues +KeyValue* create_hashtable(sycl::queue& qht) +{ + KeyValue* hashtable; + + try { + // Allocate memory + hashtable = sycl::malloc_device(kHashTableCapacity, qht); + + // Initialize hash table to empty + static_assert(kEmpty == 0xFFFFFFFF, "memset expected kEmpty=0xFFFFFFFF"); + qht.memset(hashtable, 0xFF, sizeof(KeyValue) * kHashTableCapacity); + qht.wait(); + } catch (std::exception const& e) { + LOG_ERROR("Exception caught, \'" << e.what() << "\'"); + } catch (...) { + LOG_ERROR("Unknown exception caught, bailing..."); + } + + return hashtable; +} + +// Insert the key/values in kvs into the hashtable +void gpu_hashtable_insert( + KeyValue* hashtable, + const KeyValue* kvs, + unsigned int numkvs, + sycl::nd_item<1> item) +{ + unsigned int tid = item.get_global_id(0); + if (tid < numkvs) { + uint32_t key = kvs[tid].key; + uint32_t value = kvs[tid].value; + uint32_t slot = hash(key); + + while (true) { + uint32_t prev = acas::atomic_compare_exchange_strong(&hashtable[slot].key, kEmpty, key); + if (prev == kEmpty || prev == key) { + hashtable[slot].value = value; + return; + } + + slot = (slot + 1) & (kHashTableCapacity - 1); + } + } +} + +void insert_hashtable( + KeyValue* pHashTable, // hashtable + const KeyValue* kvs, // starting position for this batch of key-value pairs + uint32_t num_kvs, // number of key-value pairs in this batch + sycl::queue& qht) +{ + try { + // Copy this batch of key-value pairs to the device + KeyValue* device_kvs; + device_kvs = sycl::malloc_device(num_kvs, qht); + auto e1 = qht.memcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs); + + int threadblocksize = 256; // perf does not seem to vary w/ thread block size (for all kernels in hashtable) + + // Create events for GPU timing + qht.parallel_for( + sycl::nd_range<1>(num_kvs, threadblocksize), std::move(e1), + [=](sycl::nd_item<1> item) { + + gpu_hashtable_insert( + pHashTable, + device_kvs, + (uint32_t)num_kvs, + item); + } + ); + qht.wait(); + + sycl::free(device_kvs, qht); + } catch (std::exception const& e) { + LOG_ERROR("Exception caught, \'" << e.what() << "\'"); + } catch (...) { + LOG_ERROR("Unknown exception caught, bailing..."); + } +} + +// Lookup keys in the hashtable, and return the values +void gpu_hashtable_lookup( + KeyValue* hashtable, + KeyValue* kvs, + unsigned int numkvs, + sycl::nd_item<1> item) +{ + unsigned int tid = item.get_global_id(0); + if (tid < numkvs) { + uint32_t key = kvs[tid].key; + uint32_t slot = hash(key); + + while (true) { + if (hashtable[slot].key == key) { + kvs[tid].value = hashtable[slot].value; + return; + } + if (hashtable[slot].key == kEmpty) { + kvs[tid].value = kEmpty; + return; + } + slot = (slot + 1) & (kHashTableCapacity - 1); + } + } +} + +void lookup_hashtable( + KeyValue* pHashTable, + KeyValue* kvs, + uint32_t num_kvs, + sycl::queue& qht) +{ + try { + // Copy this batch of key-value pairs to the device + KeyValue* device_kvs; + device_kvs = sycl::malloc_device(num_kvs, qht); + auto e1 = qht.memcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs); + + int threadblocksize = 256; + + qht.parallel_for( + sycl::nd_range<1>(num_kvs, threadblocksize), std::move(e1), + [=](sycl::nd_item<1> item) { + + gpu_hashtable_lookup( + pHashTable, + device_kvs, + (uint32_t)num_kvs, + item); + } + ); + qht.wait(); + + sycl::free(device_kvs, qht); + } catch (std::exception const& e) { + LOG_ERROR("Exception caught, \'" << e.what() << "\'"); + } catch (...) { + LOG_ERROR("Unknown exception caught, bailing..."); + } +} + +// Delete each key in kvs from the hash table, if the key exists +// A deleted key is left in the hash table, but its value is set to kEmpty +// Deleted keys are not reused; once a key is assigned a slot, it never moves +void gpu_hashtable_delete( + KeyValue* hashtable, + const KeyValue* kvs, + unsigned int numkvs, + sycl::nd_item<1> item) +{ + unsigned int tid = item.get_global_id(0); + if (tid < numkvs) { + uint32_t key = kvs[tid].key; + uint32_t slot = hash(key); + + while (true) { + if (hashtable[slot].key == key) { + hashtable[slot].value = kEmpty; + return; + } + if (hashtable[slot].key == kEmpty) { + return; + } + slot = (slot + 1) & (kHashTableCapacity - 1); + } + } +} + +void delete_hashtable( + KeyValue* pHashTable, + const KeyValue* kvs, + uint32_t num_kvs, + sycl::queue& qht) +{ + try { + // Copy the keyvalues to the GPU + KeyValue* device_kvs; + device_kvs = sycl::malloc_device(num_kvs, qht); + auto e1 = qht.memcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs); + + int threadblocksize = 256; + + qht.parallel_for( + sycl::nd_range<1>(num_kvs, threadblocksize), std::move(e1), + [=](sycl::nd_item<1> item) { + + gpu_hashtable_delete( + pHashTable, + device_kvs, + (uint32_t)num_kvs, + item) ; + } + ); + qht.wait(); + + sycl::free(device_kvs, qht); + } catch (std::exception const& e) { + LOG_ERROR("Exception caught, \'" << e.what() << "\'"); + } catch (...) { + LOG_ERROR("Unknown exception caught, bailing..."); + } +} + +// Iterate over every item in the hashtable; return non-empty key/values +void gpu_iterate_hashtable( + KeyValue* pHashTable, + KeyValue* kvs, + uint32_t* kvs_size, + sycl::nd_item<1> item) +{ + unsigned int tid = item.get_global_id(0); + if (tid < kHashTableCapacity) { + if (pHashTable[tid].key != kEmpty) { + uint32_t value = pHashTable[tid].value; + if (value != kEmpty) { + // uint32_t size = sycl::atomic(sycl::global_ptr(kvs_size)).fetch_add(1); + uint32_t size = sycl::atomic_ref(kvs_size[0]).fetch_add(1); + kvs[size] = pHashTable[tid]; + } + } + } +} + +std::vector iterate_hashtable( + KeyValue* pHashTable, + sycl::queue& qht) +{ + std::vector kvs; + + try { + uint32_t* device_num_kvs; + KeyValue* device_kvs; + device_num_kvs = sycl::malloc_device(1, qht); + device_kvs = sycl::malloc_device(kNumKeyValues, qht); + + auto e1 = qht.memset(device_num_kvs, 0, sizeof(uint32_t)); + + int threadblocksize = 256; + + auto e2 = qht.parallel_for( + sycl::nd_range<1>(kHashTableCapacity, threadblocksize), std::move(e1), + [=](sycl::nd_item<1> item) { + + gpu_iterate_hashtable( + pHashTable, + device_kvs, + device_num_kvs, + item); + } + ); + + uint32_t num_kvs; + qht.memcpy(&num_kvs, device_num_kvs, sizeof(uint32_t), std::move(e2)); + qht.wait(); + + kvs.resize(num_kvs); + + qht.memcpy(kvs.data(), device_kvs, sizeof(KeyValue) * num_kvs); + qht.wait(); + + sycl::free(device_kvs, qht); + sycl::free(device_num_kvs, qht); + } catch (std::exception const& e) { + LOG_ERROR("Exception caught, \'" << e.what() << "\'"); + } catch (...) { + LOG_ERROR("Unknown exception caught, bailing..."); + } + + return kvs; +} + +// Free the memory of the hashtable +void destroy_hashtable( + KeyValue* pHashTable, + sycl::queue& qht) +{ + sycl::free(pHashTable, qht); +} diff --git a/src/linearprobing.cu b/src/linearprobing.cu deleted file mode 100644 index ac75d9d..0000000 --- a/src/linearprobing.cu +++ /dev/null @@ -1,268 +0,0 @@ -#include "stdio.h" -#include "stdint.h" -#include "vector" -#include "linearprobing.h" - -// 32 bit Murmur3 hash -__device__ uint32_t hash(uint32_t k) -{ - k ^= k >> 16; - k *= 0x85ebca6b; - k ^= k >> 13; - k *= 0xc2b2ae35; - k ^= k >> 16; - return k & (kHashTableCapacity-1); -} - -// Create a hash table. For linear probing, this is just an array of KeyValues -KeyValue* create_hashtable() -{ - // Allocate memory - KeyValue* hashtable; - cudaMalloc(&hashtable, sizeof(KeyValue) * kHashTableCapacity); - - // Initialize hash table to empty - static_assert(kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff"); - cudaMemset(hashtable, 0xff, sizeof(KeyValue) * kHashTableCapacity); - - return hashtable; -} - -// Insert the key/values in kvs into the hashtable -__global__ void gpu_hashtable_insert(KeyValue* hashtable, const KeyValue* kvs, unsigned int numkvs) -{ - unsigned int threadid = blockIdx.x*blockDim.x + threadIdx.x; - if (threadid < numkvs) - { - uint32_t key = kvs[threadid].key; - uint32_t value = kvs[threadid].value; - uint32_t slot = hash(key); - - while (true) - { - uint32_t prev = atomicCAS(&hashtable[slot].key, kEmpty, key); - if (prev == kEmpty || prev == key) - { - hashtable[slot].value = value; - return; - } - - slot = (slot + 1) & (kHashTableCapacity-1); - } - } -} - -void insert_hashtable(KeyValue* pHashTable, const KeyValue* kvs, uint32_t num_kvs) -{ - // Copy the keyvalues to the GPU - KeyValue* device_kvs; - cudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs); - cudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice); - - // Have CUDA calculate the thread block size - int mingridsize; - int threadblocksize; - cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0); - - // Create events for GPU timing - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - - // Insert all the keys into the hash table - int gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize; - gpu_hashtable_insert<<>>(pHashTable, device_kvs, (uint32_t)num_kvs); - - cudaEventRecord(stop); - - cudaEventSynchronize(stop); - - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - float seconds = milliseconds / 1000.0f; - printf(" GPU inserted %d items in %f ms (%f million keys/second)\n", - num_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f); - - cudaFree(device_kvs); -} - -// Lookup keys in the hashtable, and return the values -__global__ void gpu_hashtable_lookup(KeyValue* hashtable, KeyValue* kvs, unsigned int numkvs) -{ - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - if (threadid < numkvs) - { - uint32_t key = kvs[threadid].key; - uint32_t slot = hash(key); - - while (true) - { - if (hashtable[slot].key == key) - { - kvs[threadid].value = hashtable[slot].value; - return; - } - if (hashtable[slot].key == kEmpty) - { - kvs[threadid].value = kEmpty; - return; - } - slot = (slot + 1) & (kHashTableCapacity - 1); - } - } -} - -void lookup_hashtable(KeyValue* pHashTable, KeyValue* kvs, uint32_t num_kvs) -{ - // Copy the keyvalues to the GPU - KeyValue* device_kvs; - cudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs); - cudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice); - - // Have CUDA calculate the thread block size - int mingridsize; - int threadblocksize; - cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0); - - // Create events for GPU timing - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - - // Insert all the keys into the hash table - int gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize; - gpu_hashtable_lookup << > > (pHashTable, device_kvs, (uint32_t)num_kvs); - - cudaEventRecord(stop); - - cudaEventSynchronize(stop); - - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - float seconds = milliseconds / 1000.0f; - printf(" GPU lookup %d items in %f ms (%f million keys/second)\n", - num_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f); - - cudaFree(device_kvs); -} - -// Delete each key in kvs from the hash table, if the key exists -// A deleted key is left in the hash table, but its value is set to kEmpty -// Deleted keys are not reused; once a key is assigned a slot, it never moves -__global__ void gpu_hashtable_delete(KeyValue* hashtable, const KeyValue* kvs, unsigned int numkvs) -{ - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - if (threadid < numkvs) - { - uint32_t key = kvs[threadid].key; - uint32_t slot = hash(key); - - while (true) - { - if (hashtable[slot].key == key) - { - hashtable[slot].value = kEmpty; - return; - } - if (hashtable[slot].key == kEmpty) - { - return; - } - slot = (slot + 1) & (kHashTableCapacity - 1); - } - } -} - -void delete_hashtable(KeyValue* pHashTable, const KeyValue* kvs, uint32_t num_kvs) -{ - // Copy the keyvalues to the GPU - KeyValue* device_kvs; - cudaMalloc(&device_kvs, sizeof(KeyValue) * num_kvs); - cudaMemcpy(device_kvs, kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyHostToDevice); - - // Have CUDA calculate the thread block size - int mingridsize; - int threadblocksize; - cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_hashtable_insert, 0, 0); - - // Create events for GPU timing - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - - // Insert all the keys into the hash table - int gridsize = ((uint32_t)num_kvs + threadblocksize - 1) / threadblocksize; - gpu_hashtable_delete<< > > (pHashTable, device_kvs, (uint32_t)num_kvs); - - cudaEventRecord(stop); - - cudaEventSynchronize(stop); - - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, start, stop); - float seconds = milliseconds / 1000.0f; - printf(" GPU delete %d items in %f ms (%f million keys/second)\n", - num_kvs, milliseconds, num_kvs / (double)seconds / 1000000.0f); - - cudaFree(device_kvs); -} - -// Iterate over every item in the hashtable; return non-empty key/values -__global__ void gpu_iterate_hashtable(KeyValue* pHashTable, KeyValue* kvs, uint32_t* kvs_size) -{ - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - if (threadid < kHashTableCapacity) - { - if (pHashTable[threadid].key != kEmpty) - { - uint32_t value = pHashTable[threadid].value; - if (value != kEmpty) - { - uint32_t size = atomicAdd(kvs_size, 1); - kvs[size] = pHashTable[threadid]; - } - } - } -} - -std::vector iterate_hashtable(KeyValue* pHashTable) -{ - uint32_t* device_num_kvs; - cudaMalloc(&device_num_kvs, sizeof(uint32_t)); - cudaMemset(device_num_kvs, 0, sizeof(uint32_t)); - - KeyValue* device_kvs; - cudaMalloc(&device_kvs, sizeof(KeyValue) * kNumKeyValues); - - int mingridsize; - int threadblocksize; - cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, gpu_iterate_hashtable, 0, 0); - - int gridsize = (kHashTableCapacity + threadblocksize - 1) / threadblocksize; - gpu_iterate_hashtable<<>>(pHashTable, device_kvs, device_num_kvs); - - uint32_t num_kvs; - cudaMemcpy(&num_kvs, device_num_kvs, sizeof(uint32_t), cudaMemcpyDeviceToHost); - - std::vector kvs; - kvs.resize(num_kvs); - - cudaMemcpy(kvs.data(), device_kvs, sizeof(KeyValue) * num_kvs, cudaMemcpyDeviceToHost); - - cudaFree(device_kvs); - cudaFree(device_num_kvs); - - return kvs; -} - -// Free the memory of the hashtable -void destroy_hashtable(KeyValue* pHashTable) -{ - cudaFree(pHashTable); -} diff --git a/src/linearprobing.h b/src/linearprobing.h index 6487e8b..5cadba3 100644 --- a/src/linearprobing.h +++ b/src/linearprobing.h @@ -1,25 +1,63 @@ +/* Modifications Copyright (C) 2023 Intel Corporation + * + * This Program is subject to the terms of The Unlicense.​ + * If a copy of the license was not distributed with this file, ​ + * you can obtain one at https://spdx.org/licenses/Unlicense.html​ + *​ + * + * SPDX-License-Identifier: Unlicense + */ + #pragma once +#include +#include +#include + +#ifndef CPP_MODULE +#define CPP_MODULE "UNKN" +#endif + struct KeyValue { uint32_t key; uint32_t value; }; -const uint32_t kHashTableCapacity = 128 * 1024 * 1024; +const uint32_t kHashTableCapacity = 256 * 1024 * 1024; const uint32_t kNumKeyValues = kHashTableCapacity / 2; -const uint32_t kEmpty = 0xffffffff; +const uint32_t kEmpty = 0xFFFFFFFF; + +const uint32_t NUM_LOOPS = 1; -KeyValue* create_hashtable(); +KeyValue* create_hashtable(sycl::queue& qht); -void insert_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs); +void insert_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs, sycl::queue& qht); +void lookup_hashtable(KeyValue* hashtable, KeyValue* kvs, uint32_t num_kvs, sycl::queue& qht); +void delete_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs, sycl::queue& qht); -void lookup_hashtable(KeyValue* hashtable, KeyValue* kvs, uint32_t num_kvs); +std::vector iterate_hashtable(KeyValue* hashtable, sycl::queue& qht); -void delete_hashtable(KeyValue* hashtable, const KeyValue* kvs, uint32_t num_kvs); +void destroy_hashtable(KeyValue* hashtable, sycl::queue& qht); -std::vector iterate_hashtable(KeyValue* hashtable); +#define checkCUDA(expression) \ +{ \ + cudaError_t const status(expression); \ + if (status != cudaSuccess) { \ + std::stringstream sErrorMessage; \ + sErrorMessage << "Error on line " << __LINE__ << ": " \ + << cudaGetErrorString(status) << "\n"; \ + throw std::runtime_error(sErrorMessage.str()); \ + std::exit(EXIT_FAILURE); \ + } \ +} -void destroy_hashtable(KeyValue* hashtable); +#define LOG_ERROR(msg) \ +{ \ + std::stringstream sErrorMessage; \ + sErrorMessage << CPP_MODULE << " ERROR(" << __LINE__<< "): " << msg << "\n"; \ + std::cerr << sErrorMessage.str(); \ + throw std::runtime_error(sErrorMessage.str()); \ +} diff --git a/src/main.cpp b/src/main.cpp index 9b15dc2..c7f8e32 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,3 +1,13 @@ +/* Modifications Copyright (C) 2023 Intel Corporation + * + * This Program is subject to the terms of The Unlicense.​ + * If a copy of the license was not distributed with this file, ​ + * you can obtain one at https://spdx.org/licenses/Unlicense.html​ + *​ + * + * SPDX-License-Identifier: Unlicense + */ + #include "algorithm" #include "random" #include "stdint.h" @@ -6,11 +16,48 @@ #include "unordered_set" #include "vector" #include "chrono" +#include + +// #define DEBUG_TIME +#define CPP_MODULE "MAIN" #include "linearprobing.h" +#define TIMER_START() time_start = std::chrono::steady_clock::now(); +#define TIMER_END() \ + time_end = std::chrono::steady_clock::now(); \ + time_total = std::chrono::duration(time_end - time_start).count(); +#define TIMER_PRINT(name) std::cout << name <<": " << time_total / 1e3 << " s\n"; + +#ifdef DEBUG_TIME +#define START_TIMER() start_time = std::chrono::steady_clock::now(); +#define STOP_TIMER() \ + stop_time = std::chrono::steady_clock::now(); \ + duration = std::chrono::duration(stop_time - start_time).count(); \ + tot_time += duration; +#define PRINT_TIMER(name) std::cout <; + +Time start_timer() +{ + return std::chrono::steady_clock::now(); +} + +double get_elapsed_time(Time start) +{ + Time end = std::chrono::steady_clock::now(); + + std::chrono::duration d = end - start; + std::chrono::microseconds us = std::chrono::duration_cast(d); + return us.count() / 1000.0f; +} + // Create random keys/values in the range [0, kEmpty) // kEmpty is used to indicate an empty slot -std::vector generate_random_keyvalues(std::mt19937& rnd, uint32_t numkvs) +std::vector generate_random_keyvalues( + std::mt19937& rnd, + uint32_t numkvs) { std::uniform_int_distribution dis(0, kEmpty - 1); @@ -28,7 +75,10 @@ std::vector generate_random_keyvalues(std::mt19937& rnd, uint32_t numk } // return numshuffledkvs random items from kvs -std::vector shuffle_keyvalues(std::mt19937& rnd, std::vector kvs, uint32_t numshuffledkvs) +std::vector shuffle_keyvalues( + std::mt19937& rnd, + std::vector kvs, + uint32_t numshuffledkvs) { std::shuffle(kvs.begin(), kvs.end(), rnd); @@ -40,23 +90,9 @@ std::vector shuffle_keyvalues(std::mt19937& rnd, std::vector return shuffled_kvs; } -using Time = std::chrono::time_point; - -Time start_timer() -{ - return std::chrono::high_resolution_clock::now(); -} - -double get_elapsed_time(Time start) -{ - Time end = std::chrono::high_resolution_clock::now(); - - std::chrono::duration d = end - start; - std::chrono::microseconds us = std::chrono::duration_cast(d); - return us.count() / 1000.0f; -} - -void test_unordered_map(std::vector insert_kvs, std::vector delete_kvs) +void test_unordered_map( + std::vector insert_kvs, + std::vector delete_kvs) { Time timer = start_timer(); @@ -78,69 +114,161 @@ void test_unordered_map(std::vector insert_kvs, std::vector double milliseconds = get_elapsed_time(timer); double seconds = milliseconds / 1000.0f; - printf("Total time for std::unordered_map: %f ms (%f million keys/second)\n", + printf("Total time for std::unordered_map: %f ms (%f Mkeys/second)\n", milliseconds, kNumKeyValues / seconds / 1000000.0f); } -void test_correctness(std::vector, std::vector, std::vector); +void test_correctness( + std::vector, + std::vector, + std::vector); -int main() +int main(int argc, char* argv[]) { + std::chrono::steady_clock::time_point time_start; + std::chrono::steady_clock::time_point time_end; + double time_total = 0.0; + + try { + // To recreate the same random numbers across runs of the program, set seed to a specific // number instead of a number from random_device std::random_device rd; uint32_t seed = rd(); std::mt19937 rnd(seed); // mersenne_twister_engine - printf("Random number generator seed = %u\n", seed); + // printf("Random number generator seed = %u\n", seed); - while (true) - { - printf("Initializing keyvalue pairs with random numbers...\n"); + // double seconds; + // for (uint32_t n = 0; n < NUM_LOOPS; ++n) { + // printf("Initializing keyvalue pairs with random numbers...\n"); + +#ifdef DEBUG_TIME + std::chrono::steady_clock::time_point start_time; + std::chrono::steady_clock::time_point stop_time; + double duration = 0.0; + double tot_time = 0.0; + START_TIMER(); +#endif std::vector insert_kvs = generate_random_keyvalues(rnd, kNumKeyValues); + std::vector lookup_kvs = shuffle_keyvalues(rnd, insert_kvs, kNumKeyValues / 2); std::vector delete_kvs = shuffle_keyvalues(rnd, insert_kvs, kNumKeyValues / 2); - // Begin test - printf("Testing insertion/deletion of %d/%d elements into GPU hash table...\n", - (uint32_t)insert_kvs.size(), (uint32_t)delete_kvs.size()); - - Time timer = start_timer(); - - KeyValue* pHashTable = create_hashtable(); - - // Insert items into the hash table +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("generate_hashtable "); +#endif + + TIMER_START() + +#ifdef DEBUG_TIME +START_TIMER(); +#endif + sycl::queue qht; +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("init "); + +START_TIMER(); +#endif + // Allocates device memory for the hashtable and + // fills every byte with 0xFF (so each key (and value) is set to 0xFFFFFFFF) + KeyValue* pHashTable = create_hashtable(qht); +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("create_hashtable "); + +START_TIMER(); +#endif + // Insert items into the hash table in batches of num_inserts_per_batch const uint32_t num_insert_batches = 16; uint32_t num_inserts_per_batch = (uint32_t)insert_kvs.size() / num_insert_batches; - for (uint32_t i = 0; i < num_insert_batches; i++) - { - insert_hashtable(pHashTable, insert_kvs.data() + i * num_inserts_per_batch, num_inserts_per_batch); + for (uint32_t i = 0; i < num_insert_batches; i++) { + + insert_hashtable( + pHashTable, + insert_kvs.data() + i * num_inserts_per_batch, + num_inserts_per_batch, + qht); + } +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("insert_hashtable "); + +START_TIMER(); +#endif + // Look up items from the hash table in batches of num_lookups_per_batch + const uint32_t num_lookup_batches = 8; + uint32_t num_lookups_per_batch = (uint32_t)lookup_kvs.size() / num_lookup_batches; + for (uint32_t i = 0; i < num_lookup_batches; i++) { + + lookup_hashtable( + pHashTable, + lookup_kvs.data() + i * num_lookups_per_batch, + num_lookups_per_batch, + qht); } +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("lookup_hashtable "); - // Delete items from the hash table +START_TIMER(); +#endif + // Delete items from the hash table in batches of num_deletes_per_batch const uint32_t num_delete_batches = 8; uint32_t num_deletes_per_batch = (uint32_t)delete_kvs.size() / num_delete_batches; - for (uint32_t i = 0; i < num_delete_batches; i++) - { - delete_hashtable(pHashTable, delete_kvs.data() + i * num_deletes_per_batch, num_deletes_per_batch); + for (uint32_t i = 0; i < num_delete_batches; i++) { + + delete_hashtable( + pHashTable, + delete_kvs.data() + i * num_deletes_per_batch, + num_deletes_per_batch, + qht); } +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("delete_hashtable "); +START_TIMER(); +#endif // Get all the key-values from the hash table - std::vector kvs = iterate_hashtable(pHashTable); - - destroy_hashtable(pHashTable); + std::vector kvs = iterate_hashtable(pHashTable, qht); +#ifdef DEBUG_TIME +STOP_TIMER(); +PRINT_TIMER("iterate_hashtable "); +#endif +// std::cout << "tot_time: " << tot_time << " ms" << std::endl; // Summarize results - double milliseconds = get_elapsed_time(timer); - double seconds = milliseconds / 1000.0f; - printf("Total time (including memory copies, readback, etc): %f ms (%f million keys/second)\n", milliseconds, - kNumKeyValues / seconds / 1000000.0f); + // double milliseconds = get_elapsed_time(timer); + // seconds = milliseconds / 1000.0f; - test_unordered_map(insert_kvs, delete_kvs); + destroy_hashtable(pHashTable, qht); - test_correctness(insert_kvs, delete_kvs, kvs); + TIMER_END() + TIMER_PRINT("hashtable - total time for whole calculation") + printf("%f million keys/second\n", kNumKeyValues / (time_total / 1000.0f) / 1000000.0f); - printf("Success\n"); + bool verify = true; + if (argc > 1 && strcmp(argv[1], "--no-verify") == 0) { + verify = false; + } + if (verify) { + test_unordered_map(insert_kvs, delete_kvs); + test_correctness(insert_kvs, delete_kvs, std::move(kvs)); + printf("Success\n"); + } + // } + // printf("Total time: %f s\n", seconds); + // printf("%f million keys/second\n", kNumKeyValues / seconds / 1000000.0f); + + } catch (std::exception const& e) { + std::cout << "Exception caught, \'" << e.what() << "\'"; + return 1; + } catch (...) { + std::cout << "Unknown exception caught, bailing..."; + return 2; } return 0; diff --git a/src/test.cpp b/src/test.cpp index ad454a4..4b8aef4 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -1,3 +1,13 @@ +/* Modifications Copyright (C) 2023 Intel Corporation + * + * This Program is subject to the terms of The Unlicense.​ + * If a copy of the license was not distributed with this file, ​ + * you can obtain one at https://spdx.org/licenses/Unlicense.html​ + *​ + * + * SPDX-License-Identifier: Unlicense + */ + #include "stdio.h" #include "stdint.h" #include "unordered_set" @@ -7,7 +17,10 @@ #include "random" #include "linearprobing.h" -void test_correctness(std::vector insert_kvs, std::vector delete_kvs, std::vector kvs) +void test_correctness( + std::vector insert_kvs, + std::vector delete_kvs, + std::vector kvs) { printf("Testing that there are no duplicate keys...\n"); std::unordered_set unique_keys; @@ -27,7 +40,7 @@ void test_correctness(std::vector insert_kvs, std::vector de printf("Building unordered_map from original list...\n"); std::unordered_map> all_kvs_map; - for (int i = 0; i < insert_kvs.size(); i++) + for (uint32_t i = 0; i < insert_kvs.size(); i++) { if (i % 10000000 == 0) printf(" Inserting %d/%d\n", i, (uint32_t)insert_kvs.size()); @@ -43,7 +56,7 @@ void test_correctness(std::vector insert_kvs, std::vector de } } - for (int i = 0; i < delete_kvs.size(); i++) + for (uint32_t i = 0; i < delete_kvs.size(); i++) { if (i % 10000000 == 0) printf(" Deleting %d/%d\n", i, (uint32_t)delete_kvs.size());