diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu index 074146ca2..9aa95e53b 100644 --- a/benchmarks/hash_bench.cu +++ b/benchmarks/hash_bench.cu @@ -38,6 +38,20 @@ struct large_key { int32_t data_[Words]; }; +template +constexpr __host__ __device__ void hash_result_aggregate(T& agg, T hash_val) +{ + agg += hash_val; +} + +template <> +constexpr __host__ __device__ void hash_result_aggregate(cuda::std::array& agg, + cuda::std::array hash_val) +{ + agg[0] += hash_val[0]; + agg[1] += hash_val[1]; +} + template __global__ void hash_bench_kernel(Hasher hash, cuco::detail::index_type n, @@ -47,12 +61,12 @@ __global__ void hash_bench_kernel(Hasher hash, cuco::detail::index_type const gid = BlockSize * blockIdx.x + threadIdx.x; cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; cuco::detail::index_type idx = gid; - typename Hasher::result_type agg = 0; + typename Hasher::result_type agg = {}; while (idx < n) { typename Hasher::argument_type key(idx); for (int32_t i = 0; i < 100; ++i) { // execute hash func 100 times - agg += hash(key); + hash_result_aggregate(agg, hash(key)); } idx += loop_stride; } @@ -94,7 +108,10 @@ NVBENCH_BENCH_TYPES( cuco::xxhash_64, cuco::xxhash_64>, cuco::murmurhash3_fmix_32, - cuco::murmurhash3_fmix_64>)) + cuco::murmurhash3_fmix_64, + cuco::murmurhash3_x64_128, + cuco::murmurhash3_x64_128, + cuco::murmurhash3_x64_128>>)) .set_name("hash_function_eval") .set_type_axes_names({"Hash"}) .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index 7703217be..1dd0905ce 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -206,4 +208,151 @@ struct MurmurHash3_32 { MurmurHash3_fmix32 fmix32_; std::uint32_t seed_; }; + +/** + * @brief A `MurmurHash3_x64_128` hash function to hash the given argument on host and device. + * + * MurmurHash3_x64_128 implementation from + * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp + * ----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author + * hereby disclaims copyright to this source code. + * + * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are + * optimized for their respective platforms. You can still compile and run any of them on any + * platform, but your performance with the non-native version will be less than optimal. + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_x64_128 { + using argument_type = Key; ///< The type of the values taken as argument + using result_type = cuda::std::array; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_x64_128 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_x64_128(std::uint64_t seed = 0) + : fmix64_{0}, seed_{seed} + { + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return The resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + return compute_hash(reinterpret_cast(&key), + cuco::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { + constexpr std::uint32_t block_size = 16; + auto const nblocks = size / block_size; + + std::uint64_t h1 = seed_; + std::uint64_t h2 = seed_; + constexpr std::uint64_t c1 = 0x87c37b91114253d5ull; + constexpr std::uint64_t c2 = 0x4cf5ad432745937full; + //---------- + // body + for (std::remove_const_t i = 0; size >= block_size && i < nblocks; i++) { + std::uint64_t k1 = load_chunk(bytes, 2 * i); + std::uint64_t k2 = load_chunk(bytes, 2 * i + 1); + + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + + h1 ^= k1; + h1 = rotl64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + + h2 ^= k2; + h2 = rotl64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + //---------- + // tail + std::uint64_t k1 = 0; + std::uint64_t k2 = 0; + auto const tail = reinterpret_cast(bytes) + nblocks * block_size; + switch (size & (block_size - 1)) { + case 15: k2 ^= static_cast(tail[14]) << 48; [[fallthrough]]; + case 14: k2 ^= static_cast(tail[13]) << 40; [[fallthrough]]; + case 13: k2 ^= static_cast(tail[12]) << 32; [[fallthrough]]; + case 12: k2 ^= static_cast(tail[11]) << 24; [[fallthrough]]; + case 11: k2 ^= static_cast(tail[10]) << 16; [[fallthrough]]; + case 10: k2 ^= static_cast(tail[9]) << 8; [[fallthrough]]; + case 9: + k2 ^= static_cast(tail[8]) << 0; + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + h2 ^= k2; + [[fallthrough]]; + + case 8: k1 ^= static_cast(tail[7]) << 56; [[fallthrough]]; + case 7: k1 ^= static_cast(tail[6]) << 48; [[fallthrough]]; + case 6: k1 ^= static_cast(tail[5]) << 40; [[fallthrough]]; + case 5: k1 ^= static_cast(tail[4]) << 32; [[fallthrough]]; + case 4: k1 ^= static_cast(tail[3]) << 24; [[fallthrough]]; + case 3: k1 ^= static_cast(tail[2]) << 16; [[fallthrough]]; + case 2: k1 ^= static_cast(tail[1]) << 8; [[fallthrough]]; + case 1: + k1 ^= static_cast(tail[0]) << 0; + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + h1 ^= k1; + }; + //---------- + // finalization + h1 ^= size; + h2 ^= size; + + h1 += h2; + h2 += h1; + + h1 = fmix64_(h1); + h2 = fmix64_(h2); + + h1 += h2; + h2 += h1; + + return {h1, h2}; + } + + private: + constexpr __host__ __device__ std::uint64_t rotl64(std::uint64_t x, std::int8_t r) const noexcept + { + return (x << r) | (x >> (64 - r)); + } + + MurmurHash3_fmix64 fmix64_; + std::uint64_t seed_; +}; } // namespace cuco::detail diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh index 000f46fef..93a2f6cab 100644 --- a/include/cuco/hash_functions.cuh +++ b/include/cuco/hash_functions.cuh @@ -51,6 +51,14 @@ using murmurhash3_fmix_64 = detail::MurmurHash3_fmix64; template using murmurhash3_32 = detail::MurmurHash3_32; +/** + * @brief A 128-bit `MurmurHash3` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_x64_128 = detail::MurmurHash3_x64_128; + /** * @brief A 32-bit `XXH32` hash function to hash the given argument on host and device. * diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index ccecd52dc..478b05943 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -39,9 +39,9 @@ struct large_key { int32_t data_[Words]; }; -template +template __host__ __device__ bool check_hash_result(typename Hash::argument_type const& key, - typename Hash::result_type seed, + Seed seed, typename Hash::result_type expected) noexcept { Hash h(seed); @@ -195,4 +195,94 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", CHECK(hash(key) == hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); } +} + +template +__global__ void check_murmurhash3_128_result_kernel(OutputIter result) +{ + int i = 0; + + result[i++] = check_hash_result, uint64_t>( + 0, 0, {14961230494313510588ull, 6383328099726337777ull}); + result[i++] = check_hash_result, uint64_t>( + 9, 0, {1779292183511753683ull, 16298496441448380334ull}); + result[i++] = check_hash_result, uint64_t>( + 42, 0, {2913627637088662735ull, 16344193523890567190ull}); + result[i++] = check_hash_result, uint64_t>( + 42, 42, {2248879576374326886ull, 18006515275339376488ull}); + result[i++] = + check_hash_result>, uint64_t>( + {2, 2}, 0, {12221386834995143465ull, 6690950894782946573ull}); + result[i++] = + check_hash_result>, uint64_t>( + {1, 4, 9}, 42, {299140022350411792ull, 9891903873182035274ull}); + result[i++] = + check_hash_result>, uint64_t>( + {42, 64, 108, 1024}, 63, {4333511168876981289ull, 4659486988434316416ull}); + result[i++] = + check_hash_result>, uint64_t>( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 1024, + {3302412811061286680ull, 7070355726356610672ull}); + result[i++] = + check_hash_result>, uint64_t>( + {2, 2}, 0, {8554944597931919519ull, 14938998000509429729ull}); + result[i++] = + check_hash_result>, uint64_t>( + {1, 4, 9}, 42, {13442629947720186435ull, 7061727494178573325ull}); + result[i++] = + check_hash_result>, uint64_t>( + {42, 64, 108, 1024}, 63, {8786399719555989948ull, 14954183901757012458ull}); + result[i++] = + check_hash_result>, uint64_t>( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 1024, + {15409921801541329777ull, 10546487400963404004ull}); +} + +TEST_CASE("Test cuco::murmurhash3_x64_128", "") +{ + // Reference hash values were computed using + // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp + + SECTION("Check if host-generated hash values match the reference implementation.") + { + CHECK(check_hash_result, uint64_t>( + 0, 0, {14961230494313510588ull, 6383328099726337777ull})); + CHECK(check_hash_result, uint64_t>( + 9, 0, {1779292183511753683ull, 16298496441448380334ull})); + CHECK(check_hash_result, uint64_t>( + 42, 0, {2913627637088662735ull, 16344193523890567190ull})); + CHECK(check_hash_result, uint64_t>( + 42, 42, {2248879576374326886ull, 18006515275339376488ull})); + CHECK(check_hash_result>, uint64_t>( + {2, 2}, 0, {12221386834995143465ull, 6690950894782946573ull})); + CHECK(check_hash_result>, uint64_t>( + {1, 4, 9}, 42, {299140022350411792ull, 9891903873182035274ull})); + CHECK(check_hash_result>, uint64_t>( + {42, 64, 108, 1024}, 63, {4333511168876981289ull, 4659486988434316416ull})); + CHECK(check_hash_result>, uint64_t>( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 1024, + {3302412811061286680ull, 7070355726356610672ull})); + CHECK(check_hash_result>, uint64_t>( + {2, 2}, 0, {8554944597931919519ull, 14938998000509429729ull})); + CHECK(check_hash_result>, uint64_t>( + {1, 4, 9}, 42, {13442629947720186435ull, 7061727494178573325ull})); + CHECK(check_hash_result>, uint64_t>( + {42, 64, 108, 1024}, 63, {8786399719555989948ull, 14954183901757012458ull})); + CHECK(check_hash_result>, uint64_t>( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + 1024, + {15409921801541329777ull, 10546487400963404004ull})); + } + + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(12, true); + + check_murmurhash3_128_result_kernel<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); + } } \ No newline at end of file