Add MurmurHash3 128 bit hash function (#495)

Adds MurmurHash3 128 bit hash function. - [x] Adds unit test (compares hash value based on [reference_implementation ](https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp) : [see_on_godbolt](https://godbolt.org/z/7sWKq74o6)). - [x] updates documentation. - [x] contributes to #480 --------- Signed-off-by: srinivasyadav18 <[email protected]>
NVIDIA · Jun 4, 2024 · 03304a3 · 03304a3
1 parent 114a780
commit 03304a3
Show file tree

Hide file tree

Showing 4 changed files with 269 additions and 5 deletions.
diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu
@@ -38,6 +38,20 @@ struct large_key {
   int32_t data_[Words];
 };
 
+template <typename T>
+constexpr __host__ __device__ void hash_result_aggregate(T& agg, T hash_val)
+{
+  agg += hash_val;
+}
+
+template <>
+constexpr __host__ __device__ void hash_result_aggregate(cuda::std::array<uint64_t, 2>& agg,
+                                                         cuda::std::array<uint64_t, 2> hash_val)
+{
+  agg[0] += hash_val[0];
+  agg[1] += hash_val[1];
+}
+
 template <int32_t BlockSize, typename Hasher, typename OutputIt>
 __global__ void hash_bench_kernel(Hasher hash,
                                   cuco::detail::index_type n,
@@ -47,12 +61,12 @@ __global__ void hash_bench_kernel(Hasher hash,
   cuco::detail::index_type const gid         = BlockSize * blockIdx.x + threadIdx.x;
   cuco::detail::index_type const loop_stride = gridDim.x * BlockSize;
   cuco::detail::index_type idx               = gid;
-  typename Hasher::result_type agg           = 0;
+  typename Hasher::result_type agg           = {};
 
   while (idx < n) {
     typename Hasher::argument_type key(idx);
     for (int32_t i = 0; i < 100; ++i) {  // execute hash func 100 times
-      agg += hash(key);
+      hash_result_aggregate(agg, hash(key));
     }
     idx += loop_stride;
   }
@@ -94,7 +108,10 @@ NVBENCH_BENCH_TYPES(
                                        cuco::xxhash_64<nvbench::int64_t>,
                                        cuco::xxhash_64<large_key<32>>,
                                        cuco::murmurhash3_fmix_32<nvbench::int32_t>,
-                                       cuco::murmurhash3_fmix_64<nvbench::int64_t>>))
+                                       cuco::murmurhash3_fmix_64<nvbench::int64_t>,
+                                       cuco::murmurhash3_x64_128<nvbench::int32_t>,
+                                       cuco::murmurhash3_x64_128<nvbench::int64_t>,
+                                       cuco::murmurhash3_x64_128<large_key<32>>>))
   .set_name("hash_function_eval")
   .set_type_axes_names({"Hash"})
   .set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -19,6 +19,8 @@
 #include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
+#include <cuda/std/array>
+
 #include <cstddef>
 #include <cstdint>
 #include <type_traits>
@@ -206,4 +208,151 @@ struct MurmurHash3_32 {
   MurmurHash3_fmix32<std::uint32_t> fmix32_;
   std::uint32_t seed_;
 };
+
+/**
+ * @brief A `MurmurHash3_x64_128` hash function to hash the given argument on host and device.
+ *
+ * MurmurHash3_x64_128 implementation from
+ * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+ * -----------------------------------------------------------------------------
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author
+ * hereby disclaims copyright to this source code.
+ *
+ * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are
+ * optimized for their respective platforms. You can still compile and run any of them on any
+ * platform, but your performance with the non-native version will be less than optimal.
+ *
+ * @tparam Key The type of the values to hash
+ */
+template <typename Key>
+struct MurmurHash3_x64_128 {
+  using argument_type = Key;  ///< The type of the values taken as argument
+  using result_type = cuda::std::array<std::uint64_t, 2>;  ///< The type of the hash values produced
+
+  /**
+   * @brief Constructs a MurmurHash3_x64_128 hash function with the given `seed`.
+   *
+   * @param seed A custom number to randomize the resulting hash value
+   */
+  __host__ __device__ constexpr MurmurHash3_x64_128(std::uint64_t seed = 0)
+    : fmix64_{0}, seed_{seed}
+  {
+  }
+
+  /**
+   * @brief Returns a hash value for its argument, as a value of type `result_type`.
+   *
+   * @param key The input argument to hash
+   * @return The resulting hash value for `key`
+   */
+  constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
+  {
+    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+                        cuco::extent<std::size_t, sizeof(Key)>{});
+  }
+
+  /**
+   * @brief Returns a hash value for its argument, as a value of type `result_type`.
+   *
+   * @tparam Extent The extent type
+   *
+   * @param bytes The input argument to hash
+   * @param size The extent of the data in bytes
+   * @return The resulting hash value
+   */
+  template <typename Extent>
+  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+                                                         Extent size) const noexcept
+  {
+    constexpr std::uint32_t block_size = 16;
+    auto const nblocks                 = size / block_size;
+
+    std::uint64_t h1           = seed_;
+    std::uint64_t h2           = seed_;
+    constexpr std::uint64_t c1 = 0x87c37b91114253d5ull;
+    constexpr std::uint64_t c2 = 0x4cf5ad432745937full;
+    //----------
+    // body
+    for (std::remove_const_t<decltype(nblocks)> i = 0; size >= block_size && i < nblocks; i++) {
+      std::uint64_t k1 = load_chunk<std::uint64_t>(bytes, 2 * i);
+      std::uint64_t k2 = load_chunk<std::uint64_t>(bytes, 2 * i + 1);
+
+      k1 *= c1;
+      k1 = rotl64(k1, 31);
+      k1 *= c2;
+
+      h1 ^= k1;
+      h1 = rotl64(h1, 27);
+      h1 += h2;
+      h1 = h1 * 5 + 0x52dce729;
+
+      k2 *= c2;
+      k2 = rotl64(k2, 33);
+      k2 *= c1;
+
+      h2 ^= k2;
+      h2 = rotl64(h2, 31);
+      h2 += h1;
+      h2 = h2 * 5 + 0x38495ab5;
+    }
+    //----------
+    // tail
+    std::uint64_t k1 = 0;
+    std::uint64_t k2 = 0;
+    auto const tail  = reinterpret_cast<uint8_t const*>(bytes) + nblocks * block_size;
+    switch (size & (block_size - 1)) {
+      case 15: k2 ^= static_cast<std::uint64_t>(tail[14]) << 48; [[fallthrough]];
+      case 14: k2 ^= static_cast<std::uint64_t>(tail[13]) << 40; [[fallthrough]];
+      case 13: k2 ^= static_cast<std::uint64_t>(tail[12]) << 32; [[fallthrough]];
+      case 12: k2 ^= static_cast<std::uint64_t>(tail[11]) << 24; [[fallthrough]];
+      case 11: k2 ^= static_cast<std::uint64_t>(tail[10]) << 16; [[fallthrough]];
+      case 10: k2 ^= static_cast<std::uint64_t>(tail[9]) << 8; [[fallthrough]];
+      case 9:
+        k2 ^= static_cast<std::uint64_t>(tail[8]) << 0;
+        k2 *= c2;
+        k2 = rotl64(k2, 33);
+        k2 *= c1;
+        h2 ^= k2;
+        [[fallthrough]];
+
+      case 8: k1 ^= static_cast<std::uint64_t>(tail[7]) << 56; [[fallthrough]];
+      case 7: k1 ^= static_cast<std::uint64_t>(tail[6]) << 48; [[fallthrough]];
+      case 6: k1 ^= static_cast<std::uint64_t>(tail[5]) << 40; [[fallthrough]];
+      case 5: k1 ^= static_cast<std::uint64_t>(tail[4]) << 32; [[fallthrough]];
+      case 4: k1 ^= static_cast<std::uint64_t>(tail[3]) << 24; [[fallthrough]];
+      case 3: k1 ^= static_cast<std::uint64_t>(tail[2]) << 16; [[fallthrough]];
+      case 2: k1 ^= static_cast<std::uint64_t>(tail[1]) << 8; [[fallthrough]];
+      case 1:
+        k1 ^= static_cast<std::uint64_t>(tail[0]) << 0;
+        k1 *= c1;
+        k1 = rotl64(k1, 31);
+        k1 *= c2;
+        h1 ^= k1;
+    };
+    //----------
+    // finalization
+    h1 ^= size;
+    h2 ^= size;
+
+    h1 += h2;
+    h2 += h1;
+
+    h1 = fmix64_(h1);
+    h2 = fmix64_(h2);
+
+    h1 += h2;
+    h2 += h1;
+
+    return {h1, h2};
+  }
+
+ private:
+  constexpr __host__ __device__ std::uint64_t rotl64(std::uint64_t x, std::int8_t r) const noexcept
+  {
+    return (x << r) | (x >> (64 - r));
+  }
+
+  MurmurHash3_fmix64<std::uint64_t> fmix64_;
+  std::uint64_t seed_;
+};
 }  //  namespace cuco::detail
diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh
@@ -51,6 +51,14 @@ using murmurhash3_fmix_64 = detail::MurmurHash3_fmix64<Key>;
 template <typename Key>
 using murmurhash3_32 = detail::MurmurHash3_32<Key>;
 
+/**
+ * @brief A 128-bit `MurmurHash3` hash function to hash the given argument on host and device.
+ *
+ * @tparam Key The type of the values to hash
+ */
+template <typename Key>
+using murmurhash3_x64_128 = detail::MurmurHash3_x64_128<Key>;
+
 /**
  * @brief A 32-bit `XXH32` hash function to hash the given argument on host and device.
  *

diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu
@@ -39,9 +39,9 @@ struct large_key {
   int32_t data_[Words];
 };
 
-template <typename Hash>
+template <typename Hash, typename Seed = typename Hash::result_type>
 __host__ __device__ bool check_hash_result(typename Hash::argument_type const& key,
-                                           typename Hash::result_type seed,
+                                           Seed seed,
                                            typename Hash::result_type expected) noexcept
 {
   Hash h(seed);
@@ -195,4 +195,94 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test",
     CHECK(hash(key) ==
           hash.compute_hash(reinterpret_cast<std::byte const*>(&key), sizeof(key_type)));
   }
+}
+
+template <typename OutputIter>
+__global__ void check_murmurhash3_128_result_kernel(OutputIter result)
+{
+  int i = 0;
+
+  result[i++] = check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+    0, 0, {14961230494313510588ull, 6383328099726337777ull});
+  result[i++] = check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+    9, 0, {1779292183511753683ull, 16298496441448380334ull});
+  result[i++] = check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+    42, 0, {2913627637088662735ull, 16344193523890567190ull});
+  result[i++] = check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+    42, 42, {2248879576374326886ull, 18006515275339376488ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 2>>, uint64_t>(
+      {2, 2}, 0, {12221386834995143465ull, 6690950894782946573ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 3>>, uint64_t>(
+      {1, 4, 9}, 42, {299140022350411792ull, 9891903873182035274ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 4>>, uint64_t>(
+      {42, 64, 108, 1024}, 63, {4333511168876981289ull, 4659486988434316416ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 16>>, uint64_t>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+      1024,
+      {3302412811061286680ull, 7070355726356610672ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 2>>, uint64_t>(
+      {2, 2}, 0, {8554944597931919519ull, 14938998000509429729ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 3>>, uint64_t>(
+      {1, 4, 9}, 42, {13442629947720186435ull, 7061727494178573325ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 4>>, uint64_t>(
+      {42, 64, 108, 1024}, 63, {8786399719555989948ull, 14954183901757012458ull});
+  result[i++] =
+    check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 16>>, uint64_t>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+      1024,
+      {15409921801541329777ull, 10546487400963404004ull});
+}
+
+TEST_CASE("Test cuco::murmurhash3_x64_128", "")
+{
+  // Reference hash values were computed using
+  // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+
+  SECTION("Check if host-generated hash values match the reference implementation.")
+  {
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+      0, 0, {14961230494313510588ull, 6383328099726337777ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+      9, 0, {1779292183511753683ull, 16298496441448380334ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+      42, 0, {2913627637088662735ull, 16344193523890567190ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<int32_t>, uint64_t>(
+      42, 42, {2248879576374326886ull, 18006515275339376488ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 2>>, uint64_t>(
+      {2, 2}, 0, {12221386834995143465ull, 6690950894782946573ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 3>>, uint64_t>(
+      {1, 4, 9}, 42, {299140022350411792ull, 9891903873182035274ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 4>>, uint64_t>(
+      {42, 64, 108, 1024}, 63, {4333511168876981289ull, 4659486988434316416ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int32_t, 16>>, uint64_t>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+      1024,
+      {3302412811061286680ull, 7070355726356610672ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 2>>, uint64_t>(
+      {2, 2}, 0, {8554944597931919519ull, 14938998000509429729ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 3>>, uint64_t>(
+      {1, 4, 9}, 42, {13442629947720186435ull, 7061727494178573325ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 4>>, uint64_t>(
+      {42, 64, 108, 1024}, 63, {8786399719555989948ull, 14954183901757012458ull}));
+    CHECK(check_hash_result<cuco::murmurhash3_x64_128<cuda::std::array<int64_t, 16>>, uint64_t>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+      1024,
+      {15409921801541329777ull, 10546487400963404004ull}));
+  }
+
+  SECTION("Check if device-generated hash values match the reference implementation.")
+  {
+    thrust::device_vector<bool> result(12, true);
+
+    check_murmurhash3_128_result_kernel<<<1, 1>>>(result.begin());
+
+    CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; }));
+  }
 }