From ce1e3433e8b2f3afd31921ad8b9dfb4437de499d Mon Sep 17 00:00:00 2001
From: Yuan Zhou
Date: Tue, 7 Nov 2023 08:54:52 +0800
Subject: [PATCH] backport faster string hash

Replace the string hashing path with a CRC32-based bits::hashBytes():

* BitUtil.{h,cpp}: hashBytes() moves out of line and now takes and
  returns uint64_t. Inputs shorter than 8 bytes are hashed from one
  partial word; longer inputs are consumed 24 bytes at a time through
  three independent simd::crc32U64 accumulators that are combined with
  a multiply/xor at the end.
* StringView.h: folly::hasher<StringView> calls bits::hashBytes()
  instead of SpookyHashV2, so StringView hash values change.
* FlatVector-inl.h: hashAll() drops its StringView special case and
  uses folly::hasher<T> for every element type.
* type/tests/CMakeLists.txt: velox_type_test links velox_common_base
  in place of velox_serialization.

Deviation from the original change: the remaining-byte counter in
hashBytes() is a size_t rather than an int32_t, so a `size` of 2 GiB or
more is no longer truncated when it is copied into the counter. Hash
values for smaller inputs are unchanged.

Signed-off-by: Yuan Zhou
---
NOTE(review): this file had lost its line breaks, its indentation and
every "<...>" run (template arguments, e-mail addresses). Line breaks,
indentation, blank context lines and template arguments below were
reconstructed from the hunk line counts and the declarations visible in
the patch; verify them against the target tree before applying. The
e-mail addresses on From:/Signed-off-by: could not be recovered and
must be restored before using `git am`.

 velox/common/base/BitUtil.cpp   | 46 +++++++++++++++++++++++++++++++++
 velox/common/base/BitUtil.h     | 19 +-------------
 velox/type/StringView.h         |  3 +--
 velox/type/tests/CMakeLists.txt |  2 +-
 velox/vector/FlatVector-inl.h   | 15 +++--------
 5 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/velox/common/base/BitUtil.cpp b/velox/common/base/BitUtil.cpp
index cb964e405d4d..757714dce334 100644
--- a/velox/common/base/BitUtil.cpp
+++ b/velox/common/base/BitUtil.cpp
@@ -16,6 +16,7 @@
 
 #include "velox/common/base/BitUtil.h"
 #include "velox/common/base/Exceptions.h"
+#include "velox/common/base/SimdUtil.h"
 #include "velox/common/process/ProcessBase.h"
 
 namespace facebook::velox::bits {
@@ -158,4 +159,49 @@ void scatterBits(
 #endif
 }
 
+uint64_t hashBytes(uint64_t seed, const char* data, size_t size) {
+  auto begin = reinterpret_cast<const uint8_t*>(data);
+  const uint64_t kMul = 0x9ddfea08eb382d69ULL;
+  if (size < 8) {
+    auto word = loadPartialWord(begin, size);
+    uint64_t crc = simd::crc32U64(seed, word);
+    uint64_t crc2 = simd::crc32U64(seed, word >> 32);
+    return crc | (crc2 << 32);
+  }
+  uint64_t a0 = seed;
+  uint64_t a1 = seed << 32;
+  uint64_t a2 = seed >> 16;
+  size_t toGo = size;
+  auto words = reinterpret_cast<const uint64_t*>(data);
+  while (toGo >= 24) {
+    a0 = simd::crc32U64(a0, words[0]);
+    a1 = simd::crc32U64(a1, words[1]);
+    a2 = simd::crc32U64(a2, words[2]);
+    words += 3;
+    toGo -= 24;
+  }
+  if (toGo > 16) {
+    a0 = simd::crc32U64(a0, words[0]);
+    a1 = simd::crc32U64(a1, words[1]);
+    a2 = simd::crc32U64(
+        a2,
+        loadPartialWord(
+            reinterpret_cast<const uint8_t*>(words + 2), toGo - 16));
+  } else if (toGo > 8) {
+    a0 = simd::crc32U64(a0, words[0]);
+    a1 = simd::crc32U64(
+        a1,
+        toGo == 16
+            ? words[1]
+            : loadPartialWord(
+                  reinterpret_cast<const uint8_t*>(words + 1), toGo - 8));
+  } else if (toGo > 0) {
+    a0 = simd::crc32U64(
+        a0,
+        toGo == 8
+            ? words[0]
+            : loadPartialWord(reinterpret_cast<const uint8_t*>(words), toGo));
+  }
+  return a0 ^ ((a1 * kMul)) ^ (a2 * kMul);
+}
 } // namespace facebook::velox::bits
diff --git a/velox/common/base/BitUtil.h b/velox/common/base/BitUtil.h
index f17ce5407f5d..920b9e3d5676 100644
--- a/velox/common/base/BitUtil.h
+++ b/velox/common/base/BitUtil.h
@@ -777,24 +777,7 @@ inline uint64_t loadPartialWord(const uint8_t* data, int32_t size) {
   return result;
 }
 
-inline size_t hashBytes(size_t seed, const char* data, size_t size) {
-  auto begin = reinterpret_cast<const uint8_t*>(data);
-  if (size < 8) {
-    return hashMix(seed, loadPartialWord(begin, size));
-  }
-  auto result = seed;
-  auto end = begin + size;
-  while (begin + 8 <= end) {
-    result = hashMix(result, *reinterpret_cast<const uint64_t*>(begin));
-    begin += 8;
-  }
-  if (end != begin) {
-    // Accesses the last 64 bits. Some bytes may get processed twice but the
-    // access is safe.
-    result = hashMix(result, *reinterpret_cast<const uint64_t*>(end - 8));
-  }
-  return result;
-}
+uint64_t hashBytes(uint64_t seed, const char* data, size_t size);
 
 namespace detail {
 // Returns at least 'numBits' bits of data starting at bit 'bitOffset'
diff --git a/velox/type/StringView.h b/velox/type/StringView.h
index b674684e7380..4802d74f4ed0 100644
--- a/velox/type/StringView.h
+++ b/velox/type/StringView.h
@@ -283,8 +283,7 @@ namespace folly {
 template <>
 struct hasher<::facebook::velox::StringView> {
   size_t operator()(const ::facebook::velox::StringView view) const {
-    return hash::SpookyHashV2::Hash64(view.data(), view.size(), 0);
-    // return facebook::velox::bits::hashBytes(1, view.data(), view.size());
+    return facebook::velox::bits::hashBytes(1, view.data(), view.size());
   }
 };
 
diff --git a/velox/type/tests/CMakeLists.txt b/velox/type/tests/CMakeLists.txt
index e841b8820f7b..1106ed087ec7 100644
--- a/velox/type/tests/CMakeLists.txt
+++ b/velox/type/tests/CMakeLists.txt
@@ -29,7 +29,7 @@ add_test(velox_type_test velox_type_test)
 target_link_libraries(
   velox_type_test
   velox_type
-  velox_serialization
+  velox_common_base
   velox_external_date
   Folly::folly
   gtest
diff --git a/velox/vector/FlatVector-inl.h b/velox/vector/FlatVector-inl.h
index 6762c641ed92..134020df637c 100644
--- a/velox/vector/FlatVector-inl.h
+++ b/velox/vector/FlatVector-inl.h
@@ -83,18 +83,9 @@ std::unique_ptr<SimpleVector<uint64_t>> FlatVector<T>::hashAll() const {
   auto hashData = hashBuffer->asMutable<uint64_t>();
 
   if (rawValues_ != nullptr) { // non all-null case
-    if constexpr (std::is_same_v<T, StringView>) {
-      folly::hasher<folly::StringPiece> stringHasher;
-      for (size_t i = 0; i < BaseVector::length_; ++i) {
-        auto view = valueAt(i);
-        folly::StringPiece piece(view.data(), view.size());
-        hashData[i] = stringHasher(piece);
-      }
-    } else {
-      folly::hasher<T> hasher;
-      for (size_t i = 0; i < BaseVector::length_; ++i) {
-        hashData[i] = hasher(valueAtFast(i));
-      }
+    folly::hasher<T> hasher;
+    for (size_t i = 0; i < BaseVector::length_; ++i) {
+      hashData[i] = hasher(valueAtFast(i));
     }
   }
 