Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backport faster string hash #435

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions velox/common/base/BitUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "velox/common/base/BitUtil.h"
#include "velox/common/base/Exceptions.h"
#include "velox/common/base/SimdUtil.h"
#include "velox/common/process/ProcessBase.h"

namespace facebook::velox::bits {
Expand Down Expand Up @@ -158,4 +159,49 @@ void scatterBits(
#endif
}

// Computes a 64-bit hash of the 'size' bytes at 'data', starting from 'seed'.
//
// Inputs shorter than 8 bytes are loaded as a single partial word and mixed
// with two simd::crc32U64 calls that are combined as crc | (crc2 << 32).
// Longer inputs are consumed 24 bytes per iteration through three separate
// accumulators (a0, a1, a2); the remaining 0-23 bytes are then folded in,
// using loadPartialWord for a trailing fragment shorter than one word. The
// accumulators are finally combined with a multiply and xor.
//
// NOTE(review): 'data' is read through a uint64_t pointer without an
// alignment check - presumably the supported targets tolerate unaligned
// 8-byte loads; confirm before porting.
uint64_t hashBytes(uint64_t seed, const char* data, size_t size) {
  if (size < 8) {
    const auto word =
        loadPartialWord(reinterpret_cast<const uint8_t*>(data), size);
    const uint64_t crc = simd::crc32U64(seed, word);
    const uint64_t crc2 = simd::crc32U64(seed, word >> 32);
    return crc | (crc2 << 32);
  }

  constexpr uint64_t kMul = 0x9ddfea08eb382d69ULL;
  uint64_t a0 = seed;
  uint64_t a1 = seed << 32;
  uint64_t a2 = seed >> 16;

  // Kept as size_t: a 32-bit counter would truncate inputs of 2 GiB or more
  // and leave most of the data out of the hash.
  size_t remaining = size;
  auto words = reinterpret_cast<const uint64_t*>(data);
  for (; remaining >= 24; remaining -= 24, words += 3) {
    a0 = simd::crc32U64(a0, words[0]);
    a1 = simd::crc32U64(a1, words[1]);
    a2 = simd::crc32U64(a2, words[2]);
  }

  // Loads a trailing fragment of 1-7 bytes; 'numBytes' is always below 8
  // here, so the cast to loadPartialWord's int32_t parameter is lossless.
  const auto tailWord = [](const uint64_t* from, size_t numBytes) {
    return loadPartialWord(
        reinterpret_cast<const uint8_t*>(from),
        static_cast<int32_t>(numBytes));
  };

  // At this point 0 <= remaining <= 23.
  if (remaining > 16) {
    a0 = simd::crc32U64(a0, words[0]);
    a1 = simd::crc32U64(a1, words[1]);
    a2 = simd::crc32U64(a2, tailWord(words + 2, remaining - 16));
  } else if (remaining > 8) {
    a0 = simd::crc32U64(a0, words[0]);
    a1 = simd::crc32U64(
        a1, remaining == 16 ? words[1] : tailWord(words + 1, remaining - 8));
  } else if (remaining > 0) {
    a0 = simd::crc32U64(
        a0, remaining == 8 ? words[0] : tailWord(words, remaining));
  }
  return a0 ^ (a1 * kMul) ^ (a2 * kMul);
}
} // namespace facebook::velox::bits
19 changes: 1 addition & 18 deletions velox/common/base/BitUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -777,24 +777,7 @@ inline uint64_t loadPartialWord(const uint8_t* data, int32_t size) {
return result;
}

inline size_t hashBytes(size_t seed, const char* data, size_t size) {
auto begin = reinterpret_cast<const uint8_t*>(data);
if (size < 8) {
return hashMix(seed, loadPartialWord(begin, size));
}
auto result = seed;
auto end = begin + size;
while (begin + 8 <= end) {
result = hashMix(result, *reinterpret_cast<const uint64_t*>(begin));
begin += 8;
}
if (end != begin) {
// Accesses the last 64 bits. Some bytes may get processed twice but the
// access is safe.
result = hashMix(result, *reinterpret_cast<const uint64_t*>(end - 8));
}
return result;
}
uint64_t hashBytes(uint64_t seed, const char* data, size_t size);

namespace detail {
// Returns at least 'numBits' bits of data starting at bit 'bitOffset'
Expand Down
3 changes: 1 addition & 2 deletions velox/type/StringView.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,7 @@ namespace folly {
// folly::hasher specialization for velox StringView.
template <>
struct hasher<::facebook::velox::StringView> {
  // Hashes the bytes referenced by 'view' with bits::hashBytes, seeded with 1.
  // The previous SpookyHashV2 return is dropped: it left the hashBytes return
  // below it unreachable.
  size_t operator()(const ::facebook::velox::StringView view) const {
    return facebook::velox::bits::hashBytes(1, view.data(), view.size());
  }
};

Expand Down
2 changes: 1 addition & 1 deletion velox/type/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ add_test(velox_type_test velox_type_test)
target_link_libraries(
velox_type_test
velox_type
velox_serialization
velox_common_base
velox_external_date
Folly::folly
gtest
Expand Down
15 changes: 3 additions & 12 deletions velox/vector/FlatVector-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,9 @@ std::unique_ptr<SimpleVector<uint64_t>> FlatVector<T>::hashAll() const {
auto hashData = hashBuffer->asMutable<uint64_t>();

if (rawValues_ != nullptr) { // non all-null case
if constexpr (std::is_same_v<T, StringView>) {
folly::hasher<folly::StringPiece> stringHasher;
for (size_t i = 0; i < BaseVector::length_; ++i) {
auto view = valueAt(i);
folly::StringPiece piece(view.data(), view.size());
hashData[i] = stringHasher(piece);
}
} else {
folly::hasher<T> hasher;
for (size_t i = 0; i < BaseVector::length_; ++i) {
hashData[i] = hasher(valueAtFast(i));
}
folly::hasher<T> hasher;
for (size_t i = 0; i < BaseVector::length_; ++i) {
hashData[i] = hasher(valueAtFast(i));
}
}

Expand Down
Loading