diff --git a/.bazelrc b/.bazelrc index 2ab6340c8b..3b43871299 100644 --- a/.bazelrc +++ b/.bazelrc @@ -41,3 +41,9 @@ build:macos --cxxopt="-std=c++17" --linkopt="-pthread" build:clang-cl --cxxopt="-std=c++17" build:windows --cxxopt="/std:c++17" --cxxopt="/Zc:preprocessor" --cxxopt="/utf-8" build:msvc --cxxopt="/std:c++17" --cxxopt="/Zc:preprocessor" --cxxopt="/utf-8" + +build --copt=-mavx +build --copt=-mavx2 +build --copt=-mbmi +build --copt=-mbmi2 + diff --git a/WORKSPACE b/WORKSPACE index 8ee10fe704..08cb0f41ae 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -25,6 +25,7 @@ load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") load("@com_github_grpc_grpc//third_party/py:python_configure.bzl", "python_configure") load("//bazel/arrow:pyarrow_configure.bzl", "pyarrow_configure") load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") # Add Benchmark git_repository( @@ -33,6 +34,14 @@ git_repository( tag = "v1.9.1", ) +# Add SIMDUTF +http_archive( + name = "simdutf", + urls = ["https://github.com/simdutf/simdutf/releases/download/v6.1.2/singleheader.zip"], + sha256 = "41bb25074fe1e917e96e539c7a87c502e530d88746d7c25d06fb55a28b884340", + build_file = "//cpp/fury/thirdparty:BUILD", +) + bazel_skylib_workspace() python_configure(name="local_config_python") pyarrow_configure(name="local_config_pyarrow") diff --git a/cpp/fury/benchmark/BUILD b/cpp/fury/benchmark/BUILD index f6e3f697da..0d7c10d2ac 100644 --- a/cpp/fury/benchmark/BUILD +++ b/cpp/fury/benchmark/BUILD @@ -10,6 +10,7 @@ cc_library( deps = [ "//cpp/fury/util:fury_util", "@com_google_benchmark//:benchmark", + "@simdutf//:simdutf" ], visibility = ["//visibility:public"], ) diff --git a/cpp/fury/benchmark/benchmark_string_util.cc b/cpp/fury/benchmark/benchmark_string_util.cc index 851cd3e538..e35a7c56ed 100644 --- a/cpp/fury/benchmark/benchmark_string_util.cc +++ b/cpp/fury/benchmark/benchmark_string_util.cc @@ -25,6 +25,7 @@ 
#include "fury/util/string_util.h" +#include "simdutf.h" #include #include @@ -217,6 +218,11 @@ bool isAscii_BaseLine(const std::string &str) { return true; } +bool isAscii_SIMDUTF(const std::string &str) { + // Delegate to simdutf::validate_ascii with no extra pre-checks + return simdutf::validate_ascii(str.data(), str.size()); +} + // Benchmark function for Baseline ASCII check static void BM_IsAscii_BaseLine(benchmark::State &state) { for (auto _ : state) { @@ -227,8 +233,22 @@ static void BM_IsAscii_BaseLine(benchmark::State &state) { } } +BENCHMARK(BM_IsAscii_BaseLine); + +// Benchmark function for SIMDUTF ASCII check +static void BM_IsAscii_SIMDUTF(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_ascii_strings) { + bool result = isAscii_SIMDUTF(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +BENCHMARK(BM_IsAscii_SIMDUTF); + // Benchmark function for SIMD ASCII check -static void BM_IsAscii_SIMD(benchmark::State &state) { +static void BM_IsAscii_FURY(benchmark::State &state) { for (auto _ : state) { for (const auto &str : test_ascii_strings) { bool result = fury::isAscii(str); @@ -237,8 +257,7 @@ static void BM_IsAscii_SIMD(benchmark::State &state) { } } -BENCHMARK(BM_IsAscii_BaseLine); -BENCHMARK(BM_IsAscii_SIMD); +BENCHMARK(BM_IsAscii_FURY); // Baseline implementation to check if a string is Latin-1 bool isLatin1_BaseLine(const std::u16string &str) { @@ -254,6 +273,18 @@ bool isLatin1_BaseLine(const std::u16string &str) { return true; } +bool isLatin1_SIMDUTF(const std::u16string &str) { + // Try the conversion directly, and all characters are considered Latin1 if + // they are successfully converted + size_t latin1_len = simdutf::latin1_length_from_utf16(str.size()); + if (latin1_len != str.size()) + return false; + std::string buffer(str.size(), '\0'); + size_t converted = + simdutf::convert_utf16_to_latin1(str.data(), str.size(), buffer.data()); + return converted == str.size(); +} + // Benchmark 
function for Baseline Latin-1 check static void BM_IsLatin1_BaseLine(benchmark::State &state) { for (auto _ : state) { @@ -264,8 +295,22 @@ static void BM_IsLatin1_BaseLine(benchmark::State &state) { } } +BENCHMARK(BM_IsLatin1_BaseLine); + +// Benchmark function for simdutf-based Latin-1 check +static void BM_IsLatin1_SIMDUTF(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_latin1_strings) { + bool result = isLatin1_SIMDUTF(str); + benchmark::DoNotOptimize(result); // Prevent compiler optimization + } + } +} + +BENCHMARK(BM_IsLatin1_SIMDUTF); + // Benchmark function for Optimized Latin-1 check -static void BM_IsLatin1_SIMD(benchmark::State &state) { +static void BM_IsLatin1_FURY(benchmark::State &state) { for (auto _ : state) { for (const auto &str : test_latin1_strings) { bool result = fury::isLatin1(str); @@ -274,8 +319,7 @@ static void BM_IsLatin1_SIMD(benchmark::State &state) { } } -BENCHMARK(BM_IsLatin1_BaseLine); -BENCHMARK(BM_IsLatin1_SIMD); +BENCHMARK(BM_IsLatin1_FURY); /* * TEST Utf16HasSurrogatePairs @@ -301,9 +345,11 @@ static void BM_Utf16HasSurrogatePairs_BaseLine(benchmark::State &state) { } } +BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine); + // Benchmark function for checking if a UTF-16 string contains surrogate pairs // with SIMD -static void BM_Utf16HasSurrogatePairs_SIMD(benchmark::State &state) { +static void BM_Utf16HasSurrogatePairs_FURY(benchmark::State &state) { for (auto _ : state) { for (const auto &str : test_utf16_strings) { bool result = fury::utf16HasSurrogatePairs(str); @@ -311,8 +357,8 @@ static void BM_Utf16HasSurrogatePairs_SIMD(benchmark::State &state) { } } } -BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine); -BENCHMARK(BM_Utf16HasSurrogatePairs_SIMD); + +BENCHMARK(BM_Utf16HasSurrogatePairs_FURY); /* * TEST Utf16ToUtf8 @@ -350,6 +396,25 @@ std::string utf16ToUtf8BaseLine(const std::u16string &utf16, return utf8_result; } +std::string utf16ToUtf8_SIMDUTF(const std::u16string &utf16, + bool 
is_little_endian) { + if (utf16.empty()) + return {}; + size_t utf8_len = + is_little_endian + ? simdutf::utf8_length_from_utf16le(utf16.data(), utf16.size()) + : simdutf::utf8_length_from_utf16be(utf16.data(), utf16.size()); + + std::string utf8_result(utf8_len, '\0'); + size_t converted = is_little_endian + ? simdutf::convert_utf16le_to_utf8( + utf16.data(), utf16.size(), utf8_result.data()) + : simdutf::convert_utf16be_to_utf8( + utf16.data(), utf16.size(), utf8_result.data()); + utf8_result.resize(converted); + return utf8_result; +} + // Benchmark function for Standard Library UTF-16 to UTF-8 conversion static void BM_Utf16ToUtf8_StandardLibrary(benchmark::State &state) { for (auto _ : state) { @@ -361,6 +426,8 @@ static void BM_Utf16ToUtf8_StandardLibrary(benchmark::State &state) { } } +BENCHMARK(BM_Utf16ToUtf8_StandardLibrary); + // Benchmark function for Baseline UTF-16 to UTF-8 conversion static void BM_Utf16ToUtf8_BaseLine(benchmark::State &state) { for (auto _ : state) { @@ -372,8 +439,23 @@ static void BM_Utf16ToUtf8_BaseLine(benchmark::State &state) { } } +BENCHMARK(BM_Utf16ToUtf8_BaseLine); + +// Benchmark function for simdutf-based UTF-16 to UTF-8 conversion +static void BM_Utf16ToUtf8_SIMDUTF(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf16_strings) { + std::string utf8 = utf16ToUtf8_SIMDUTF(str, true); + benchmark::DoNotOptimize( + utf8); // Prevents the compiler from optimizing away unused variables + } + } +} + +BENCHMARK(BM_Utf16ToUtf8_SIMDUTF); + // Benchmark function for SIMD-based UTF-16 to UTF-8 conversion -static void BM_Utf16ToUtf8_SIMD(benchmark::State &state) { +static void BM_Utf16ToUtf8_FURY(benchmark::State &state) { for (auto _ : state) { for (const auto &str : test_utf16_strings) { std::string utf8 = fury::utf16ToUtf8(str, true); @@ -383,9 +465,7 @@ static void BM_Utf16ToUtf8_SIMD(benchmark::State &state) { } } -BENCHMARK(BM_Utf16ToUtf8_StandardLibrary); -BENCHMARK(BM_Utf16ToUtf8_BaseLine); 
-BENCHMARK(BM_Utf16ToUtf8_SIMD); +BENCHMARK(BM_Utf16ToUtf8_FURY); /* * TEST Utf8ToUtf16 @@ -470,6 +550,25 @@ std::u16string utf8ToUtf16BaseLine(const std::string &utf8, return utf16; } +std::u16string utf8ToUtf16_SIMDUTF(const std::string &utf8, + bool is_little_endian) { + if (utf8.empty()) + return {}; + + size_t utf16_len = simdutf::utf16_length_from_utf8(utf8.data(), utf8.size()); + + std::u16string utf16_result(utf16_len, u'\0'); + + size_t converted = is_little_endian + ? simdutf::convert_utf8_to_utf16le( + utf8.data(), utf8.size(), utf16_result.data()) + : simdutf::convert_utf8_to_utf16be( + utf8.data(), utf8.size(), utf16_result.data()); + + utf16_result.resize(converted); + return utf16_result; +} + // Benchmark function for Standard Library UTF-8 to UTF-16 conversion static void BM_Utf8ToUtf16_StandardLibrary(benchmark::State &state) { for (auto _ : state) { @@ -480,6 +579,7 @@ static void BM_Utf8ToUtf16_StandardLibrary(benchmark::State &state) { } } } +BENCHMARK(BM_Utf8ToUtf16_StandardLibrary); // Benchmark function for Baseline UTF-8 to UTF-16 conversion static void BM_Utf8ToUtf16_BaseLine(benchmark::State &state) { @@ -492,8 +592,23 @@ static void BM_Utf8ToUtf16_BaseLine(benchmark::State &state) { } } +BENCHMARK(BM_Utf8ToUtf16_BaseLine); + +// Benchmark function for simdutf-based UTF-8 to UTF-16 conversion +static void BM_Utf8ToUtf16_SIMDUTF(benchmark::State &state) { + for (auto _ : state) { + for (const auto &str : test_utf8_strings) { + std::u16string utf16 = utf8ToUtf16_SIMDUTF(str, true); + benchmark::DoNotOptimize( + utf16); // Prevents the compiler from optimizing away unused variables + } + } +} + +BENCHMARK(BM_Utf8ToUtf16_SIMDUTF); + // Benchmark function for SIMD-based UTF-8 to UTF-16 conversion -static void BM_Utf8ToUtf16_SIMD(benchmark::State &state) { +static void BM_Utf8ToUtf16_FURY(benchmark::State &state) { for (auto _ : state) { for (const auto &str : test_utf8_strings) { std::u16string utf16 = fury::utf8ToUtf16(str, true); @@ -503,8 
+618,6 @@ static void BM_Utf8ToUtf16_SIMD(benchmark::State &state) { } } -BENCHMARK(BM_Utf8ToUtf16_StandardLibrary); -BENCHMARK(BM_Utf8ToUtf16_BaseLine); -BENCHMARK(BM_Utf8ToUtf16_SIMD); +BENCHMARK(BM_Utf8ToUtf16_FURY); BENCHMARK_MAIN(); diff --git a/cpp/fury/thirdparty/BUILD b/cpp/fury/thirdparty/BUILD index f54e00356c..0ae1f0fe46 100644 --- a/cpp/fury/thirdparty/BUILD +++ b/cpp/fury/thirdparty/BUILD @@ -9,3 +9,11 @@ cc_library( linkstatic=True, visibility = ["//visibility:public"], ) + +cc_library( + name = "simdutf", + srcs = ["simdutf.cpp"], + hdrs = ["simdutf.h"], + includes = ["."], + visibility = ["//visibility:public"], +) diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD index 36fe126b37..9795f89265 100644 --- a/cpp/fury/util/BUILD +++ b/cpp/fury/util/BUILD @@ -4,8 +4,6 @@ cc_library( name = "fury_util", srcs = glob(["*.cc"], exclude=["*test.cc"]), hdrs = glob(["*.h"]), - copts = ["-mavx2"], # Enable AVX2 support - linkopts = ["-mavx2"], # Ensure linker also knows about AVX2 strip_include_prefix = "/cpp", alwayslink=True, linkstatic=True, @@ -14,6 +12,7 @@ cc_library( "@com_google_absl//absl/debugging:failure_signal_handler", "@com_google_absl//absl/debugging:stacktrace", "@com_google_absl//absl/debugging:symbolize", + "@simdutf//:simdutf" ], visibility = ["//visibility:public"], ) diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h index 6a23ff1092..9081f378f5 100644 --- a/cpp/fury/util/string_util.h +++ b/cpp/fury/util/string_util.h @@ -95,8 +95,64 @@ static inline bool hasSurrogatePairFallback(const uint16_t *data, size_t size) { } return false; } +#if defined(FURY_HAS_IMMINTRIN) -#if defined(FURY_HAS_NEON) +inline bool isAscii(const char *data, size_t length) { + constexpr size_t VECTOR_SIZE = 32; + const auto *ptr = reinterpret_cast<const __m256i *>(data); + const auto *end = ptr + length / VECTOR_SIZE; + const __m256i mask = _mm256_set1_epi8(0x80); + + for (; ptr < end; ++ptr) { + __m256i vec = _mm256_loadu_si256(ptr); + __m256i cmp = 
_mm256_and_si256(vec, mask); + if (!_mm256_testz_si256(cmp, cmp)) + return false; + } + + return isAsciiFallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE, + length % VECTOR_SIZE); +} + +inline bool isLatin1(const uint16_t *data, size_t length) { + constexpr size_t VECTOR_SIZE = 16; + const auto *ptr = reinterpret_cast<const __m256i *>(data); + const auto *end = ptr + length / VECTOR_SIZE; + // A signed 16-bit compare misses units >= 0x8000; test the high byte instead. + const __m256i mask = _mm256_set1_epi16(static_cast<short>(0xFF00)); + + for (; ptr < end; ++ptr) { + __m256i vec = _mm256_loadu_si256(ptr); + __m256i cmp = _mm256_and_si256(vec, mask); + if (!_mm256_testz_si256(cmp, cmp)) { + return false; + } + } + + return isLatin1Fallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE, + length % VECTOR_SIZE); +} +inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) { + constexpr size_t VECTOR_SIZE = 16; + const auto *ptr = reinterpret_cast<const __m256i *>(data); + const auto *end = ptr + length / VECTOR_SIZE; + const __m256i surrogate_mask = _mm256_set1_epi16(static_cast<short>(0xF800)); + const __m256i surrogate_tag = _mm256_set1_epi16(static_cast<short>(0xD800)); + + for (; ptr < end; ++ptr) { + __m256i vec = _mm256_loadu_si256(ptr); + // 0xD800..0xDFFF (bounds included) are the units with top five bits 11011. + __m256i masked = _mm256_and_si256(vec, surrogate_mask); + __m256i result = _mm256_cmpeq_epi16(masked, surrogate_tag); + if (!_mm256_testz_si256(result, result)) + return true; + } + + return hasSurrogatePairFallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE, + length % VECTOR_SIZE); +} + +#elif defined(FURY_HAS_NEON) inline bool isAscii(const char *data, size_t length) { size_t i = 0; uint8x16_t mostSignificantBit = vdupq_n_u8(0x80); diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 2f267e146b..9d5c9ddf11 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -88,9 +88,6 @@ TEST(StringUtilTest, TestisLatin1) { EXPECT_FALSE(isLatin1(u"Javaone Keynote\u1234")); EXPECT_TRUE(isLatin1(u"a\xFF")); // ÿ in Latin-1 EXPECT_TRUE(isLatin1(u"\x80")); // € in Latin-1 - const 
uint16_t str[] = {256, 256}; - EXPECT_FALSE(isLatin1(str, 2)); // Ā (not in Latin-1) - for (size_t i = 1; i < 256; i++) { EXPECT_TRUE(isLatin1(std::u16string(i, '.') + u"Fury")); EXPECT_FALSE(isLatin1(std::u16string(i, '.') + u"序列化"));