From 06bd7fccda1afc0d5dbd36e35ef2ad8273283dbe Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 18 Apr 2025 16:01:04 -0400 Subject: [PATCH 1/4] some tuning and more options --- benchmarks/CMakeLists.txt | 1 - benchmarks/algorithms.h | 2 +- benchmarks/benchmark.cpp | 38 ++++-- benchmarks/exhaustivefloat32.cpp | 31 ++++- benchmarks/ieeeToString.cpp | 196 +++++++++++++++++++++---------- benchmarks/thoroughfloat64.cpp | 25 +++- 6 files changed, 212 insertions(+), 81 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 1750929..5501cf1 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -3,7 +3,6 @@ add_executable(benchmark ) add_library(benchmark_deps INTERFACE) add_library(ieeeToString ieeeToString.cpp) -target_include_directories(ieeeToString PRIVATE ${ryu_SOURCE_DIR}) target_link_libraries(benchmark_deps INTERFACE ieeeToString) include(CheckSourceCompiles) check_source_compiles(CXX " diff --git a/benchmarks/algorithms.h b/benchmarks/algorithms.h index 800bdf2..7358aca 100644 --- a/benchmarks/algorithms.h +++ b/benchmarks/algorithms.h @@ -82,7 +82,7 @@ struct BenchArgs { std::string name{}; int (*func)(T, std::span&){}; bool used{}; - unsigned char testRepeat{100}; + size_t testRepeat{100}; }; template diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 74d0112..e2b1667 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -26,9 +26,21 @@ using Benchmarks::arithmetic_float; using Benchmarks::BenchArgs; +bool is_matched(const std::string &str, const std::span filter) { + if (filter.empty()) { + return true; + } + for (const auto &f : filter) { + if (str.find(f) != std::string::npos) { + return true; + } + } + return false; +} + template void evaluateProperties(const std::vector &lines, - const std::array, Benchmarks::COUNT> &args, const std::string& filter = "") { + const std::array, Benchmarks::COUNT> &args, const std::span filter = {}) { constexpr auto precision = std::numeric_limits::digits10; fmt::println("{:20} {:20}", "Algorithm", "Valid round-trip"); @@ -38,7 +50,7 @@ void evaluateProperties(const std::vector &lines, continue; } // Apply filter if provided - if (!filter.empty() && std::string(filter).find(algo.name) == std::string::npos) { + if (!is_matched(algo.name, filter)) { std::cout << "# filtered out " << algo.name << std::endl; continue; } @@ -73,14 +85,14 @@ void evaluateProperties(const std::vector &lines, template void process(const std::vector &lines, - const std::array, Benchmarks::COUNT> &args, const std::string& filter = "") { + const std::array, Benchmarks::COUNT> &args, const std::span filter = {}) { for (const auto &algo : args) { if (!algo.used) { std::cout << "# skipping " << algo.name << std::endl; continue; } // Apply filter if provided - if (!filter.empty() && std::string(filter).find(algo.name) == std::string::npos) { + if (!is_matched(algo.name, filter)) { std::cout << "# filtered out " << algo.name << std::endl; continue; } @@ -155,8 +167,10 @@ int main(int argc, char **argv) { cxxopts::value()->default_value("false")) ("e,errol", "Enable errol3 (current impl. returns invalid values, e.g., for 0).", cxxopts::value()->default_value("false")) - ("a,algo-filter", "Filter algorithms by name substring.", - cxxopts::value()->default_value("")) + ("a,algo-filter", "Filter algorithms by name substring: you can use multiple filters separated by commas.", + cxxopts::value>()->default_value("")) + ("r,repeat", "Force a number of repetitions.", + cxxopts::value()->default_value("0")) ("h,help", "Print usage."); const auto result = options.parse(argc, argv); @@ -164,9 +178,9 @@ int main(int argc, char **argv) { std::cout << options.help() << std::endl; return EXIT_SUCCESS; } - + const size_t repeat = result["repeat"].as(); const bool single = result["single"].as(); - const std::string filter = result["algo-filter"].as(); + std::vector filter = result["algo-filter"].as>(); std::cout << "number type: binary" << (single ? "32 (float)" : "64 (double)") << std::endl; @@ -198,6 +212,14 @@ int main(int argc, char **argv) { else algorithms = Benchmarks::initArgs(errol); + if(repeat > 0) { + std::cout << "# forcing repeat count to " << repeat << std::endl; + std::visit([repeat](auto &args) { + for (auto &arg : args) + arg.testRepeat = repeat; + }, algorithms); + } + const bool test = result["test"].as(); std::visit([test,&filter](const auto &lines, const auto &args) { using T1 = typename std::decay_t::value_type; diff --git a/benchmarks/exhaustivefloat32.cpp b/benchmarks/exhaustivefloat32.cpp index 65e53f9..67f671c 100644 --- a/benchmarks/exhaustivefloat32.cpp +++ b/benchmarks/exhaustivefloat32.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "algorithms.h" #include "cxxopts.hpp" @@ -59,7 +60,7 @@ std::optional parse_float(std::string_view sv) { return std::nullopt; } -void run_exhaustive32(bool errol) { +void run_exhaustive32(bool errol, const std::vector& algo_filter = {}) { constexpr auto precision = std::numeric_limits::digits10; fmt::println("{:20} {:20}", "Algorithm", "Valid shortest serialization"); @@ -75,6 +76,22 @@ void run_exhaustive32(bool errol) { fmt::print("# skipping {} because it is the reference.\n", algo.name); continue; } + + // Apply filter if provided + if (!algo_filter.empty()) { + bool matched = false; + for (const auto &f : algo_filter) { + if (algo.name.find(f) != std::string::npos) { + matched = true; + break; + } + } + if (!matched) { + fmt::print("# filtered out {}\n", algo.name); + continue; + } + } + bool incorrect = false; char buf1[100], buf2[100]; std::span bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2)); @@ -149,15 +166,21 @@ int main(int argc, char **argv) { options.add_options()( "e,errol", "Enable errol3 (current impl. returns invalid values, e.g., for 0).", - cxxopts::value()->default_value("false"))("h,help", - "Print usage."); + cxxopts::value()->default_value("false"))( + "a,algorithm", + "Specify which algorithm(s) to test (comma-separated).", + cxxopts::value>()->default_value({}))( + "h,help", + "Print usage."); const auto result = options.parse(argc, argv); if (result["help"].as()) { fmt::print("{}\n", options.help()); return EXIT_SUCCESS; } - run_exhaustive32(result["errol"].as()); + + auto algo_filter = result["algorithm"].as>(); + run_exhaustive32(result["errol"].as(), algo_filter); } catch (const std::exception &e) { fmt::print("error parsing options: {}\n", e.what()); return EXIT_FAILURE; diff --git a/benchmarks/ieeeToString.cpp b/benchmarks/ieeeToString.cpp index 783e24a..9df38b4 100644 --- a/benchmarks/ieeeToString.cpp +++ b/benchmarks/ieeeToString.cpp @@ -1,11 +1,40 @@ #include "ieeeToString.h" #include +#include +#include #include #include +#ifdef _MSC_VER +#ifdef __clang__ +#define WE_HAVE_CLANGCL 1 +#else +#define WE_HAVE_VISUAL_STUDIO 1 +#include +#endif +#endif -#include "ryu/digit_table.h" // For DIGIT_TABLE -#include "ryu/common.h" // For decimalLength9 +static const char hundreds_digit_table[200] = { + '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', + '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', + '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', + '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', + '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', + '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', + '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', + '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', + '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', + '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', + '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', + '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', + '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', + '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', + '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', + '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', + '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', + '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', + '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', + '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'}; IEEE754f decode_ieee754(float f) { const uint32_t bits = std::bit_cast(f); @@ -27,31 +56,77 @@ IEEE754d decode_ieee754(double f) { return decomposed; } -// Extracted from the Ryu implementation. -static inline uint32_t decimalLength17(const uint64_t v) { - // Function precondition: v is not an 18, 19, or 20-digit number. - // (17 digits are sufficient for round-tripping.) - assert(v < 100000000000000000L); - - // Slightly faster than a loop. - // Average output length is 16.38 digits, so we check high-to-low. - if (v >= 10000000000000000L) { return 17; } - if (v >= 1000000000000000L) { return 16; } - if (v >= 100000000000000L) { return 15; } - if (v >= 10000000000000L) { return 14; } - if (v >= 1000000000000L) { return 13; } - if (v >= 100000000000L) { return 12; } - if (v >= 10000000000L) { return 11; } - if (v >= 1000000000L) { return 10; } - if (v >= 100000000L) { return 9; } - if (v >= 10000000L) { return 8; } - if (v >= 1000000L) { return 7; } - if (v >= 100000L) { return 6; } - if (v >= 10000L) { return 5; } - if (v >= 1000L) { return 4; } - if (v >= 100L) { return 3; } - if (v >= 10L) { return 2; } - return 1; +//////////////////////// +// We should use https://en.cppreference.com/w/cpp/numeric/countl_zero +//////////////////////// +#if WE_HAVE_VISUAL_STUDIO +inline int leading_zeroes_64(uint64_t input_num) { + unsigned long index; +#ifdef _WIN64 // highly recommended!!! + _BitScanReverse64(&index, input_num); +#else // if we must support 32-bit Windows + if (input_num > 0xFFFFFFFF) { + _BitScanReverse(&index, (uint32_t)(input_num >> 32)); + index += 32; + } else { + _BitScanReverse(&index, (uint32_t)(input_num)); + } +#endif // _WIN64 + return 63 - index; +} +#else +inline int leading_zeroes_64(uint64_t input_num) { + return __builtin_clzll(input_num); +} +#endif + + +inline int int_log2_64(uint64_t x) { return 63 - leading_zeroes_64(x | 1); } + +/** + * Reference: + * Daniel Lemire, "Computing the number of digits of an integer even faster," in Daniel Lemire's blog, June 3, 2021, https://lemire.me/blog/2021/06/03/computing-the-number-of-digits-of-an-integer-even-faster/. + */ +inline int fast_digit_count32(uint32_t x) { + static uint64_t table[] = { + 4294967296, 8589934582, 8589934582, 8589934582, 12884901788, + 12884901788, 12884901788, 17179868184, 17179868184, 17179868184, + 21474826480, 21474826480, 21474826480, 21474826480, 25769703776, + 25769703776, 25769703776, 30063771072, 30063771072, 30063771072, + 34349738368, 34349738368, 34349738368, 34349738368, 38554705664, + 38554705664, 38554705664, 41949672960, 41949672960, 41949672960, + 42949672960, 42949672960}; + return uint32_t((x + table[int_log2_64(x)]) >> 32); +} + + +/** + * Reference: + * Daniel Lemire, "Counting the digits of 64-bit integers," in Daniel Lemire's blog, January 7, 2025, https://lemire.me/blog/2025/01/07/counting-the-digits-of-64-bit-integers/. + */ +inline int fast_digit_count64(uint64_t x) { + static uint64_t table[] = {9, + 99, + 999, + 9999, + 99999, + 999999, + 9999999, + 99999999, + 999999999, + 9999999999, + 99999999999, + 999999999999, + 9999999999999, + 99999999999999, + 999999999999999ULL, + 9999999999999999ULL, + 99999999999999999ULL, + 999999999999999999ULL, + 9999999999999999999ULL}; + int y = (19 * int_log2_64(x) >> 6); + y += x > table[y]; + return y + 1; } // Adapted from the Ryu implementation. @@ -63,9 +138,8 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { if (sign) result[index++] = '-'; - const uint32_t olength = is_double ? decimalLength17(mantissa) - : decimalLength9(mantissa); - + const uint32_t olength = is_double ? fast_digit_count64(mantissa) + : fast_digit_count32(mantissa); // Print the decimal digits. // for (uint32_t i = 0; i < olength - 1; ++i) { // const uint32_t c = mantissa % 10; mantissa /= 10; @@ -74,33 +148,29 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { // result[index] = '0' + mantissa % 10; uint32_t i = 0; - if constexpr (is_double) { - // We prefer 32-bit operations, even on 64-bit platforms. - // We have at most 17 digits, and uint32_t can store 9 digits. - // If mantissa doesn't fit into uint32_t, we cut off 8 digits, - // so the rest will fit into uint32_t. - if ((mantissa >> 32) != 0) { - // Expensive 64-bit division. - const uint64_t q = mantissa / 100'000'000; - uint32_t temp = ((uint32_t) mantissa) - 100'000'000 * ((uint32_t) q); - mantissa = q; - - const uint32_t c = temp % 10000; - temp /= 10000; - const uint32_t d = temp % 10000; - const uint32_t c0 = (c % 100) << 1; - const uint32_t c1 = (c / 100) << 1; - const uint32_t d0 = (d % 100) << 1; - const uint32_t d1 = (d / 100) << 1; - memcpy(result + index + olength - 1, DIGIT_TABLE + c0, 2); - memcpy(result + index + olength - 3, DIGIT_TABLE + c1, 2); - memcpy(result + index + olength - 5, DIGIT_TABLE + d0, 2); - memcpy(result + index + olength - 7, DIGIT_TABLE + d1, 2); - i += 8; - } + // We take care of the least significant eight digits first. + if (mantissa >= 100'000'000) { + // Expensive 64-bit division. + const uint64_t q = mantissa / 100'000'000; + uint32_t temp = mantissa % 100'000'000; + mantissa = q; + + const uint32_t c = temp % 10000; + temp /= 10000; + const uint32_t d = temp % 10000; + const uint32_t c0 = (c % 100) << 1; + const uint32_t c1 = (c / 100) << 1; + const uint32_t d0 = (d % 100) << 1; + const uint32_t d1 = (d / 100) << 1; + memcpy(result + index + olength - 1, hundreds_digit_table + c0, 2); + memcpy(result + index + olength - 3, hundreds_digit_table + c1, 2); + memcpy(result + index + olength - 5, hundreds_digit_table + d0, 2); + memcpy(result + index + olength - 7, hundreds_digit_table + d1, 2); + i += 8; } - uint32_t output = (uint32_t) mantissa; + + uint64_t output = mantissa; while (output >= 10000) { #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217 const uint32_t c = output - 10000 * (output / 10000); @@ -110,21 +180,21 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { output /= 10000; const uint32_t c0 = (c % 100) << 1; const uint32_t c1 = (c / 100) << 1; - memcpy(result + index + olength - i - 1, DIGIT_TABLE + c0, 2); - memcpy(result + index + olength - i - 3, DIGIT_TABLE + c1, 2); + memcpy(result + index + olength - i - 1, hundreds_digit_table + c0, 2); + memcpy(result + index + olength - i - 3, hundreds_digit_table + c1, 2); i += 4; } if (output >= 100) { const uint32_t c = (output % 100) << 1; output /= 100; - memcpy(result + index + olength - i - 1, DIGIT_TABLE + c, 2); + memcpy(result + index + olength - i - 1, hundreds_digit_table + c, 2); i += 2; } if (output >= 10) { - const uint32_t c = output << 1; + const uint64_t c = output << 1; // We can't use memcpy here: the decimal dot goes between these two digits. - result[index + olength - i] = DIGIT_TABLE[c + 1]; - result[index] = DIGIT_TABLE[c]; + result[index + olength - i] = hundreds_digit_table[c + 1]; + result[index] = hundreds_digit_table[c]; } else { result[index] = (char) ('0' + output); } @@ -147,7 +217,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { const auto handle_common_cases = [&]() { if (exp >= 10) { - memcpy(result + index, DIGIT_TABLE + 2 * exp, 2); + memcpy(result + index, hundreds_digit_table + 2 * exp, 2); index += 2; } else result[index++] = (char)('0' + exp); @@ -155,7 +225,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { if constexpr (is_double) { if (exp >= 100) { const int32_t c = exp % 10; - memcpy(result + index, DIGIT_TABLE + 2 * (exp / 10), 2); + memcpy(result + index, hundreds_digit_table + 2 * (exp / 10), 2); result[index + 2] = (char) ('0' + c); index += 3; } else diff --git a/benchmarks/thoroughfloat64.cpp b/benchmarks/thoroughfloat64.cpp index 1109ab4..3fe50fd 100644 --- a/benchmarks/thoroughfloat64.cpp +++ b/benchmarks/thoroughfloat64.cpp @@ -90,7 +90,7 @@ std::vector load_doubles_from_file(const std::string& filename) { return numbers; } -void run_file_test(const std::string& filename, bool errol) { +void run_file_test(const std::string& filename, bool errol, const std::vector& algo_filter = {}) { constexpr auto precision = std::numeric_limits::digits10; fmt::println("{:20} {:20}", "Algorithm", "Valid shortest serialization"); @@ -113,6 +113,22 @@ void run_file_test(const std::string& filename, bool errol) { fmt::print("# skipping {} because it is the reference.\n", algo.name); continue; } + + // Apply filter if provided + if (!algo_filter.empty()) { + bool matched = false; + for (const auto &f : algo_filter) { + if (algo.name.find(f) != std::string::npos) { + matched = true; + break; + } + } + if (!matched) { + fmt::print("# filtered out {}\n", algo.name); + continue; + } + } + bool incorrect = false; char buf1[100], buf2[100]; std::span bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2)); @@ -190,6 +206,9 @@ int main(int argc, char **argv) { ("f,file", "Input file containing doubles (one per line)", cxxopts::value()->default_value(THOROUGH_DATA_FILE)) + ("a,algorithm", + "Filter algorithms to test (comma-separated)", + cxxopts::value>()->default_value("")) ("h,help", "Print usage."); const auto result = options.parse(argc, argv); @@ -198,9 +217,7 @@ int main(int argc, char **argv) { fmt::print("{}\n", options.help()); return EXIT_SUCCESS; } - - - run_file_test(result["file"].as(), result["errol"].as()); + run_file_test(result["file"].as(), result["errol"].as(), result["algorithm"].as>()); } catch (const std::exception &e) { fmt::print("error parsing options: {}\n", e.what()); return EXIT_FAILURE; From ff195978be1224ffd7f484d3d0b13793887b2bf4 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 18 Apr 2025 16:36:04 -0400 Subject: [PATCH 2/4] adding 'just the string' conversion. --- benchmarks/benchmark.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index e2b1667..0d68d20 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -8,6 +8,7 @@ */ #include "algorithms.h" +#include #define IEEE_8087 #include "benchutil.h" #include "cxxopts.hpp" @@ -83,9 +84,35 @@ void evaluateProperties(const std::vector &lines, } } +struct diy_float_t { + uint64_t significand; + int exponent; + bool is_negative; +}; + template void process(const std::vector &lines, const std::array, Benchmarks::COUNT> &args, const std::span filter = {}) { + // We have a special algorithm for the reference: + std::string just_string = "just_string"; + if (is_matched(just_string, filter)) { + std::vector parsed; + for(auto d : lines) { + auto v = jkj::grisu_exact(d); + parsed.emplace_back(v.significand, v.exponent, v.is_negative); + } + pretty_print(parsed, just_string, [](const std::vector& parsed) -> int { + int volume = 0; + char buf[100]; + std::span bufspan(buf, sizeof(buf)); + for (const auto v : parsed) + volume += to_chars(v.significand, v.exponent, v.is_negative, bufspan.data()); + return volume; + }, 100); + } else { + std::cout << "# skipping " << just_string << std::endl; + + } for (const auto &algo : args) { if (!algo.used) { std::cout << "# skipping " << algo.name << std::endl; @@ -105,6 +132,7 @@ void process(const std::vector &lines, return volume; }, algo.testRepeat); } + } template From a684d308c8a45da2cf94bd64aceba91e969f6150 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 18 Apr 2025 16:48:58 -0400 Subject: [PATCH 3/4] some comments --- benchmarks/ieeeToString.cpp | 40 +++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/benchmarks/ieeeToString.cpp b/benchmarks/ieeeToString.cpp index 9df38b4..b70c52a 100644 --- a/benchmarks/ieeeToString.cpp +++ b/benchmarks/ieeeToString.cpp @@ -137,7 +137,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { int index = 0; if (sign) result[index++] = '-'; - + // We use fast arithmetic to compute the number of digits. const uint32_t olength = is_double ? fast_digit_count64(mantissa) : fast_digit_count32(mantissa); // Print the decimal digits. @@ -146,22 +146,25 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { // result[index + olength - i] = (char) ('0' + c); // } // result[index] = '0' + mantissa % 10; - + ////////////////// + // Performance: + // On 64-bit systems, 32-bit arithmetic is no faster than 64-bit, + // and sometimes slower. + ///////////////// uint32_t i = 0; - // We take care of the least significant eight digits first. + // We take care of the least significant eight digits first. if (mantissa >= 100'000'000) { - // Expensive 64-bit division. const uint64_t q = mantissa / 100'000'000; - uint32_t temp = mantissa % 100'000'000; + uint64_t temp = mantissa % 100'000'000; mantissa = q; - const uint32_t c = temp % 10000; + const uint64_t c = temp % 10000; temp /= 10000; - const uint32_t d = temp % 10000; - const uint32_t c0 = (c % 100) << 1; - const uint32_t c1 = (c / 100) << 1; - const uint32_t d0 = (d % 100) << 1; - const uint32_t d1 = (d / 100) << 1; + const uint64_t d = temp % 10000; + const uint64_t c0 = (c % 100) << 1; + const uint64_t c1 = (c / 100) << 1; + const uint64_t d0 = (d % 100) << 1; + const uint64_t d1 = (d / 100) << 1; memcpy(result + index + olength - 1, hundreds_digit_table + c0, 2); memcpy(result + index + olength - 3, hundreds_digit_table + c1, 2); memcpy(result + index + olength - 5, hundreds_digit_table + d0, 2); @@ -171,25 +174,24 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { uint64_t output = mantissa; + // Next, we proceed in block of 4 digits. while (output >= 10000) { -#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217 - const uint32_t c = output - 10000 * (output / 10000); -#else - const uint32_t c = output % 10000; -#endif + const uint64_t c = output % 10000; output /= 10000; - const uint32_t c0 = (c % 100) << 1; - const uint32_t c1 = (c / 100) << 1; + const uint64_t c0 = (c % 100) << 1; + const uint64_t c1 = (c / 100) << 1; memcpy(result + index + olength - i - 1, hundreds_digit_table + c0, 2); memcpy(result + index + olength - i - 3, hundreds_digit_table + c1, 2); i += 4; } + // We can take care of two digits out of the 2 or 3 remaining. if (output >= 100) { - const uint32_t c = (output % 100) << 1; + const uint64_t c = (output % 100) << 1; output /= 100; memcpy(result + index + olength - i - 1, hundreds_digit_table + c, 2); i += 2; } + // Last digit. if (output >= 10) { const uint64_t c = output << 1; // We can't use memcpy here: the decimal dot goes between these two digits. From 2dd412ac008458b47d5ed0c42729f11cf421b083 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 18 Apr 2025 17:52:01 -0400 Subject: [PATCH 4/4] more tuning --- benchmarks/benchmark.cpp | 53 ++++++++++++++++++-------------- benchmarks/exhaustivefloat32.cpp | 10 +++--- benchmarks/ieeeToString.cpp | 53 +++++++++++++++++--------------- benchmarks/thoroughfloat64.cpp | 20 ++++++------ 4 files changed, 73 insertions(+), 63 deletions(-) diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 0d68d20..00b9f2b 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -23,6 +23,7 @@ #include #include #include +#include using Benchmarks::arithmetic_float; using Benchmarks::BenchArgs; @@ -47,12 +48,12 @@ void evaluateProperties(const std::vector &lines, for (const auto &algo : args) { if (!algo.used) { - std::cout << "# skipping " << algo.name << std::endl; + fmt::println("# skipping {}", algo.name); continue; } // Apply filter if provided if (!is_matched(algo.name, filter)) { - std::cout << "# filtered out " << algo.name << std::endl; + fmt::println("# filtered out {}", algo.name); continue; } char buf1[100], buf2[100]; @@ -93,7 +94,7 @@ struct diy_float_t { template void process(const std::vector &lines, const std::array, Benchmarks::COUNT> &args, const std::span filter = {}) { - // We have a special algorithm for the reference: + // We have a special algorithm for the string generation: std::string just_string = "just_string"; if (is_matched(just_string, filter)) { std::vector parsed; @@ -110,17 +111,16 @@ void process(const std::vector &lines, return volume; }, 100); } else { - std::cout << "# skipping " << just_string << std::endl; - + fmt::println("# skipping {}", just_string); } for (const auto &algo : args) { if (!algo.used) { - std::cout << "# skipping " << algo.name << std::endl; + fmt::println("# skipping {}", algo.name); continue; } // Apply filter if provided if (!is_matched(algo.name, filter)) { - std::cout << "# filtered out " << algo.name << std::endl; + fmt::println("# filtered out {}", algo.name); continue; } pretty_print(lines, algo.name, [&algo](const std::vector &lines) -> int { @@ -139,7 +139,7 @@ template std::vector fileload(const std::string &filename) { std::ifstream inputfile(filename); if (!inputfile) { - std::cerr << "can't open " << filename << std::endl; + fmt::print(stderr, "can't open {}\n", filename); return {}; } @@ -150,24 +150,21 @@ std::vector fileload(const std::string &filename) { lines.push_back(std::is_same_v ? std::stof(line) : std::stod(line)); } catch (...) { - std::cerr << "problem with " << line << "\n" - << "We expect floating-point numbers (one per line)." - << std::endl; + fmt::print(stderr, "problem with {}\nWe expect floating-point numbers (one per line).\n", line); std::abort(); } } - std::cout << "# read " << lines.size() << " lines " << std::endl; + fmt::println("# read {} lines", lines.size()); return lines; } template std::vector get_random_numbers(size_t howmany, const std::string &random_model) { - std::cout << "# parsing random numbers" << std::endl; + fmt::println("# parsing random numbers"); std::vector lines; auto g = get_generator_by_name(random_model); - std::cout << "model: " << g->describe() << "\n" - << "volume: " << howmany << " floats" << std::endl; + fmt::print("model: {}\nvolume: {} floats\n", g->describe(), howmany); lines.reserve(howmany); // let us reserve plenty of memory. for (size_t i = 0; i < howmany; i++) { const T line = g->new_float(); @@ -203,14 +200,13 @@ int main(int argc, char **argv) { const auto result = options.parse(argc, argv); if (result["help"].as()) { - std::cout << options.help() << std::endl; + fmt::print("{}\n", options.help()); return EXIT_SUCCESS; } const size_t repeat = result["repeat"].as(); const bool single = result["single"].as(); std::vector filter = result["algo-filter"].as>(); - std::cout << "number type: binary" - << (single ? "32 (float)" : "64 (double)") << std::endl; + fmt::println("number type: binary{}", (single ? "32 (float)" : "64 (double)")); std::variant, std::vector> numbers; const auto filename = result["file"].as(); @@ -221,9 +217,7 @@ int main(int argc, char **argv) { numbers = get_random_numbers(volume, model); else numbers = get_random_numbers(volume, model); - std::cout << "# You can also provide a filename (with the -f flag): " - "it should contain one string per line corresponding to a number" - << std::endl; + fmt::println("# You can also provide a filename (with the -f flag): it should contain one string per line corresponding to a number"); } else { if (single) @@ -241,7 +235,7 @@ int main(int argc, char **argv) { algorithms = Benchmarks::initArgs(errol); if(repeat > 0) { - std::cout << "# forcing repeat count to " << repeat << std::endl; + fmt::println("# forcing repeat count to {}", repeat); std::visit([repeat](auto &args) { for (auto &arg : args) arg.testRepeat = repeat; @@ -260,7 +254,20 @@ int main(int argc, char **argv) { } }, numbers, algorithms); } catch (const std::exception &e) { - std::cout << "error parsing options: " << e.what() << std::endl; + fmt::println("Error parsing options: {}", e.what()); + fmt::println("\nUSAGE GUIDE:"); + fmt::println(" ./benchmark [OPTIONS]"); + fmt::println("\nCOMMAND SUMMARY:"); + fmt::println(" The benchmark tool evaluates the performance of different floating-point to string"); + fmt::println(" conversion algorithms. It can use either synthetic data or a file containing"); + fmt::println(" floating-point numbers (one per line)."); + fmt::println("\nEXAMPLES:"); + fmt::println(" ./benchmark --single # Run benchmark with single precision (float)"); + fmt::println(" ./benchmark --file=data/canada.txt # Run benchmark using numbers from a file"); + fmt::println(" ./benchmark --test # Test correctness instead of performance"); + fmt::println(" ./benchmark --volume=1000 --model=uniform # Generate 1000 uniform random numbers"); + fmt::println(" ./benchmark --algo-filter=ryu,grisu # Only test algorithms containing 'ryu' or 'grisu'"); + fmt::println("\nFor full options list, run: ./benchmark --help"); return EXIT_FAILURE; } } diff --git a/benchmarks/exhaustivefloat32.cpp b/benchmarks/exhaustivefloat32.cpp index 67f671c..fefe656 100644 --- a/benchmarks/exhaustivefloat32.cpp +++ b/benchmarks/exhaustivefloat32.cpp @@ -48,14 +48,14 @@ std::optional parse_float(std::string_view sv) { float result; const char* begin = sv.data(); const char* end = sv.data() + sv.size(); - + auto [ptr, ec] = std::from_chars(begin, end, result); - + // Check if parsing succeeded and consumed the entire string if (ec == std::errc{} && ptr == end) { return result; } - + // Return nullopt if parsing failed or didn't consume all input return std::nullopt; } @@ -76,7 +76,7 @@ void run_exhaustive32(bool errol, const std::vector& algo_filter = fmt::print("# skipping {} because it is the reference.\n", algo.name); continue; } - + // Apply filter if provided if (!algo_filter.empty()) { bool matched = false; @@ -91,7 +91,7 @@ void run_exhaustive32(bool errol, const std::vector& algo_filter = continue; } } - + bool incorrect = false; char buf1[100], buf2[100]; std::span bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2)); diff --git a/benchmarks/ieeeToString.cpp b/benchmarks/ieeeToString.cpp index b70c52a..43daedb 100644 --- a/benchmarks/ieeeToString.cpp +++ b/benchmarks/ieeeToString.cpp @@ -57,7 +57,7 @@ IEEE754d decode_ieee754(double f) { } //////////////////////// -// We should use https://en.cppreference.com/w/cpp/numeric/countl_zero +// We should use https://en.cppreference.com/w/cpp/numeric/countl_zero //////////////////////// #if WE_HAVE_VISUAL_STUDIO inline int leading_zeroes_64(uint64_t input_num) { @@ -84,7 +84,7 @@ inline int leading_zeroes_64(uint64_t input_num) { inline int int_log2_64(uint64_t x) { return 63 - leading_zeroes_64(x | 1); } /** - * Reference: + * Reference: * Daniel Lemire, "Computing the number of digits of an integer even faster," in Daniel Lemire's blog, June 3, 2021, https://lemire.me/blog/2021/06/03/computing-the-number-of-digits-of-an-integer-even-faster/. */ inline int fast_digit_count32(uint32_t x) { @@ -101,7 +101,7 @@ inline int fast_digit_count32(uint32_t x) { /** - * Reference: + * Reference: * Daniel Lemire, "Counting the digits of 64-bit integers," in Daniel Lemire's blog, January 7, 2025, https://lemire.me/blog/2025/01/07/counting-the-digits-of-64-bit-integers/. */ inline int fast_digit_count64(uint64_t x) { @@ -133,12 +133,13 @@ inline int fast_digit_count64(uint64_t x) { template int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { constexpr bool is_double = sizeof(T) == 8; + static_assert(is_double || sizeof(T) == 4, "Unsupported type size"); int index = 0; if (sign) result[index++] = '-'; // We use fast arithmetic to compute the number of digits. - const uint32_t olength = is_double ? fast_digit_count64(mantissa) + const uint32_t olength = is_double ? fast_digit_count64(mantissa) : fast_digit_count32(mantissa); // Print the decimal digits. // for (uint32_t i = 0; i < olength - 1; ++i) { @@ -210,30 +211,32 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) { } // Print the exponent. - result[index++] = 'E'; int32_t exp = exponent + (int32_t) olength - 1; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - - const auto handle_common_cases = [&]() { - if (exp >= 10) { - memcpy(result + index, hundreds_digit_table + 2 * exp, 2); - index += 2; - } else - result[index++] = (char)('0' + exp); - }; - if constexpr (is_double) { - if (exp >= 100) { - const int32_t c = exp % 10; - memcpy(result + index, hundreds_digit_table + 2 * (exp / 10), 2); - result[index + 2] = (char) ('0' + c); - index += 3; + if(mantissa && exp) { // We do not print the exponent if mantissa is zero. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + + const auto handle_common_cases = [&]() { + if (exp >= 10) { + memcpy(result + index, hundreds_digit_table + 2 * exp, 2); + index += 2; + } else + result[index++] = (char)('0' + exp); + }; + if constexpr (is_double) { + if (exp >= 100) { + const int32_t c = exp % 10; + memcpy(result + index, hundreds_digit_table + 2 * (exp / 10), 2); + result[index + 2] = (char) ('0' + c); + index += 3; + } else + handle_common_cases(); } else handle_common_cases(); - } else - handle_common_cases(); + } return index; } diff --git a/benchmarks/thoroughfloat64.cpp b/benchmarks/thoroughfloat64.cpp index 3fe50fd..8f9fe0b 100644 --- a/benchmarks/thoroughfloat64.cpp +++ b/benchmarks/thoroughfloat64.cpp @@ -50,14 +50,14 @@ std::optional parse_double(std::string_view sv) { double result; const char* begin = sv.data(); const char* end = sv.data() + sv.size(); - + auto [ptr, ec] = std::from_chars(begin, end, result); - + // Check if parsing succeeded and consumed the entire string if (ec == std::errc{} && ptr == end) { return result; } - + // Return nullopt if parsing failed or didn't consume all input return std::nullopt; } @@ -72,7 +72,7 @@ std::vector load_doubles_from_file(const std::string& filename) { std::vector numbers; std::ifstream file(filename); std::string line; - + if (!file.is_open()) { fmt::print("Error: Could not open file {}\n", filename); return numbers; @@ -85,7 +85,7 @@ std::vector load_doubles_from_file(const std::string& filename) { fmt::print("Warning: Could not parse '{}' as double, skipping\n", line); } } - + file.close(); return numbers; } @@ -113,7 +113,7 @@ void run_file_test(const std::string& filename, bool errol, const std::vector bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2)); fmt::print("# processing {}", algo.name); fflush(stdout); - + size_t total = test_values.size(); for (size_t i = 0; i < total; ++i) { if (i % (total/10) == 0 && total > 10) { @@ -145,7 +145,7 @@ void run_file_test(const std::string& filename, bool errol, const std::vector