From 06bd7fccda1afc0d5dbd36e35ef2ad8273283dbe Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 18 Apr 2025 16:01:04 -0400
Subject: [PATCH 1/4] some tuning and more options

---
 benchmarks/CMakeLists.txt        |   1 -
 benchmarks/algorithms.h          |   2 +-
 benchmarks/benchmark.cpp         |  38 ++++--
 benchmarks/exhaustivefloat32.cpp |  31 ++++-
 benchmarks/ieeeToString.cpp      | 196 +++++++++++++++++++++----------
 benchmarks/thoroughfloat64.cpp   |  25 +++-
 6 files changed, 212 insertions(+), 81 deletions(-)
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 1750929..5501cf1 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -3,7 +3,6 @@ add_executable(benchmark
 )
 add_library(benchmark_deps INTERFACE)
 add_library(ieeeToString ieeeToString.cpp)
-target_include_directories(ieeeToString PRIVATE ${ryu_SOURCE_DIR})
 target_link_libraries(benchmark_deps INTERFACE ieeeToString)
 include(CheckSourceCompiles)
 check_source_compiles(CXX "
diff --git a/benchmarks/algorithms.h b/benchmarks/algorithms.h
index 800bdf2..7358aca 100644
--- a/benchmarks/algorithms.h
+++ b/benchmarks/algorithms.h
@@ -82,7 +82,7 @@ struct BenchArgs {
   std::string name{};
   int (*func)(T, std::span<char>&){};
   bool used{};
-  unsigned char testRepeat{100};
+  size_t testRepeat{100};
 };
 
 template<arithmetic_float T>
diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
index 74d0112..e2b1667 100644
--- a/benchmarks/benchmark.cpp
+++ b/benchmarks/benchmark.cpp
@@ -26,9 +26,21 @@
 using Benchmarks::arithmetic_float;
 using Benchmarks::BenchArgs;
 
+// Decides whether the algorithm name `str` passes the user-supplied filter:
+// an empty `filter` accepts every name, otherwise `str` must contain at
+// least one of the `filter` entries as a substring.
+bool is_matched(const std::string &str, const std::span<std::string> filter) {
+  for (const std::string &needle : filter) {
+    if (str.find(needle) != std::string::npos) {
+      return true; // one hit is enough
+    }
+  }
+  return filter.empty(); // nothing matched: accept only when unfiltered
+}
+
 template <arithmetic_float T>
 void evaluateProperties(const std::vector<T> &lines,
-                        const std::array<BenchArgs<T>, Benchmarks::COUNT> &args, const std::string& filter = "") {
+                        const std::array<BenchArgs<T>, Benchmarks::COUNT> &args, const std::span<std::string> filter = {}) {
   constexpr auto precision = std::numeric_limits<T>::digits10;
   fmt::println("{:20} {:20}", "Algorithm", "Valid round-trip");
 
@@ -38,7 +50,7 @@ void evaluateProperties(const std::vector<T> &lines,
       continue;
     }
     // Apply filter if provided
-    if (!filter.empty() && std::string(filter).find(algo.name) == std::string::npos) {
+    if (!is_matched(algo.name, filter)) {
       std::cout << "# filtered out " << algo.name << std::endl;
       continue;
     }
@@ -73,14 +85,14 @@ void evaluateProperties(const std::vector<T> &lines,
 
 template <arithmetic_float T>
 void process(const std::vector<T> &lines,
-             const std::array<BenchArgs<T>, Benchmarks::COUNT> &args, const std::string& filter = "") {
+             const std::array<BenchArgs<T>, Benchmarks::COUNT> &args, const std::span<std::string> filter = {}) {
   for (const auto &algo : args) {
     if (!algo.used) {
       std::cout << "# skipping " << algo.name << std::endl;
       continue;
     }
     // Apply filter if provided
-    if (!filter.empty() && std::string(filter).find(algo.name) == std::string::npos) {
+    if (!is_matched(algo.name, filter)) {
       std::cout << "# filtered out " << algo.name << std::endl;
       continue;
     }
@@ -155,8 +167,10 @@ int main(int argc, char **argv) {
         cxxopts::value<bool>()->default_value("false"))
         ("e,errol", "Enable errol3 (current impl. returns invalid values, e.g., for 0).",
         cxxopts::value<bool>()->default_value("false"))
-        ("a,algo-filter", "Filter algorithms by name substring.",
-          cxxopts::value<std::string>()->default_value(""))
+        ("a,algo-filter", "Filter algorithms by name substring: you can use multiple filters separated by commas.",
+        cxxopts::value<std::vector<std::string>>()->default_value(""))
+        ("r,repeat", "Force a number of repetitions.",
+        cxxopts::value<size_t>()->default_value("0"))
         ("h,help", "Print usage.");
     const auto result = options.parse(argc, argv);
 
@@ -164,9 +178,9 @@ int main(int argc, char **argv) {
       std::cout << options.help() << std::endl;
       return EXIT_SUCCESS;
     }
-
+    const size_t repeat = result["repeat"].as<size_t>();
     const bool single = result["single"].as<bool>();
-    const std::string filter = result["algo-filter"].as<std::string>();
+    std::vector<std::string> filter = result["algo-filter"].as<std::vector<std::string>>();
     std::cout << "number type: binary"
               << (single ? "32 (float)" : "64 (double)") << std::endl;
 
@@ -198,6 +212,14 @@ int main(int argc, char **argv) {
     else
       algorithms = Benchmarks::initArgs<double>(errol);
 
+    if(repeat > 0) {
+      std::cout << "# forcing repeat count to " << repeat << std::endl;
+      std::visit([repeat](auto &args) {
+        for (auto &arg : args)
+          arg.testRepeat = repeat;
+      }, algorithms);
+    }
+
     const bool test = result["test"].as<bool>();
     std::visit([test,&filter](const auto &lines, const auto &args) {
       using T1 = typename std::decay_t<decltype(lines)>::value_type;
diff --git a/benchmarks/exhaustivefloat32.cpp b/benchmarks/exhaustivefloat32.cpp
index 65e53f9..67f671c 100644
--- a/benchmarks/exhaustivefloat32.cpp
+++ b/benchmarks/exhaustivefloat32.cpp
@@ -8,6 +8,7 @@
 #include <iostream>
 #include <string_view>
 #include <charconv>
+#include <vector>
 
 #include "algorithms.h"
 #include "cxxopts.hpp"
@@ -59,7 +60,7 @@ std::optional<float> parse_float(std::string_view sv) {
   return std::nullopt;
 }
 
-void run_exhaustive32(bool errol) {
+void run_exhaustive32(bool errol, const std::vector<std::string>& algo_filter = {}) {
   constexpr auto precision = std::numeric_limits<float>::digits10;
   fmt::println("{:20} {:20}", "Algorithm", "Valid shortest serialization");
 
@@ -75,6 +76,22 @@ void run_exhaustive32(bool errol) {
       fmt::print("# skipping {} because it is the reference.\n", algo.name);
       continue;
     }
+    
+    // Apply filter if provided
+    if (!algo_filter.empty()) {
+      bool matched = false;
+      for (const auto &f : algo_filter) {
+        if (algo.name.find(f) != std::string::npos) {
+          matched = true;
+          break;
+        }
+      }
+      if (!matched) {
+        fmt::print("# filtered out {}\n", algo.name);
+        continue;
+      }
+    }
+    
     bool incorrect = false;
     char buf1[100], buf2[100];
     std::span<char> bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2));
@@ -149,15 +166,21 @@ int main(int argc, char **argv) {
     options.add_options()(
         "e,errol",
         "Enable errol3 (current impl. returns invalid values, e.g., for 0).",
-        cxxopts::value<bool>()->default_value("false"))("h,help",
-                                                        "Print usage.");
+        cxxopts::value<bool>()->default_value("false"))(
+        "a,algorithm",
+        "Specify which algorithm(s) to test (comma-separated).",
+        cxxopts::value<std::vector<std::string>>()->default_value({}))(
+        "h,help",
+        "Print usage.");
     const auto result = options.parse(argc, argv);
 
     if (result["help"].as<bool>()) {
       fmt::print("{}\n", options.help());
       return EXIT_SUCCESS;
     }
-    run_exhaustive32(result["errol"].as<bool>());
+
+    auto algo_filter = result["algorithm"].as<std::vector<std::string>>();
+    run_exhaustive32(result["errol"].as<bool>(), algo_filter);
   } catch (const std::exception &e) {
     fmt::print("error parsing options: {}\n", e.what());
     return EXIT_FAILURE;
diff --git a/benchmarks/ieeeToString.cpp b/benchmarks/ieeeToString.cpp
index 783e24a..9df38b4 100644
--- a/benchmarks/ieeeToString.cpp
+++ b/benchmarks/ieeeToString.cpp
@@ -1,11 +1,40 @@
 #include "ieeeToString.h"
 
 #include <bit>
+#include <cstdlib>
+#include <stdio.h>
 #include <cassert>
 #include <cstring>
+#ifdef _MSC_VER
+#ifdef __clang__
+#define WE_HAVE_CLANGCL 1
+#else
+#define WE_HAVE_VISUAL_STUDIO 1
+#include <intrin.h>
+#endif
+#endif
 
-#include "ryu/digit_table.h" // For DIGIT_TABLE
-#include "ryu/common.h" // For decimalLength9
+static const char hundreds_digit_table[200] = {
+  '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
+  '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
+  '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
+  '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
+  '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
+  '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
+  '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
+  '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
+  '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
+  '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
+  '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
+  '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
+  '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
+  '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
+  '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
+  '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
+  '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
+  '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
+  '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
+  '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'};
 
 IEEE754f decode_ieee754(float f) {
   const uint32_t bits = std::bit_cast<uint32_t>(f);
@@ -27,31 +56,77 @@ IEEE754d decode_ieee754(double f) {
   return decomposed;
 }
 
-// Extracted from the Ryu implementation.
-static inline uint32_t decimalLength17(const uint64_t v) {
-  // Function precondition: v is not an 18, 19, or 20-digit number.
-  // (17 digits are sufficient for round-tripping.)
-  assert(v < 100000000000000000L);
-
-  // Slightly faster than a loop.
-  // Average output length is 16.38 digits, so we check high-to-low.
-  if (v >= 10000000000000000L) { return 17; }
-  if (v >= 1000000000000000L) { return 16; }
-  if (v >= 100000000000000L) { return 15; }
-  if (v >= 10000000000000L) { return 14; }
-  if (v >= 1000000000000L) { return 13; }
-  if (v >= 100000000000L) { return 12; }
-  if (v >= 10000000000L) { return 11; }
-  if (v >= 1000000000L) { return 10; }
-  if (v >= 100000000L) { return 9; }
-  if (v >= 10000000L) { return 8; }
-  if (v >= 1000000L) { return 7; }
-  if (v >= 100000L) { return 6; }
-  if (v >= 10000L) { return 5; }
-  if (v >= 1000L) { return 4; }
-  if (v >= 100L) { return 3; }
-  if (v >= 10L) { return 2; }
-  return 1;
+////////////////////////
+// We should use https://en.cppreference.com/w/cpp/numeric/countl_zero 
+////////////////////////
+#if WE_HAVE_VISUAL_STUDIO
+inline int leading_zeroes_64(uint64_t input_num) {
+  unsigned long index;
+#ifdef _WIN64  // highly recommended!!!
+  _BitScanReverse64(&index, input_num);
+#else   // if we must support 32-bit Windows
+  if (input_num > 0xFFFFFFFF) {
+      _BitScanReverse(&index, (uint32_t)(input_num >> 32));
+      index += 32;
+  } else {
+      _BitScanReverse(&index, (uint32_t)(input_num));
+  }
+#endif  // _WIN64
+  return 63 - index;
+}
+#else
+inline int leading_zeroes_64(uint64_t input_num) {
+  return __builtin_clzll(input_num);
+}
+#endif
+
+
+inline int int_log2_64(uint64_t x) { return 63 - leading_zeroes_64(x | 1); }
+
+/**
+ * Reference:  
+ * Daniel Lemire, "Computing the number of digits of an integer even faster," in Daniel Lemire's blog, June 3, 2021, https://lemire.me/blog/2021/06/03/computing-the-number-of-digits-of-an-integer-even-faster/.
+ */
+inline int fast_digit_count32(uint32_t x) {
+  static uint64_t table[] = {
+      4294967296,  8589934582,  8589934582,  8589934582,  12884901788,
+      12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
+      21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
+      25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
+      34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
+      38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
+      42949672960, 42949672960};
+  return uint32_t((x + table[int_log2_64(x)]) >> 32);
+}
+
+
+/**
+ * Reference:  
+ * Daniel Lemire, "Counting the digits of 64-bit integers," in Daniel Lemire's blog, January 7, 2025, https://lemire.me/blog/2025/01/07/counting-the-digits-of-64-bit-integers/.
+ */
+inline int fast_digit_count64(uint64_t x) {
+  static uint64_t table[] = {9,
+                             99,
+                             999,
+                             9999,
+                             99999,
+                             999999,
+                             9999999,
+                             99999999,
+                             999999999,
+                             9999999999,
+                             99999999999,
+                             999999999999,
+                             9999999999999,
+                             99999999999999,
+                             999999999999999ULL,
+                             9999999999999999ULL,
+                             99999999999999999ULL,
+                             999999999999999999ULL,
+                             9999999999999999999ULL};
+  int y = (19 * int_log2_64(x) >> 6);
+  y += x > table[y];
+  return y + 1;
 }
 
 // Adapted from the Ryu implementation.
@@ -63,9 +138,8 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   if (sign)
     result[index++] = '-';
 
-  const uint32_t olength = is_double ? decimalLength17(mantissa)
-                                     : decimalLength9(mantissa);
-
+  const uint32_t olength = is_double ? fast_digit_count64(mantissa) 
+                                     : fast_digit_count32(mantissa);
   // Print the decimal digits.
   // for (uint32_t i = 0; i < olength - 1; ++i) {
   //   const uint32_t c = mantissa % 10; mantissa /= 10;
@@ -74,33 +148,29 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   // result[index] = '0' + mantissa % 10;
 
   uint32_t i = 0;
-  if constexpr (is_double) {
-    // We prefer 32-bit operations, even on 64-bit platforms.
-    // We have at most 17 digits, and uint32_t can store 9 digits.
-    // If mantissa doesn't fit into uint32_t, we cut off 8 digits,
-    // so the rest will fit into uint32_t.
-    if ((mantissa >> 32) != 0) {
-      // Expensive 64-bit division.
-      const uint64_t q = mantissa / 100'000'000;
-      uint32_t temp = ((uint32_t) mantissa) - 100'000'000 * ((uint32_t) q);
-      mantissa = q;
-
-      const uint32_t c = temp % 10000;
-      temp /= 10000;
-      const uint32_t d = temp % 10000;
-      const uint32_t c0 = (c % 100) << 1;
-      const uint32_t c1 = (c / 100) << 1;
-      const uint32_t d0 = (d % 100) << 1;
-      const uint32_t d1 = (d / 100) << 1;
-      memcpy(result + index + olength - 1, DIGIT_TABLE + c0, 2);
-      memcpy(result + index + olength - 3, DIGIT_TABLE + c1, 2);
-      memcpy(result + index + olength - 5, DIGIT_TABLE + d0, 2);
-      memcpy(result + index + olength - 7, DIGIT_TABLE + d1, 2);
-      i += 8;
-    }
+    // We take care of the least significant eight digits first.
+  if (mantissa >= 100'000'000) {
+    // Expensive 64-bit division.
+    const uint64_t q = mantissa / 100'000'000;
+    uint32_t temp = mantissa % 100'000'000;
+    mantissa = q;
+
+    const uint32_t c = temp % 10000;
+    temp /= 10000;
+    const uint32_t d = temp % 10000;
+    const uint32_t c0 = (c % 100) << 1;
+    const uint32_t c1 = (c / 100) << 1;
+    const uint32_t d0 = (d % 100) << 1;
+    const uint32_t d1 = (d / 100) << 1;
+    memcpy(result + index + olength - 1, hundreds_digit_table + c0, 2);
+    memcpy(result + index + olength - 3, hundreds_digit_table + c1, 2);
+    memcpy(result + index + olength - 5, hundreds_digit_table + d0, 2);
+    memcpy(result + index + olength - 7, hundreds_digit_table + d1, 2);
+    i += 8;
   }
 
-  uint32_t output = (uint32_t) mantissa;
+
+  uint64_t output = mantissa;
   while (output >= 10000) {
 #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
     const uint32_t c = output - 10000 * (output / 10000);
@@ -110,21 +180,21 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
     output /= 10000;
     const uint32_t c0 = (c % 100) << 1;
     const uint32_t c1 = (c / 100) << 1;
-    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c0, 2);
-    memcpy(result + index + olength - i - 3, DIGIT_TABLE + c1, 2);
+    memcpy(result + index + olength - i - 1, hundreds_digit_table + c0, 2);
+    memcpy(result + index + olength - i - 3, hundreds_digit_table + c1, 2);
     i += 4;
   }
   if (output >= 100) {
     const uint32_t c = (output % 100) << 1;
     output /= 100;
-    memcpy(result + index + olength - i - 1, DIGIT_TABLE + c, 2);
+    memcpy(result + index + olength - i - 1, hundreds_digit_table + c, 2);
     i += 2;
   }
   if (output >= 10) {
-    const uint32_t c = output << 1;
+    const uint64_t c = output << 1;
     // We can't use memcpy here: the decimal dot goes between these two digits.
-    result[index + olength - i] = DIGIT_TABLE[c + 1];
-    result[index] = DIGIT_TABLE[c];
+    result[index + olength - i] = hundreds_digit_table[c + 1];
+    result[index] = hundreds_digit_table[c];
   } else {
     result[index] = (char) ('0' + output);
   }
@@ -147,7 +217,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
 
   const auto handle_common_cases = [&]() {
     if (exp >= 10) {
-      memcpy(result + index, DIGIT_TABLE + 2 * exp, 2);
+      memcpy(result + index, hundreds_digit_table + 2 * exp, 2);
       index += 2;
     } else
       result[index++] = (char)('0' + exp);
@@ -155,7 +225,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   if constexpr (is_double) {
     if (exp >= 100) {
       const int32_t c = exp % 10;
-      memcpy(result + index, DIGIT_TABLE + 2 * (exp / 10), 2);
+      memcpy(result + index, hundreds_digit_table + 2 * (exp / 10), 2);
       result[index + 2] = (char) ('0' + c);
       index += 3;
     } else
diff --git a/benchmarks/thoroughfloat64.cpp b/benchmarks/thoroughfloat64.cpp
index 1109ab4..3fe50fd 100644
--- a/benchmarks/thoroughfloat64.cpp
+++ b/benchmarks/thoroughfloat64.cpp
@@ -90,7 +90,7 @@ std::vector<test_case> load_doubles_from_file(const std::string& filename) {
   return numbers;
 }
 
-void run_file_test(const std::string& filename, bool errol) {
+void run_file_test(const std::string& filename, bool errol, const std::vector<std::string>& algo_filter = {}) {
   constexpr auto precision = std::numeric_limits<double>::digits10;
   fmt::println("{:20} {:20}", "Algorithm", "Valid shortest serialization");
 
@@ -113,6 +113,22 @@ void run_file_test(const std::string& filename, bool errol) {
       fmt::print("# skipping {} because it is the reference.\n", algo.name);
       continue;
     }
+    
+    // Apply filter if provided
+    if (!algo_filter.empty()) {
+      bool matched = false;
+      for (const auto &f : algo_filter) {
+        if (algo.name.find(f) != std::string::npos) {
+          matched = true;
+          break;
+        }
+      }
+      if (!matched) {
+        fmt::print("# filtered out {}\n", algo.name);
+        continue;
+      }
+    }
+    
     bool incorrect = false;
     char buf1[100], buf2[100];
     std::span<char> bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2));
@@ -190,6 +206,9 @@ int main(int argc, char **argv) {
         ("f,file",
          "Input file containing doubles (one per line)",
          cxxopts::value<std::string>()->default_value(THOROUGH_DATA_FILE))
+        ("a,algorithm",
+         "Filter algorithms to test (comma-separated)",
+         cxxopts::value<std::vector<std::string>>()->default_value(""))
         ("h,help",
          "Print usage.");
     const auto result = options.parse(argc, argv);
@@ -198,9 +217,7 @@ int main(int argc, char **argv) {
       fmt::print("{}\n", options.help());
       return EXIT_SUCCESS;
     }
-
-
-    run_file_test(result["file"].as<std::string>(), result["errol"].as<bool>());
+    run_file_test(result["file"].as<std::string>(), result["errol"].as<bool>(), result["algorithm"].as<std::vector<std::string>>());
   } catch (const std::exception &e) {
     fmt::print("error parsing options: {}\n", e.what());
     return EXIT_FAILURE;

From ff195978be1224ffd7f484d3d0b13793887b2bf4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 18 Apr 2025 16:36:04 -0400
Subject: [PATCH 2/4] adding 'just the string' conversion.

---
 benchmarks/benchmark.cpp | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
index e2b1667..0d68d20 100644
--- a/benchmarks/benchmark.cpp
+++ b/benchmarks/benchmark.cpp
@@ -8,6 +8,7 @@
  */
 
 #include "algorithms.h"
+#include <vector>
 #define IEEE_8087
 #include "benchutil.h"
 #include "cxxopts.hpp"
@@ -83,9 +84,35 @@ void evaluateProperties(const std::vector<T> &lines,
   }
 }
 
+struct diy_float_t { // holds the three fields process() copies out of a jkj::grisu_exact() result
+		uint64_t	significand; // handed to to_chars() as its `mantissa` argument
+		int							exponent; // handed to to_chars() as `exponent`; presumably a power of ten -- TODO confirm
+		bool						is_negative; // handed to to_chars() as `sign`
+};
+
 template <arithmetic_float T>
 void process(const std::vector<T> &lines,
              const std::array<BenchArgs<T>, Benchmarks::COUNT> &args, const std::span<std::string> filter = {}) {
+  // We have a special algorithm for the reference:
+  std::string just_string = "just_string";
+  if (is_matched(just_string, filter)) {
+    std::vector<diy_float_t> parsed;
+    for(auto d : lines) {
+      auto v = jkj::grisu_exact(d);
+      parsed.emplace_back(v.significand, v.exponent, v.is_negative);
+    }
+    pretty_print(parsed, just_string, [](const std::vector<diy_float_t>& parsed) -> int {
+      int volume = 0;
+      char buf[100];
+      std::span<char> bufspan(buf, sizeof(buf));
+      for (const auto v : parsed)
+        volume +=  to_chars(v.significand, v.exponent, v.is_negative, bufspan.data());
+      return volume;
+    }, 100);
+  } else {
+    std::cout << "# skipping " << just_string << std::endl;
+  
+  }
   for (const auto &algo : args) {
     if (!algo.used) {
       std::cout << "# skipping " << algo.name << std::endl;
@@ -105,6 +132,7 @@ void process(const std::vector<T> &lines,
       return volume;
     }, algo.testRepeat);
   }
+
 }
 
 template <typename T>

From a684d308c8a45da2cf94bd64aceba91e969f6150 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 18 Apr 2025 16:48:58 -0400
Subject: [PATCH 3/4] some comments

---
 benchmarks/ieeeToString.cpp | 40 +++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/benchmarks/ieeeToString.cpp b/benchmarks/ieeeToString.cpp
index 9df38b4..b70c52a 100644
--- a/benchmarks/ieeeToString.cpp
+++ b/benchmarks/ieeeToString.cpp
@@ -137,7 +137,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   int index = 0;
   if (sign)
     result[index++] = '-';
-
+  // We use fast arithmetic to compute the number of digits.
   const uint32_t olength = is_double ? fast_digit_count64(mantissa) 
                                      : fast_digit_count32(mantissa);
   // Print the decimal digits.
@@ -146,22 +146,25 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   //   result[index + olength - i] = (char) ('0' + c);
   // }
   // result[index] = '0' + mantissa % 10;
-
+  //////////////////
+  // Performance:
+  // On 64-bit systems, 32-bit arithmetic is no faster than 64-bit,
+  // and sometimes slower.
+  /////////////////
   uint32_t i = 0;
-    // We take care of the least significant eight digits first.
+  // We take care of the least significant eight digits first.
   if (mantissa >= 100'000'000) {
-    // Expensive 64-bit division.
     const uint64_t q = mantissa / 100'000'000;
-    uint32_t temp = mantissa % 100'000'000;
+    uint64_t temp = mantissa % 100'000'000;
     mantissa = q;
 
-    const uint32_t c = temp % 10000;
+    const uint64_t c = temp % 10000;
     temp /= 10000;
-    const uint32_t d = temp % 10000;
-    const uint32_t c0 = (c % 100) << 1;
-    const uint32_t c1 = (c / 100) << 1;
-    const uint32_t d0 = (d % 100) << 1;
-    const uint32_t d1 = (d / 100) << 1;
+    const uint64_t d = temp % 10000;
+    const uint64_t c0 = (c % 100) << 1;
+    const uint64_t c1 = (c / 100) << 1;
+    const uint64_t d0 = (d % 100) << 1;
+    const uint64_t d1 = (d / 100) << 1;
     memcpy(result + index + olength - 1, hundreds_digit_table + c0, 2);
     memcpy(result + index + olength - 3, hundreds_digit_table + c1, 2);
     memcpy(result + index + olength - 5, hundreds_digit_table + d0, 2);
@@ -171,25 +174,24 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
 
 
   uint64_t output = mantissa;
+  // Next, we proceed in block of 4 digits.
   while (output >= 10000) {
-#ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
-    const uint32_t c = output - 10000 * (output / 10000);
-#else
-    const uint32_t c = output % 10000;
-#endif
+    const uint64_t c = output % 10000;
     output /= 10000;
-    const uint32_t c0 = (c % 100) << 1;
-    const uint32_t c1 = (c / 100) << 1;
+    const uint64_t c0 = (c % 100) << 1;
+    const uint64_t c1 = (c / 100) << 1;
     memcpy(result + index + olength - i - 1, hundreds_digit_table + c0, 2);
     memcpy(result + index + olength - i - 3, hundreds_digit_table + c1, 2);
     i += 4;
   }
+  // We can take care of two digits out of the 2 or 3 remaining.
   if (output >= 100) {
-    const uint32_t c = (output % 100) << 1;
+    const uint64_t c = (output % 100) << 1;
     output /= 100;
     memcpy(result + index + olength - i - 1, hundreds_digit_table + c, 2);
     i += 2;
   }
+  // Last digit.
   if (output >= 10) {
     const uint64_t c = output << 1;
     // We can't use memcpy here: the decimal dot goes between these two digits.

From 2dd412ac008458b47d5ed0c42729f11cf421b083 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 18 Apr 2025 17:52:01 -0400
Subject: [PATCH 4/4] more tuning

---
 benchmarks/benchmark.cpp         | 53 ++++++++++++++++++--------------
 benchmarks/exhaustivefloat32.cpp | 10 +++---
 benchmarks/ieeeToString.cpp      | 53 +++++++++++++++++---------------
 benchmarks/thoroughfloat64.cpp   | 20 ++++++------
 4 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp
index 0d68d20..00b9f2b 100644
--- a/benchmarks/benchmark.cpp
+++ b/benchmarks/benchmark.cpp
@@ -23,6 +23,7 @@
 #include <string>
 #include <variant>
 #include <fast_float/fast_float.h>
+#include <fmt/core.h>
 
 using Benchmarks::arithmetic_float;
 using Benchmarks::BenchArgs;
@@ -47,12 +48,12 @@ void evaluateProperties(const std::vector<T> &lines,
 
   for (const auto &algo : args) {
     if (!algo.used) {
-      std::cout << "# skipping " << algo.name << std::endl;
+      fmt::println("# skipping {}", algo.name);
       continue;
     }
     // Apply filter if provided
     if (!is_matched(algo.name, filter)) {
-      std::cout << "# filtered out " << algo.name << std::endl;
+      fmt::println("# filtered out {}", algo.name);
       continue;
     }
     char buf1[100], buf2[100];
@@ -93,7 +94,7 @@ struct diy_float_t {
 template <arithmetic_float T>
 void process(const std::vector<T> &lines,
              const std::array<BenchArgs<T>, Benchmarks::COUNT> &args, const std::span<std::string> filter = {}) {
-  // We have a special algorithm for the reference:
+  // We have a special algorithm for the string generation:
   std::string just_string = "just_string";
   if (is_matched(just_string, filter)) {
     std::vector<diy_float_t> parsed;
@@ -110,17 +111,16 @@ void process(const std::vector<T> &lines,
       return volume;
     }, 100);
   } else {
-    std::cout << "# skipping " << just_string << std::endl;
-  
+    fmt::println("# skipping {}", just_string);
   }
   for (const auto &algo : args) {
     if (!algo.used) {
-      std::cout << "# skipping " << algo.name << std::endl;
+      fmt::println("# skipping {}", algo.name);
       continue;
     }
     // Apply filter if provided
     if (!is_matched(algo.name, filter)) {
-      std::cout << "# filtered out " << algo.name << std::endl;
+      fmt::println("# filtered out {}", algo.name);
       continue;
     }
     pretty_print(lines, algo.name, [&algo](const std::vector<T> &lines) -> int {
@@ -139,7 +139,7 @@ template <typename T>
 std::vector<T> fileload(const std::string &filename) {
   std::ifstream inputfile(filename);
   if (!inputfile) {
-    std::cerr << "can't open " << filename << std::endl;
+    fmt::print(stderr, "can't open {}\n", filename);
     return {};
   }
 
@@ -150,24 +150,21 @@ std::vector<T> fileload(const std::string &filename) {
       lines.push_back(std::is_same_v<T, float> ? std::stof(line)
                                                : std::stod(line));
     } catch (...) {
-      std::cerr << "problem with " << line << "\n"
-                << "We expect floating-point numbers (one per line)."
-                << std::endl;
+      fmt::print(stderr, "problem with {}\nWe expect floating-point numbers (one per line).\n", line);
       std::abort();
     }
   }
-  std::cout << "# read " << lines.size() << " lines " << std::endl;
+  fmt::println("# read {} lines", lines.size());
   return lines;
 }
 
 template <typename T>
 std::vector<T> get_random_numbers(size_t howmany,
                                   const std::string &random_model) {
-  std::cout << "# parsing random numbers" << std::endl;
+  fmt::println("# parsing random numbers");
   std::vector<T> lines;
   auto g = get_generator_by_name<T>(random_model);
-  std::cout << "model: " << g->describe() << "\n"
-            << "volume: " << howmany << " floats" << std::endl;
+  fmt::print("model: {}\nvolume: {} floats\n", g->describe(), howmany);
   lines.reserve(howmany); // let us reserve plenty of memory.
   for (size_t i = 0; i < howmany; i++) {
     const T line = g->new_float();
@@ -203,14 +200,13 @@ int main(int argc, char **argv) {
     const auto result = options.parse(argc, argv);
 
     if (result["help"].as<bool>()) {
-      std::cout << options.help() << std::endl;
+      fmt::print("{}\n", options.help());
       return EXIT_SUCCESS;
     }
     const size_t repeat = result["repeat"].as<size_t>();
     const bool single = result["single"].as<bool>();
     std::vector<std::string> filter = result["algo-filter"].as<std::vector<std::string>>();
-    std::cout << "number type: binary"
-              << (single ? "32 (float)" : "64 (double)") << std::endl;
+    fmt::println("number type: binary{}", (single ? "32 (float)" : "64 (double)"));
 
     std::variant<std::vector<float>, std::vector<double>> numbers;
     const auto filename = result["file"].as<std::string>();
@@ -221,9 +217,7 @@ int main(int argc, char **argv) {
         numbers = get_random_numbers<float>(volume, model);
       else
         numbers = get_random_numbers<double>(volume, model);
-      std::cout << "# You can also provide a filename (with the -f flag): "
-                   "it should contain one string per line corresponding to a number"
-                << std::endl;
+      fmt::println("# You can also provide a filename (with the -f flag): it should contain one string per line corresponding to a number");
     }
     else {
       if (single)
@@ -241,7 +235,7 @@ int main(int argc, char **argv) {
       algorithms = Benchmarks::initArgs<double>(errol);
 
     if(repeat > 0) {
-      std::cout << "# forcing repeat count to " << repeat << std::endl;
+      fmt::println("# forcing repeat count to {}", repeat);
       std::visit([repeat](auto &args) {
         for (auto &arg : args)
           arg.testRepeat = repeat;
@@ -260,7 +254,20 @@ int main(int argc, char **argv) {
       }
     }, numbers, algorithms);
   } catch (const std::exception &e) {
-    std::cout << "error parsing options: " << e.what() << std::endl;
+    fmt::println("Error parsing options: {}", e.what());
+    fmt::println("\nUSAGE GUIDE:");
+    fmt::println("  ./benchmark [OPTIONS]");
+    fmt::println("\nCOMMAND SUMMARY:");
+    fmt::println("  The benchmark tool evaluates the performance of different floating-point to string");
+    fmt::println("  conversion algorithms. It can use either synthetic data or a file containing");
+    fmt::println("  floating-point numbers (one per line).");
+    fmt::println("\nEXAMPLES:");
+    fmt::println("  ./benchmark --single                    # Run benchmark with single precision (float)");
+    fmt::println("  ./benchmark --file=data/canada.txt      # Run benchmark using numbers from a file");
+    fmt::println("  ./benchmark --test                      # Test correctness instead of performance");
+    fmt::println("  ./benchmark --volume=1000 --model=uniform # Generate 1000 uniform random numbers");
+    fmt::println("  ./benchmark --algo-filter=ryu,grisu     # Only test algorithms containing 'ryu' or 'grisu'");
+    fmt::println("\nFor full options list, run: ./benchmark --help");
     return EXIT_FAILURE;
   }
 }
diff --git a/benchmarks/exhaustivefloat32.cpp b/benchmarks/exhaustivefloat32.cpp
index 67f671c..fefe656 100644
--- a/benchmarks/exhaustivefloat32.cpp
+++ b/benchmarks/exhaustivefloat32.cpp
@@ -48,14 +48,14 @@ std::optional<float> parse_float(std::string_view sv) {
   float result;
   const char* begin = sv.data();
   const char* end = sv.data() + sv.size();
-  
+
   auto [ptr, ec] = std::from_chars(begin, end, result);
-  
+
   // Check if parsing succeeded and consumed the entire string
   if (ec == std::errc{} && ptr == end) {
       return result;
   }
-  
+
   // Return nullopt if parsing failed or didn't consume all input
   return std::nullopt;
 }
@@ -76,7 +76,7 @@ void run_exhaustive32(bool errol, const std::vector<std::string>& algo_filter =
       fmt::print("# skipping {} because it is the reference.\n", algo.name);
       continue;
     }
-    
+
     // Apply filter if provided
     if (!algo_filter.empty()) {
       bool matched = false;
@@ -91,7 +91,7 @@ void run_exhaustive32(bool errol, const std::vector<std::string>& algo_filter =
         continue;
       }
     }
-    
+
     bool incorrect = false;
     char buf1[100], buf2[100];
     std::span<char> bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2));
diff --git a/benchmarks/ieeeToString.cpp b/benchmarks/ieeeToString.cpp
index b70c52a..43daedb 100644
--- a/benchmarks/ieeeToString.cpp
+++ b/benchmarks/ieeeToString.cpp
@@ -57,7 +57,7 @@ IEEE754d decode_ieee754(double f) {
 }
 
 ////////////////////////
-// We should use https://en.cppreference.com/w/cpp/numeric/countl_zero 
+// We should use https://en.cppreference.com/w/cpp/numeric/countl_zero
 ////////////////////////
 #if WE_HAVE_VISUAL_STUDIO
 inline int leading_zeroes_64(uint64_t input_num) {
@@ -84,7 +84,7 @@ inline int leading_zeroes_64(uint64_t input_num) {
 inline int int_log2_64(uint64_t x) { return 63 - leading_zeroes_64(x | 1); }
 
 /**
- * Reference:  
+ * Reference:
  * Daniel Lemire, "Computing the number of digits of an integer even faster," in Daniel Lemire's blog, June 3, 2021, https://lemire.me/blog/2021/06/03/computing-the-number-of-digits-of-an-integer-even-faster/.
  */
 inline int fast_digit_count32(uint32_t x) {
@@ -101,7 +101,7 @@ inline int fast_digit_count32(uint32_t x) {
 
 
 /**
- * Reference:  
+ * Reference:
  * Daniel Lemire, "Counting the digits of 64-bit integers," in Daniel Lemire's blog, January 7, 2025, https://lemire.me/blog/2025/01/07/counting-the-digits-of-64-bit-integers/.
  */
 inline int fast_digit_count64(uint64_t x) {
@@ -133,12 +133,13 @@ inline int fast_digit_count64(uint64_t x) {
 template <typename T>
 int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   constexpr bool is_double = sizeof(T) == 8;
+  static_assert(is_double || sizeof(T) == 4, "Unsupported type size");
 
   int index = 0;
   if (sign)
     result[index++] = '-';
   // We use fast arithmetic to compute the number of digits.
-  const uint32_t olength = is_double ? fast_digit_count64(mantissa) 
+  const uint32_t olength = is_double ? fast_digit_count64(mantissa)
                                      : fast_digit_count32(mantissa);
   // Print the decimal digits.
   // for (uint32_t i = 0; i < olength - 1; ++i) {
@@ -210,30 +211,32 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
   }
 
   // Print the exponent.
-  result[index++] = 'E';
   int32_t exp = exponent + (int32_t) olength - 1;
-  if (exp < 0) {
-    result[index++] = '-';
-    exp = -exp;
-  }
-
-  const auto handle_common_cases = [&]() {
-    if (exp >= 10) {
-      memcpy(result + index, hundreds_digit_table + 2 * exp, 2);
-      index += 2;
-    } else
-      result[index++] = (char)('0' + exp);
-  };
-  if constexpr (is_double) {
-    if (exp >= 100) {
-      const int32_t c = exp % 10;
-      memcpy(result + index, hundreds_digit_table + 2 * (exp / 10), 2);
-      result[index + 2] = (char) ('0' + c);
-      index += 3;
+  if(mantissa && exp) { // We do not print the exponent if the mantissa is zero or the decimal exponent is zero.
+    result[index++] = 'E';
+    if (exp < 0) {
+      result[index++] = '-';
+      exp = -exp;
+    }
+
+    const auto handle_common_cases = [&]() {
+      if (exp >= 10) {
+        memcpy(result + index, hundreds_digit_table + 2 * exp, 2);
+        index += 2;
+      } else
+        result[index++] = (char)('0' + exp);
+    };
+    if constexpr (is_double) {
+      if (exp >= 100) {
+        const int32_t c = exp % 10;
+        memcpy(result + index, hundreds_digit_table + 2 * (exp / 10), 2);
+        result[index + 2] = (char) ('0' + c);
+        index += 3;
+      } else
+        handle_common_cases();
     } else
       handle_common_cases();
-  } else
-    handle_common_cases();
+  }
 
   return index;
 }
diff --git a/benchmarks/thoroughfloat64.cpp b/benchmarks/thoroughfloat64.cpp
index 3fe50fd..8f9fe0b 100644
--- a/benchmarks/thoroughfloat64.cpp
+++ b/benchmarks/thoroughfloat64.cpp
@@ -50,14 +50,14 @@ std::optional<double> parse_double(std::string_view sv) {
   double result;
   const char* begin = sv.data();
   const char* end = sv.data() + sv.size();
-  
+
   auto [ptr, ec] = std::from_chars(begin, end, result);
-  
+
   // Check if parsing succeeded and consumed the entire string
   if (ec == std::errc{} && ptr == end) {
       return result;
   }
-  
+
   // Return nullopt if parsing failed or didn't consume all input
   return std::nullopt;
 }
@@ -72,7 +72,7 @@ std::vector<test_case> load_doubles_from_file(const std::string& filename) {
   std::vector<test_case> numbers;
   std::ifstream file(filename);
   std::string line;
-  
+
   if (!file.is_open()) {
     fmt::print("Error: Could not open file {}\n", filename);
     return numbers;
@@ -85,7 +85,7 @@ std::vector<test_case> load_doubles_from_file(const std::string& filename) {
       fmt::print("Warning: Could not parse '{}' as double, skipping\n", line);
     }
   }
-  
+
   file.close();
   return numbers;
 }
@@ -113,7 +113,7 @@ void run_file_test(const std::string& filename, bool errol, const std::vector<st
       fmt::print("# skipping {} because it is the reference.\n", algo.name);
       continue;
     }
-    
+
     // Apply filter if provided
     if (!algo_filter.empty()) {
       bool matched = false;
@@ -128,13 +128,13 @@ void run_file_test(const std::string& filename, bool errol, const std::vector<st
         continue;
       }
     }
-    
+
     bool incorrect = false;
     char buf1[100], buf2[100];
     std::span<char> bufRef(buf1, sizeof(buf1)), bufAlgo(buf2, sizeof(buf2));
     fmt::print("# processing {}", algo.name);
     fflush(stdout);
-    
+
     size_t total = test_values.size();
     for (size_t i = 0; i < total; ++i) {
       if (i % (total/10) == 0 && total > 10) {
@@ -145,7 +145,7 @@ void run_file_test(const std::string& filename, bool errol, const std::vector<st
       const std::string& str_value = test_values[i].str_value;
       if (std::isnan(d) || std::isinf(d))
         continue;
-      
+
       const size_t vRef = Benchmarks::dragonbox(d, bufRef);
       const size_t vAlgo = algo.func(d, bufAlgo);
 
@@ -157,7 +157,7 @@ void run_file_test(const std::string& filename, bool errol, const std::vector<st
       auto countAlgo = count_significant_digits(svAlgo);
       auto backRef = parse_double(svRef);
       auto backAlgo = parse_double(svAlgo);
-      
+
       if(!backRef || !backAlgo) {
         incorrect = true;
         fmt::print(" parse error: case: {}; d = {}, bufRef = {}, bufAlgo = {}", str_value, double_to_hex(d),