From dcb065bf9fe3f11f6d9a6bc10f7431ae9edcd4db Mon Sep 17 00:00:00 2001
From: Raziel Alphadios <64050682+RazielXYZ@users.noreply.github.com>
Date: Sun, 26 Jan 2025 22:35:57 +0200
Subject: [PATCH] Porting to MSVC (#28)

* Improve: better support for Windows and MSVC
* Improve: Move logging around
* Improve: CMakeLists for MSVC some more
* Fix: Missing OpenBLAS config and macros
* Make: Uniform conditions for MSVC
* Docs: Style and links
* Docs: Mention MSVC compatibility
* Docs: Notes on Chrono

---------

Co-authored-by: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
---
 .vscode/settings.json |   3 +
 CMakeLists.txt        | 109 +++++++++++++-------
 README.md             |   5 +-
 less_slow.cpp         | 230 ++++++++++++++++++++++++++++++------------
 4 files changed, 245 insertions(+), 102 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index f04713c..95fa57a 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -17,6 +17,7 @@
     "colsb",
     "consteval",
     "coro",
+    "cplusplus",
     "cppcoro",
     "CTRE",
     "CUDA",
@@ -24,6 +25,7 @@
     "DOTPROD",
     "Dusíková",
     "Eigen",
+    "Eron",
     "excerise",
     "fconcepts",
     "Fedor",
@@ -53,6 +55,7 @@
     "Niels",
     "nlohmann",
     "NVCC",
+    "openblas",
     "openmp",
     "Ormrod",
     "Peta",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5c8d8b..96c4a21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 # Let's used CMake 3.16+ for native sanitizers support
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR) 
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 
 # ------------------------------------------------------------------------------
 # Project Setup
@@ -32,17 +32,25 @@ endif()
 # ------------------------------------------------------------------------------
 find_package(Threads REQUIRED)
 find_package(OpenMP REQUIRED)
-find_package(BLAS REQUIRED)
-if (BLAS_FOUND)
-    message(STATUS "BLAS found: ${BLAS_LIBRARIES}")
-else ()
-    message(FATAL_ERROR "BLAS not found")
-endif ()
-
 
 set(FETCHCONTENT_QUIET OFF)
 include(FetchContent)
 
+# Fetch and build OpenBLAS from source, pinned to a release tag
+FetchContent_Declare(
+  OpenBLAS
+  GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
+  GIT_TAG v0.3.29
+)
+
+# Set OpenBLAS build options before it is configured below; FORCE overwrites cached values
+set(NOFORTRAN ON CACHE BOOL "Disable Fortran" FORCE)
+set(BUILD_WITHOUT_LAPACK OFF CACHE BOOL "Build without LAPACK" FORCE)
+set(USE_THREAD ON CACHE BOOL "Use threading" FORCE)
+
+# Make OpenBLAS available: populates the sources and adds them to this build
+FetchContent_MakeAvailable(OpenBLAS)
+
 # GTest (required by Google Benchmark)
 FetchContent_Declare(
   GoogleTest
@@ -105,7 +113,7 @@ endif()
 FetchContent_Declare(
   VictorZverovichFMT
   GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-  GIT_TAG 11.1.0
+  GIT_TAG 11.1.2
 )
 FetchContent_MakeAvailable(VictorZverovichFMT)
 
@@ -189,19 +197,23 @@ add_executable(less_slow less_slow.cpp)
 set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 # Conditionally add the assembly file(s)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
-    set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM)
-    target_sources(less_slow PRIVATE less_slow_amd64.S)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
-    set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM)
-    target_sources(less_slow PRIVATE less_slow_aarch64.S)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64")
+  set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM)
+  target_sources(less_slow PRIVATE less_slow_amd64.S)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
+  set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM)
+  target_sources(less_slow PRIVATE less_slow_aarch64.S)
 endif()
 
 # ------------------------------------------------------------------------------
 # Compiler Flags / Options
 # ------------------------------------------------------------------------------
-if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-  # Apple Clang doesn't support -march=native
+# Pick the "tune for the host CPU" flag by compiler ID; not all accept `-march=native`
+if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+  target_compile_options(less_slow PRIVATE -xHost)
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang" OR CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  # Apple's Clang and MSVC can't auto-detect the highest CPU features, so no flag is added
+else()
   target_compile_options(less_slow PRIVATE -march=native)
 endif()
 
@@ -213,8 +225,15 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     -fconcepts-diagnostics-depth=10 # Needed to debug concepts
     -fopenmp # OpenMP support, also requires linking
   )
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  target_compile_options(less_slow PRIVATE
+    /MP # Build with multiple processes; equivalent to `make -j` except it spans across all cores by default
+    /wd4068 # Disable the "unknown pragma" warning, as StringZilla uses many GCC and Clang pragmas
+    /Zc:__cplusplus # Make `__cplusplus` macro actually match used standard
+    /Zc:preprocessor # Use conformant preprocessor
+  )
+
 else()
-  # For other compilers (Clang, MSVC, Intel, etc.)
   target_compile_options(less_slow PRIVATE
     -Wno-deprecated-pragma
   )
@@ -230,28 +249,48 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang"
   )
   set_property(TARGET less_slow PROPERTY SANITIZE_ADDRESS TRUE)
   set_property(TARGET less_slow PROPERTY SANITIZE_UNDEFINED TRUE)
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+  target_compile_options(less_slow PRIVATE
+    $<$<CONFIG:Release>:/O2>
+    $<$<CONFIG:Release>:/Ob2>
+    $<$<CONFIG:Release>:/Oi>
+    $<$<CONFIG:Release>:/Ot>
+    $<$<CONFIG:Release>:/GL>
+  )
+  target_link_options(less_slow PRIVATE
+    $<$<CONFIG:Release>:/LTCG:incremental>
+  )
 endif()
 
 # ------------------------------------------------------------------------------
 # Link Libraries
 # ------------------------------------------------------------------------------
+# Add OpenBLAS include directory manually
+if(openblas_POPULATED)
+  target_include_directories(less_slow PRIVATE ${openblas_SOURCE_DIR})
+
+  # For config.h
+  target_include_directories(less_slow PRIVATE ${openblas_BINARY_DIR})
+endif()
+
 target_link_libraries(less_slow
   PRIVATE
-    Threads::Threads
-    benchmark
-    fmt::fmt
-    range-v3
-    cppcoro
-    unifex
-    stringzilla
-    yyjson
-    ctre
-    # There is no `absl` shortcut:
-    # https://github.com/abseil/abseil-cpp/blob/master/CMake/README.md#available-abseil-cmake-public-targets
-    absl::flat_hash_map
-    nlohmann_json::nlohmann_json
-    Eigen3::Eigen
-    ${BLAS_LIBRARIES}
-    $<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>:TBB::tbb>
-    $<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>:OpenMP::OpenMP_CXX>
+  Threads::Threads
+  benchmark
+  fmt::fmt
+  range-v3
+  cppcoro
+  unifex
+  stringzilla
+  yyjson
+  ctre
+  openblas
+
+  # There is no `absl` shortcut:
+  # https://github.com/abseil/abseil-cpp/blob/master/CMake/README.md#available-abseil-cmake-public-targets
+  absl::flat_hash_map
+  nlohmann_json::nlohmann_json
+  Eigen3::Eigen
+  $<$<STREQUAL:${CMAKE_SYSTEM_NAME},Linux>:TBB::tbb>
+  OpenMP::OpenMP_CXX
 )
\ No newline at end of file
diff --git a/README.md b/README.md
index 5338e96..fed5ab2 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Some of the highlights include:
 - __Is the pointer size really 64 bits__ and how to exploit [pointer-tagging](https://en.wikipedia.org/wiki/Tagged_pointer)?
 - __How many packets is [UDP](https://www.cloudflare.com/learning/ddos/glossary/user-datagram-protocol-udp/) dropping__ and how to serve web requests in [`io_uring`](https://en.wikipedia.org/wiki/Io_uring) from user-space?
 - __Scatter and Gather__ for 50% faster vectorized disjoint memory operations.
-- __How to choose between intrinsics, inline Assembly, and separate Assembly files__ for your performance-critical code?
+- __How to choose between intrinsics, inline Assembly, and separate `.S` files__ for your performance-critical code?
 - __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜
 
 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments.
@@ -38,6 +38,9 @@ Follow the instructions below to run the code in your environment and compare it
 
 ## Running the Benchmarks
 
+The project aims to be compatible with GCC, Clang, and MSVC compilers on Linux, MacOS, and Windows.
+That said, to cover the broadest functionality, using GCC on Linux is recommended:
+
 - If you are on Windows, it's recommended that you set up a Linux environment using [WSL](https://docs.microsoft.com/en-us/windows/wsl/install).
 - If you are on MacOS, consider using the non-native distribution of Clang from [Homebrew](https://brew.sh) or [MacPorts](https://www.macports.org).
 - If you are on Linux, make sure to install CMake and a recent version of GCC or Clang compilers to support C++20 features.
diff --git a/less_slow.cpp b/less_slow.cpp
index 90f78bd..c33ff74 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -149,7 +149,7 @@ BENCHMARK(i32_addition_inline_asm);
  *  - @b less_slow_amd64.S - for the x86_64 architecture, with 64-bit extensions,
  *    originally introduced by AMD.
  */
-#if defined(__x86_64__) || defined(__aarch64__)
+#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__aarch64__) || defined(__i386__) || defined(_M_X64))
 
 extern "C" std::int32_t i32_add_asm_kernel(std::int32_t a, std::int32_t b);
 
@@ -262,8 +262,17 @@ BENCHMARK(i32_addition_randomly_initialized);
 #include <unistd.h> // `_SC_NPROCESSORS_ONLN`
 #elif defined(__APPLE__)
 #include <sys/sysctl.h> // `sysctlbyname` on macOS
+#elif defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <Windows.h>
+#include <WinBase.h>
 #endif
 
+/**
+ *  @brief  Returns the number of physical cores available on the system,
+ *          as opposed to the logical cores, which include hyper-threading.
+ */
 std::size_t physical_cores() {
 #if defined(__linux__)
     int nproc = sysconf(_SC_NPROCESSORS_ONLN);
@@ -273,6 +282,36 @@ std::size_t physical_cores() {
     size_t len = sizeof(nproc);
     sysctlbyname("hw.physicalcpu", &nproc, &len, nullptr, 0);
     return static_cast<std::size_t>(nproc);
+#elif defined(_WIN32)
+    // On Windows, both `std::thread::hardware_concurrency` and `GetSystemInfo`
+    // return at most 64 cores, as limited by a single Windows processor group.
+    // However, starting with newer versions of Windows, applications can seamlessly
+    // span across multiple processor groups.
+    // `GetActiveProcessorCount(ALL_PROCESSOR_GROUPS)` can return all logical cores,
+    // but to count the physical cores we have to dive deeper.
+    DWORD bufferSize = 0;
+    GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufferSize);
+    if (bufferSize == 0) {
+        return 0; // Error occurred
+    }
+
+    std::vector<BYTE> buffer(bufferSize);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &bufferSize)) {
+        return 0; // Error occurred
+    }
+
+    std::size_t coreCount = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    DWORD byteOffset = 0;
+    while (byteOffset < bufferSize) {
+        if (ptr->Relationship == RelationProcessorCore) {
+            ++coreCount;
+        }
+        byteOffset += ptr->Size;
+        ptr = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<BYTE*>(ptr) + ptr->Size);
+    }
+
+    return coreCount;
 #else
     return std::thread::hardware_concurrency();
 #endif
@@ -354,10 +393,20 @@ class aligned_array {
 
   public:
     aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) {
+#if defined(_MSC_VER) // MSVC
+        data_ = static_cast<type_ *>(_aligned_malloc(sizeof(type_) * size_, alignment));
+#else
         data_ = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size_));
+#endif
         if (!data_) throw std::bad_alloc();
     }
-    ~aligned_array() noexcept { std::free(data_); }
+    ~aligned_array() noexcept {
+#if defined(_MSC_VER) // MSVC
+        _aligned_free(data_);
+#else
+        std::free(data_);
+#endif
+    }
     type_ *begin() const noexcept { return data_; }
     type_ *end() const noexcept { return data_ + size_; }
     type_ &operator[](std::size_t index) noexcept { return data_[index]; }
@@ -522,21 +571,22 @@ static void sorting_with_openmp(bm::State &state) {
 
     for (auto _ : state) {
         std::reverse(array.begin(), array.end());
-
+        //! MSVC implements only OpenMP 2.0 by default, which requires a signed loop
+        //! counter, so we use `std::int64_t` over `std::size_t`.
 #pragma omp parallel for
         // Sort each chunk in parallel
-        for (std::size_t i = 0; i < chunks; i++) {
-            std::size_t start = chunk_start_offset(i);
-            std::size_t finish = chunk_start_offset(i + 1);
+        for (std::int64_t i = 0; i < chunks; i++) {
+            std::size_t start = chunk_start_offset(static_cast<std::size_t>(i));
+            std::size_t finish = chunk_start_offset(static_cast<std::size_t>(i) + 1);
             std::sort(array.begin() + start, array.begin() + finish);
         }
 
         // Merge the blocks in a tree-like fashion doubling the size of the merged block each time
         for (std::size_t merge_step = 1; merge_step < chunks; merge_step *= 2) {
 #pragma omp parallel for
-            for (std::size_t i = 0; i < chunks; i += 2 * merge_step) {
-                std::size_t first_chunk_index = i;
-                std::size_t second_chunk_index = i + merge_step;
+            for (std::int64_t i = 0; i < chunks; i += 2 * merge_step) {
+                std::size_t first_chunk_index = static_cast<std::size_t>(i);
+                std::size_t second_chunk_index = first_chunk_index + merge_step;
                 if (second_chunk_index >= chunks) continue; // No merge needed
 
                 // We use `inplace_merge` as opposed to `std::merge` to avoid extra memory allocations,
@@ -1510,6 +1560,7 @@ void configure_x86_denormals(void) {
  *
  *  @see Arm Feature Detection: https://developer.arm.com/documentation/101028/0010/Feature-test-macros
  */
+#if !defined(_MSC_VER)
 #if defined(__AVX512F__)
 extern "C" std::uint32_t tops_f64_avx512ma_asm_kernel(void);
 BENCHMARK_CAPTURE(theoretic_tops, f64_avx512ma, tops_f64_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10);
@@ -1620,6 +1671,7 @@ extern "C" std::uint32_t tops_u8_neon_asm_kernel(void);
 BENCHMARK_CAPTURE(theoretic_tops, u8_neon, tops_u8_neon_asm_kernel)->MinTime(10);
 BENCHMARK_CAPTURE(theoretic_tops, u8_neon, tops_u8_neon_asm_kernel)->MinTime(10)->Threads(physical_cores());
 #endif // defined(__ARM_FEATURE_DOTPROD)
+#endif // !defined(_MSC_VER)
 
 #if defined(__AMX_TILE__)
 /**
@@ -1814,6 +1866,41 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne
 
 #pragma region Alignment of Memory Accesses
 
+/**
+ *  @b Force-inline is the first macro that many High-Performance Computing
+ *  libraries define. It may bloat the binary, but it reduces the number
+ *  of function calls and stack frames, which can be crucial for small kernels.
+ *  The spelling of the attribute, however, differs between compilers!
+ */
+#if defined(_MSC_VER)
+#define LESS_SLOW_ALWAYS_INLINE [[msvc::forceinline]] inline // `__forceinline`
+#elif defined(__GNUC__) // Also taken by Clang, which defines `__GNUC__` by default
+#define LESS_SLOW_ALWAYS_INLINE [[gnu::always_inline]] inline
+#elif defined(__clang__)
+#define LESS_SLOW_ALWAYS_INLINE [[clang::always_inline]] inline
+#else // Unknown compiler: assumes the GNU-style `__attribute__` syntax is accepted
+#define LESS_SLOW_ALWAYS_INLINE inline __attribute__((always_inline))
+#endif
+
+/**
+ *  @brief  Checks if a number is a power of two.
+ *
+ *  An unsigned integer is a power of two if and only if it has exactly one
+ *  bit set. This can be checked by using the bitwise AND operator with the
+ *  number and its predecessor: `x & (x - 1)` is zero only for powers of two
+ *  and for zero itself, which the leading `x &&` check rules out.
+ *
+ *  The same thing can be achieved with the `std::popcount` function, which
+ *  is available in C++20 or compiler intrinsics like `__builtin_popcountll`
+ *  on GCC. Most modern compilers will optimize this to a single instruction.
+ *
+ *  @see "Bit Twiddling Hacks" by Sean Eron Anderson:
+ *       https://graphics.stanford.edu/~seander/bithacks
+ *  @see Book "Hacker's Delight" by Henry S. Warren Jr.:
+ *       https://en.wikipedia.org/wiki/Hacker%27s_Delight
+ */
+LESS_SLOW_ALWAYS_INLINE bool is_power_of_two(std::uint64_t x) noexcept { return x && !(x & (x - 1)); }
+
 /**
  *  When designing high-performance kernels, memory alignment is crucial.
  *  Misaligned memory accesses split data across cache lines, causing extra
@@ -1847,7 +1934,7 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne
 std::string read_file_contents(std::string const &path) {
     std::ifstream file(path);
     std::string content;
-    if (!file.is_open()) return 0;
+    if (!file.is_open()) return "";
     std::getline(file, content);
     file.close();
     return content;
@@ -1962,7 +2049,7 @@ static void memory_access(bm::State &state) {
     constexpr std::size_t typical_l2_size = 1024u * 1024u;
     std::size_t const cache_line_width = fetch_cache_line_width();
     assert( //
-        cache_line_width > 0 && __builtin_popcountll(cache_line_width) == 1 &&
+        cache_line_width > 0 && is_power_of_two(cache_line_width) &&
         "The cache line width must be a power of two greater than 0");
 
     // We are using a fairly small L2-cache-sized buffer to show, that this is
@@ -2200,15 +2287,22 @@ std::size_t parse_size_string(std::string const &str) {
 
 #pragma region Memory Bound Linear Algebra
 #include <cblas.h>
+/**
+ *! OpenBLAS defines a `SIZE` macro for internal use, which conflicts with `fmt`
+ *! and other code trying to use that name for variable names, so we must undefine it.
+ */
+#undef SIZE
 
 template <typename scalar_type_>
 static void cblas_tops(bm::State &state) {
+    openblas_set_num_threads(physical_cores());
+
     // BLAS expects leading dimensions: `lda` = `ldb` = `ldc` = `n` for square inputs.
     std::size_t n = static_cast<std::size_t>(state.range(0));
     int const lda = static_cast<int>(n), ldb = static_cast<int>(n), ldc = static_cast<int>(n);
 
     // Allocate and initialize data
-    aligned_array<scalar_type_> a(n * n), b(n * n), c(n * n, 0);
+    aligned_array<scalar_type_> a(n * n), b(n * n), c(n * n);
     std::iota(a.begin(), a.end(), 0);
     std::iota(b.begin(), b.end(), 0);
 
@@ -2245,6 +2339,8 @@ BENCHMARK(cblas_tops<double>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(b
 
 template <typename scalar_type_>
 static void eigen_tops(bm::State &state) {
+    Eigen::setNbThreads(physical_cores());
+
     // Matrix dimension
     std::size_t n = static_cast<std::size_t>(state.range(0));
 
@@ -2352,20 +2448,11 @@ BENCHMARK(eigen_tops<_Float16>)->RangeMultiplier(2)->Range(8, 65536)->Complexity
 constexpr std::uint64_t pipe_start = 3;
 constexpr std::uint64_t pipe_end = 49;
 
-/**
- *  @brief  Checks if a number is a power of two.
- */
-[[gnu::always_inline]]
-inline bool is_power_of_two(std::uint64_t x) noexcept {
-    return __builtin_popcountll(x) == 1;
-}
-
 /**
  *  @brief  Checks if a number is a power of three using modulo division.
  *          The largest power of three fitting in a 64-bit integer is 3^40.
  */
-[[gnu::always_inline]]
-inline bool is_power_of_three(std::uint64_t x) noexcept {
+LESS_SLOW_ALWAYS_INLINE bool is_power_of_three(std::uint64_t x) noexcept {
     constexpr std::uint64_t max_power_of_three = 12157665459056928801ull;
     return x > 0 && max_power_of_three % x == 0;
 }
@@ -2376,7 +2463,7 @@ inline bool is_power_of_three(std::uint64_t x) noexcept {
  *  @brief  Supplies the prime factors to a template-based callback.
  */
 template <typename callback_type_>
-[[gnu::always_inline]] inline void prime_factors_lambdas( //
+LESS_SLOW_ALWAYS_INLINE void prime_factors_lambdas( //
     std::uint64_t input, callback_type_ &&callback) noexcept {
     // Handle factor 2 separately
     while ((input & 1) == 0) {
@@ -2993,7 +3080,9 @@ BENCHMARK(packaging_stl_tuple)->MinTime(2);
  *
  *  @see Reddit discussion: https://www.reddit.com/r/cpp/comments/ar4ghs/stdpair_disappointing_performance/
  */
+#if !defined(_MSC_VER)
 static_assert(!std::is_trivially_copyable_v<std::pair<int, float>>);
+#endif
 static_assert(!std::is_trivially_copyable_v<std::tuple<int, float>>);
 
 /**
@@ -3077,17 +3166,9 @@ static constexpr std::string_view short_config_text =    //
     " # Tricky comment with a : colon in the middle\n\r" // Accorn newline
     "\tpath :/api/v1";                                   // No trailing newline!
 
-#if defined(_MSC_VER) // MSVC
-#define FORCE_INLINE __forceinline
-#elif defined(__GNUC__) || defined(__clang__) // GCC or Clang
-#define FORCE_INLINE inline __attribute__((always_inline))
-#else // Fallback
-#define FORCE_INLINE inline
-#endif
-
-FORCE_INLINE bool is_newline(char c) noexcept { return c == '\n' || c == '\r'; }
+LESS_SLOW_ALWAYS_INLINE bool is_newline(char c) noexcept { return c == '\n' || c == '\r'; }
 
-FORCE_INLINE std::string_view strip_spaces(std::string_view text) noexcept {
+LESS_SLOW_ALWAYS_INLINE std::string_view strip_spaces(std::string_view text) noexcept {
     // Trim leading whitespace
     while (!text.empty() && std::isspace(text.front())) text.remove_prefix(1);
     // Trim trailing whitespace
@@ -3192,7 +3273,7 @@ void config_parse_sz(std::string_view config_text, std::vector<std::pair<std::st
     auto newlines = sz::char_set("\r\n");
     auto whitespaces = sz::whitespaces_set();
 
-    for (sz::string_view line : sz::string_view(config_text).split(newlines)) {
+    for (sz::string_view line : sz::string_view {config_text}.split(newlines)) {
         line = line.strip(whitespaces);
         if (line.empty() || line.front() == '#') continue; // Skip empty lines or comments
         auto [key, delimiter, value] = line.partition(':');
@@ -3320,8 +3401,12 @@ void parse_regex(bm::State &state, std::string_view config_text) {
     std::size_t pairs = 0, bytes = 0;
     std::vector<std::pair<std::string, std::string>> settings;
 
-    // Use multiline mode so ^ and $ anchor to line breaks.
-    auto regex_options = std::regex_constants::ECMAScript | std::regex_constants::multiline;
+    // Prefer multiline mode so ^ and $ anchor to line breaks...
+    auto regex_options = std::regex_constants::ECMAScript;
+    // ... but MSVC does not define `std::regex_constants::multiline` yet!
+#if !defined(_MSC_VER)
+    regex_options |= std::regex_constants::multiline;
+#endif
     // Construct the regex only once. Compilation is expensive!
     // BTW, there is still no `std::string_view` constructor 🤦‍♂️
     std::regex regex_fsm(regex_for_config.data(), regex_for_config.size(), regex_options);
@@ -4984,12 +5069,46 @@ BENCHMARK(errors_with_status)->ComputeStatistics("max", get_max_value)->MinTime(
 
 using std::string_view_literals::operator""sv;
 
+template <typename logger_type_>
+static void logging(bm::State &state) { // Shared benchmark driver for any `logger_type_` functor
+    struct {
+        int code;
+        std::string_view message;
+    } errors[3] = {
+        {1, "Operation not permitted"sv},
+        {12, "Cannot allocate memory"sv},
+        {113, "No route to host"sv},
+    };
+    char buffer[1024]; // Scratch output buffer, reused by every call
+    logger_type_ logger;
+    std::size_t iteration_index = 0;
+    std::size_t bytes_logged = 0;
+    for (auto _ : state) {
+        bytes_logged += logger(              //
+            buffer, sizeof(buffer),          //
+            std::source_location::current(), //
+            errors[iteration_index % 3].code, errors[iteration_index % 3].message);
+        iteration_index++; // Cycle through the three sample errors
+    }
+    state.SetBytesProcessed(bytes_logged); // Sum of the values returned by `logger`
+}
+
 struct log_printf_t {
     std::size_t operator()(                    //
         char *buffer, std::size_t buffer_size, //
         std::source_location const &location, int code, std::string_view message) const noexcept {
-
+        /**
+         * On MSVC, high_resolution_clock is steady_clock, which cannot have to_time_t applied to it.
+         * std::chrono wraps many system APIs and has some parts that are implementation-defined;
+         * In particular, std::chrono::high_resolution_clock is usually just an alias to
+         * either system_clock or steady_clock. There is debate on whether using it is a good idea at all.
+         * https://en.cppreference.com/w/cpp/chrono/high_resolution_clock
+         */
+#if defined(_MSC_VER)
+        auto now = std::chrono::system_clock::now();
+#else
         auto now = std::chrono::high_resolution_clock::now();
+#endif
         auto time_since_epoch = now.time_since_epoch();
 
         // Extract seconds and milliseconds
@@ -5014,6 +5133,9 @@ struct log_printf_t {
     }
 };
 
+BENCHMARK(logging<log_printf_t>)->Name("log_printf")->MinTime(2);
+
+#if !defined(_MSC_VER)
 #if defined(__cpp_lib_format)
 #include <format> // `std::format_to_n`
 
@@ -5047,8 +5169,9 @@ struct log_format_t {
     }
 };
 
-#endif // defined(__cpp_lib_format)
+BENCHMARK(logging<log_format_t>)->Name("log_format")->MinTime(2);
 
+#endif // defined(__cpp_lib_format)
 #include <fmt/core.h>    // `std::format_to_n`
 #include <fmt/compile.h> // compile-time format strings
 #include <fmt/chrono.h>  // formatting for `std::chrono` types
@@ -5083,36 +5206,10 @@ struct log_fmt_t {
     }
 };
 
-template <typename logger_type_>
-static void logging(bm::State &state) {
-    struct {
-        int code;
-        std::string_view message;
-    } errors[3] = {
-        {1, "Operation not permitted"sv},
-        {12, "Cannot allocate memory"sv},
-        {113, "No route to host"sv},
-    };
-    char buffer[1024];
-    logger_type_ logger;
-    std::size_t iteration_index = 0;
-    std::size_t bytes_logged = 0;
-    for (auto _ : state) {
-        bytes_logged += logger(              //
-            buffer, sizeof(buffer),          //
-            std::source_location::current(), //
-            errors[iteration_index % 3].code, errors[iteration_index % 3].message);
-        iteration_index++;
-    }
-    state.SetBytesProcessed(bytes_logged);
-}
-
-BENCHMARK(logging<log_printf_t>)->Name("log_printf")->MinTime(2);
-#if defined(__cpp_lib_format)
-BENCHMARK(logging<log_format_t>)->Name("log_format")->MinTime(2);
-#endif
 BENCHMARK(logging<log_fmt_t>)->Name("log_fmt")->MinTime(2);
 
+
+
 /**
  *  The results for the logging benchmarks are as follows:
  *  - `log_printf`: @b 321ns
@@ -5129,6 +5226,7 @@ BENCHMARK(logging<log_fmt_t>)->Name("log_fmt")->MinTime(2);
  *       https://youtu.be/ptba_AqFYCM
  */
 
+#endif            // !defined(_MSC_VER)
 #endif            // defined(__cpp_lib_source_location)
 #pragma endregion // Logs