From dcb065bf9fe3f11f6d9a6bc10f7431ae9edcd4db Mon Sep 17 00:00:00 2001 From: Raziel Alphadios <64050682+RazielXYZ@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:35:57 +0200 Subject: [PATCH] Porting to MSVC (#28) * Improve: better support for Windows and MSVC * Improve: Move logging around * Improve: CMakeLists for MSVC some more * Fix: Missing OpenBLAS config and macros * Make: Uniform conditions for MSVC * Docs: Style and links * Docs: Mention MSVC compatibility * Docs: Notes on Chrono --------- Co-authored-by: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> --- .vscode/settings.json | 3 + CMakeLists.txt | 109 +++++++++++++------- README.md | 5 +- less_slow.cpp | 230 ++++++++++++++++++++++++++++++------------ 4 files changed, 245 insertions(+), 102 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index f04713c..95fa57a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -17,6 +17,7 @@ "colsb", "consteval", "coro", + "cplusplus", "cppcoro", "CTRE", "CUDA", @@ -24,6 +25,7 @@ "DOTPROD", "Dusíková", "Eigen", + "Eron", "excerise", "fconcepts", "Fedor", @@ -53,6 +55,7 @@ "Niels", "nlohmann", "NVCC", + "openblas", "openmp", "Ormrod", "Peta", diff --git a/CMakeLists.txt b/CMakeLists.txt index f5c8d8b..96c4a21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # Let's used CMake 3.16+ for native sanitizers support -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # ------------------------------------------------------------------------------ # Project Setup @@ -32,17 +32,25 @@ endif() # ------------------------------------------------------------------------------ find_package(Threads REQUIRED) find_package(OpenMP REQUIRED) -find_package(BLAS REQUIRED) -if (BLAS_FOUND) - message(STATUS "BLAS found: ${BLAS_LIBRARIES}") -else () - message(FATAL_ERROR "BLAS not found") -endif () - set(FETCHCONTENT_QUIET OFF) include(FetchContent) +# Fetch and build OpenBLAS 
+FetchContent_Declare( + OpenBLAS + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG v0.3.29 +) + +# Set OpenBLAS build options +set(NOFORTRAN ON CACHE BOOL "Disable Fortran" FORCE) +set(BUILD_WITHOUT_LAPACK OFF CACHE BOOL "Build without LAPACK" FORCE) +set(USE_THREAD ON CACHE BOOL "Use threading" FORCE) + +# Make OpenBLAS available +FetchContent_MakeAvailable(OpenBLAS) + # GTest (required by Google Benchmark) FetchContent_Declare( GoogleTest @@ -105,7 +113,7 @@ endif() FetchContent_Declare( VictorZverovichFMT GIT_REPOSITORY https://github.com/fmtlib/fmt.git - GIT_TAG 11.1.0 + GIT_TAG 11.1.2 ) FetchContent_MakeAvailable(VictorZverovichFMT) @@ -189,19 +197,23 @@ add_executable(less_slow less_slow.cpp) set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON) # Conditionally add the assembly file(s) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") - set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM) - target_sources(less_slow PRIVATE less_slow_amd64.S) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM) - target_sources(less_slow PRIVATE less_slow_aarch64.S) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64") + set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM) + target_sources(less_slow PRIVATE less_slow_amd64.S) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64") + set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM) + target_sources(less_slow PRIVATE less_slow_aarch64.S) endif() # ------------------------------------------------------------------------------ # Compiler Flags / Options # ------------------------------------------------------------------------------ -if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") - # Apple Clang doesn't support -march=native +# Check for compiler support of `-march=native` +if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + target_compile_options(less_slow PRIVATE -xHost) 
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang" OR CMAKE_CXX_COMPILER_ID MATCHES "MSVC") +# Apple's Clang and MSVC can't auto-detect the highest CPU features +else() target_compile_options(less_slow PRIVATE -march=native) endif() @@ -213,8 +225,15 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") -fconcepts-diagnostics-depth=10 # Needed to debug concepts -fopenmp # OpenMP support, also requires linking ) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + target_compile_options(less_slow PRIVATE + /MP # Build with multiple processes; equivalent to `make -j` except it spans across all cores by default + /wd4068 # Disable the "unknown pragma" warning, as StringZilla uses many GCC and Clang pragmas + /Zc:__cplusplus # Make `__cplusplus` macro actually match used standard + /Zc:preprocessor # Use conformant preprocessor + ) + else() - # For other compilers (Clang, MSVC, Intel, etc.) target_compile_options(less_slow PRIVATE -Wno-deprecated-pragma ) @@ -230,28 +249,48 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) set_property(TARGET less_slow PROPERTY SANITIZE_ADDRESS TRUE) set_property(TARGET less_slow PROPERTY SANITIZE_UNDEFINED TRUE) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + target_compile_options(less_slow PRIVATE + $<$:/O2> + $<$:/Ob2> + $<$:/Oi> + $<$:/Ot> + $<$:/GL> + ) + target_link_options(less_slow PRIVATE + $<$:/LTCG:incremental> + ) endif() # ------------------------------------------------------------------------------ # Link Libraries # ------------------------------------------------------------------------------ +# Add OpenBLAS include directory manually +if(openblas_POPULATED) + target_include_directories(less_slow PRIVATE ${openblas_SOURCE_DIR}) + + # For config.h + target_include_directories(less_slow PRIVATE ${openblas_BINARY_DIR}) +endif() + target_link_libraries(less_slow PRIVATE - Threads::Threads - benchmark - fmt::fmt - range-v3 - cppcoro - unifex - stringzilla - yyjson - ctre - # There is no `absl` 
shortcut: - # https://github.com/abseil/abseil-cpp/blob/master/CMake/README.md#available-abseil-cmake-public-targets - absl::flat_hash_map - nlohmann_json::nlohmann_json - Eigen3::Eigen - ${BLAS_LIBRARIES} - $<$:TBB::tbb> - $<$:OpenMP::OpenMP_CXX> + Threads::Threads + benchmark + fmt::fmt + range-v3 + cppcoro + unifex + stringzilla + yyjson + ctre + openblas + + # There is no `absl` shortcut: + # https://github.com/abseil/abseil-cpp/blob/master/CMake/README.md#available-abseil-cmake-public-targets + absl::flat_hash_map + nlohmann_json::nlohmann_json + Eigen3::Eigen + $<$:TBB::tbb> + OpenMP::OpenMP_CXX ) \ No newline at end of file diff --git a/README.md b/README.md index 5338e96..fed5ab2 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Some of the highlights include: - __Is the pointer size really 64 bits__ and how to exploit [pointer-tagging](https://en.wikipedia.org/wiki/Tagged_pointer)? - __How many packets is [UDP](https://www.cloudflare.com/learning/ddos/glossary/user-datagram-protocol-udp/) dropping__ and how to serve web requests in [`io_uring`](https://en.wikipedia.org/wiki/Io_uring) from user-space? - __Scatter and Gather__ for 50% faster vectorized disjoint memory operations. -- __How to choose between intrinsics, inline Assembly, and separate Assembly files__ for your performance-critical code? +- __How to choose between intrinsics, inline Assembly, and separate `.S` files__ for your performance-critical code? - __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments. @@ -38,6 +38,9 @@ Follow the instructions below to run the code in your environment and compare it ## Running the Benchmarks +The project aims to be compatible with GCC, Clang, and MSVC compilers on Linux, MacOS, and Windows. 
+That said, to cover the broadest functionality, using GCC on Linux is recommended: + - If you are on Windows, it's recommended that you set up a Linux environment using [WSL](https://docs.microsoft.com/en-us/windows/wsl/install). - If you are on MacOS, consider using the non-native distribution of Clang from [Homebrew](https://brew.sh) or [MacPorts](https://www.macports.org). - If you are on Linux, make sure to install CMake and a recent version of GCC or Clang compilers to support C++20 features. diff --git a/less_slow.cpp b/less_slow.cpp index 90f78bd..c33ff74 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -149,7 +149,7 @@ BENCHMARK(i32_addition_inline_asm); * - @b less_slow_amd64.S - for the x86_64 architecture, with 64-bit extensions, * originally introduced by AMD. */ -#if defined(__x86_64__) || defined(__aarch64__) +#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__aarch64__) || defined(__i386__) || defined(_M_X64)) extern "C" std::int32_t i32_add_asm_kernel(std::int32_t a, std::int32_t b); @@ -262,8 +262,17 @@ BENCHMARK(i32_addition_randomly_initialized); #include // `_SC_NPROCESSORS_ONLN` #elif defined(__APPLE__) #include // `sysctlbyname` on macOS +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include #endif +/** + * @brief Returns the number of physical cores available on the system, + * as opposed to the logical cores, which include hyper-threading. + */ std::size_t physical_cores() { #if defined(__linux__) int nproc = sysconf(_SC_NPROCESSORS_ONLN); @@ -273,6 +282,36 @@ std::size_t physical_cores() { size_t len = sizeof(nproc); sysctlbyname("hw.physicalcpu", &nproc, &len, nullptr, 0); return static_cast(nproc); +#elif defined(_WIN32) + // On Windows, both `std::thread::hardware_concurrency` and `GetSystemInfo` + // return at most 64 cores, as limited by a single windows processor group. 
+ // However, starting with newer versions of Windows, applications can seamlessly + // span across multiple processor groups. + // GetActiveProcessorCount(ALL_PROCESSOR_GROUPS) can return all logical cores; + // However, in order to get physical cores, we have to dive deeper. + DWORD bufferSize = 0; + GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufferSize); + if (bufferSize == 0) { + return 0; // Error occurred + } + + std::vector buffer(bufferSize); + if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast(buffer.data()), &bufferSize)) { + return 0; // Error occurred + } + + std::size_t coreCount = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = reinterpret_cast(buffer.data()); + DWORD byteOffset = 0; + while (byteOffset < bufferSize) { + if (ptr->Relationship == RelationProcessorCore) { + ++coreCount; + } + byteOffset += ptr->Size; + ptr = reinterpret_cast(reinterpret_cast(ptr) + ptr->Size); + } + + return coreCount; #else return std::thread::hardware_concurrency(); #endif @@ -354,10 +393,20 @@ class aligned_array { public: aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) { +#if defined(_MSC_VER) // MSVC + data_ = static_cast(_aligned_malloc(sizeof(type_) * size_, alignment)); +#else data_ = static_cast(std::aligned_alloc(alignment, sizeof(type_) * size_)); +#endif if (!data_) throw std::bad_alloc(); } - ~aligned_array() noexcept { std::free(data_); } + ~aligned_array() noexcept { +#if defined(_MSC_VER) // MSVC + _aligned_free(data_); +#else + std::free(data_); +#endif + } type_ *begin() const noexcept { return data_; } type_ *end() const noexcept { return data_ + size_; } type_ &operator[](std::size_t index) noexcept { return data_[index]; } @@ -522,21 +571,22 @@ static void sorting_with_openmp(bm::State &state) { for (auto _ : state) { std::reverse(array.begin(), array.end()); - + //! Remarkably, on Windows, OpenMP can't handle unsigned integers, + //! 
so we use `std::int64_t` over `std::size_t`. #pragma omp parallel for // Sort each chunk in parallel - for (std::size_t i = 0; i < chunks; i++) { - std::size_t start = chunk_start_offset(i); - std::size_t finish = chunk_start_offset(i + 1); + for (std::int64_t i = 0; i < chunks; i++) { + std::size_t start = chunk_start_offset(static_cast(i)); + std::size_t finish = chunk_start_offset(static_cast(i) + 1); std::sort(array.begin() + start, array.begin() + finish); } // Merge the blocks in a tree-like fashion doubling the size of the merged block each time for (std::size_t merge_step = 1; merge_step < chunks; merge_step *= 2) { #pragma omp parallel for - for (std::size_t i = 0; i < chunks; i += 2 * merge_step) { - std::size_t first_chunk_index = i; - std::size_t second_chunk_index = i + merge_step; + for (std::int64_t i = 0; i < chunks; i += 2 * merge_step) { + std::size_t first_chunk_index = static_cast(i); + std::size_t second_chunk_index = first_chunk_index + merge_step; if (second_chunk_index >= chunks) continue; // No merge needed // We use `inplace_merge` as opposed to `std::merge` to avoid extra memory allocations, @@ -1510,6 +1560,7 @@ void configure_x86_denormals(void) { * * @see Arm Feature Detection: https://developer.arm.com/documentation/101028/0010/Feature-test-macros */ +#if !defined(_MSC_VER) #if defined(__AVX512F__) extern "C" std::uint32_t tops_f64_avx512ma_asm_kernel(void); BENCHMARK_CAPTURE(theoretic_tops, f64_avx512ma, tops_f64_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10); @@ -1620,6 +1671,7 @@ extern "C" std::uint32_t tops_u8_neon_asm_kernel(void); BENCHMARK_CAPTURE(theoretic_tops, u8_neon, tops_u8_neon_asm_kernel)->MinTime(10); BENCHMARK_CAPTURE(theoretic_tops, u8_neon, tops_u8_neon_asm_kernel)->MinTime(10)->Threads(physical_cores()); #endif // defined(__ARM_FEATURE_DOTPROD) +#endif // !defined(_MSC_VER) #if defined(__AMX_TILE__) /** @@ -1814,6 +1866,41 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, 
tops_i7_amx_avx512fma_asm_kerne #pragma region Alignment of Memory Accesses +/** + * @b Force-inline is the first macro that many High-Performance Computing + * libraries define. It will bloat the binary, but will reduce the number + * of function calls and stack frames, which can be crucial for small kernels. + * The name of the attribute, however, differs between compilers! + */ +#if defined(_MSC_VER) +#define LESS_SLOW_ALWAYS_INLINE [[msvc::forceinline]] inline // `__forceinline` +#elif defined(__GNUC__) +#define LESS_SLOW_ALWAYS_INLINE [[gnu::always_inline]] inline +#elif defined(__clang__) +#define LESS_SLOW_ALWAYS_INLINE [[clang::always_inline]] inline +#else +#define LESS_SLOW_ALWAYS_INLINE inline __attribute__((always_inline)) +#endif + +/** + * @brief Checks if a number is a power of two. + * + * An unsigned integer is a power of two if and only if it has exactly one + * bit set. This can be checked by using the bitwise AND operator with the + * number and its predecessor: `x & (x - 1)` will be zero only for powers + * of two. + * + * The same thing can be achieved with the `std::popcount` function, which + * is available in C++20 or compiler intrinsics like `__builtin_popcountll` + * on GCC. Most modern compilers will optimize this to a single instruction. + * + * @see "Bit Twiddling Hacks" by Sean Eron Anderson: + * https://graphics.stanford.edu/~seander/bithacks + * @see Book "Hacker's Delight" by Henry S. Warren Jr.: + * https://en.wikipedia.org/wiki/Hacker%27s_Delight + */ +LESS_SLOW_ALWAYS_INLINE bool is_power_of_two(std::uint64_t x) noexcept { return x && !(x & (x - 1)); } + /** * When designing high-performance kernels, memory alignment is crucial. 
* Misaligned memory accesses split data across cache lines, causing extra @@ -1847,7 +1934,7 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne std::string read_file_contents(std::string const &path) { std::ifstream file(path); std::string content; - if (!file.is_open()) return 0; + if (!file.is_open()) return ""; std::getline(file, content); file.close(); return content; @@ -1962,7 +2049,7 @@ static void memory_access(bm::State &state) { constexpr std::size_t typical_l2_size = 1024u * 1024u; std::size_t const cache_line_width = fetch_cache_line_width(); assert( // - cache_line_width > 0 && __builtin_popcountll(cache_line_width) == 1 && + cache_line_width > 0 && is_power_of_two(cache_line_width) && "The cache line width must be a power of two greater than 0"); // We are using a fairly small L2-cache-sized buffer to show, that this is @@ -2200,15 +2287,22 @@ std::size_t parse_size_string(std::string const &str) { #pragma region Memory Bound Linear Algebra #include +/** + *! OpenBLAS defines a `SIZE` macro for internal use, which conflicts with `fmt` + *! and other code trying to use that name for variable names, so we must undefine it. + */ +#undef SIZE template static void cblas_tops(bm::State &state) { + openblas_set_num_threads(physical_cores()); + // BLAS expects leading dimensions: `lda` = `ldb` = `ldc` = `n` for square inputs. 
std::size_t n = static_cast(state.range(0)); int const lda = static_cast(n), ldb = static_cast(n), ldc = static_cast(n); // Allocate and initialize data - aligned_array a(n * n), b(n * n), c(n * n, 0); + aligned_array a(n * n), b(n * n), c(n * n); std::iota(a.begin(), a.end(), 0); std::iota(b.begin(), b.end(), 0); @@ -2245,6 +2339,8 @@ BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 65536)->Complexity(b template static void eigen_tops(bm::State &state) { + Eigen::setNbThreads(physical_cores()); + // Matrix dimension std::size_t n = static_cast(state.range(0)); @@ -2352,20 +2448,11 @@ BENCHMARK(eigen_tops<_Float16>)->RangeMultiplier(2)->Range(8, 65536)->Complexity constexpr std::uint64_t pipe_start = 3; constexpr std::uint64_t pipe_end = 49; -/** - * @brief Checks if a number is a power of two. - */ -[[gnu::always_inline]] -inline bool is_power_of_two(std::uint64_t x) noexcept { - return __builtin_popcountll(x) == 1; -} - /** * @brief Checks if a number is a power of three using modulo division. * The largest power of three fitting in a 64-bit integer is 3^40. */ -[[gnu::always_inline]] -inline bool is_power_of_three(std::uint64_t x) noexcept { +LESS_SLOW_ALWAYS_INLINE bool is_power_of_three(std::uint64_t x) noexcept { constexpr std::uint64_t max_power_of_three = 12157665459056928801ull; return x > 0 && max_power_of_three % x == 0; } @@ -2376,7 +2463,7 @@ inline bool is_power_of_three(std::uint64_t x) noexcept { * @brief Supplies the prime factors to a template-based callback. 
*/ template -[[gnu::always_inline]] inline void prime_factors_lambdas( // +LESS_SLOW_ALWAYS_INLINE void prime_factors_lambdas( // std::uint64_t input, callback_type_ &&callback) noexcept { // Handle factor 2 separately while ((input & 1) == 0) { @@ -2993,7 +3080,9 @@ BENCHMARK(packaging_stl_tuple)->MinTime(2); * * @see Reddit discussion: https://www.reddit.com/r/cpp/comments/ar4ghs/stdpair_disappointing_performance/ */ +#if !defined(_MSC_VER) static_assert(!std::is_trivially_copyable_v>); +#endif static_assert(!std::is_trivially_copyable_v>); /** @@ -3077,17 +3166,9 @@ static constexpr std::string_view short_config_text = // " # Tricky comment with a : colon in the middle\n\r" // Accorn newline "\tpath :/api/v1"; // No trailing newline! -#if defined(_MSC_VER) // MSVC -#define FORCE_INLINE __forceinline -#elif defined(__GNUC__) || defined(__clang__) // GCC or Clang -#define FORCE_INLINE inline __attribute__((always_inline)) -#else // Fallback -#define FORCE_INLINE inline -#endif - -FORCE_INLINE bool is_newline(char c) noexcept { return c == '\n' || c == '\r'; } +LESS_SLOW_ALWAYS_INLINE bool is_newline(char c) noexcept { return c == '\n' || c == '\r'; } -FORCE_INLINE std::string_view strip_spaces(std::string_view text) noexcept { +LESS_SLOW_ALWAYS_INLINE std::string_view strip_spaces(std::string_view text) noexcept { // Trim leading whitespace while (!text.empty() && std::isspace(text.front())) text.remove_prefix(1); // Trim trailing whitespace @@ -3192,7 +3273,7 @@ void config_parse_sz(std::string_view config_text, std::vector> settings; - // Use multiline mode so ^ and $ anchor to line breaks. - auto regex_options = std::regex_constants::ECMAScript | std::regex_constants::multiline; + // Prefer multiline mode so ^ and $ anchor to line breaks... + auto regex_options = std::regex_constants::ECMAScript; + // ... but MSVC does not define `std::regex_constants::multiline` yet! 
+#if !defined(_MSC_VER) + regex_options |= std::regex_constants::multiline; +#endif // Construct the regex only once. Compilation is expensive! // BTW, there is still no `std::string_view` constructor 🤦‍♂️ std::regex regex_fsm(regex_for_config.data(), regex_for_config.size(), regex_options); @@ -4984,12 +5069,46 @@ BENCHMARK(errors_with_status)->ComputeStatistics("max", get_max_value)->MinTime( using std::string_view_literals::operator""sv; +template +static void logging(bm::State &state) { + struct { + int code; + std::string_view message; + } errors[3] = { + {1, "Operation not permitted"sv}, + {12, "Cannot allocate memory"sv}, + {113, "No route to host"sv}, + }; + char buffer[1024]; + logger_type_ logger; + std::size_t iteration_index = 0; + std::size_t bytes_logged = 0; + for (auto _ : state) { + bytes_logged += logger( // + buffer, sizeof(buffer), // + std::source_location::current(), // + errors[iteration_index % 3].code, errors[iteration_index % 3].message); + iteration_index++; + } + state.SetBytesProcessed(bytes_logged); +} + struct log_printf_t { std::size_t operator()( // char *buffer, std::size_t buffer_size, // std::source_location const &location, int code, std::string_view message) const noexcept { - + /** + * On MSVC, high_resolution_clock is steady_clock, which cannot have to_time_t applied to it. + * std::chrono wraps many system APIs and has some parts that are implementation-defined; + * In particular, std::chrono::high_resolution_clock is usually just an alias to + * either system_clock or steady_clock. There is debate on whether using it is a good idea at all. 
+ * https://en.cppreference.com/w/cpp/chrono/high_resolution_clock + */ +#if defined(_MSC_VER) + auto now = std::chrono::system_clock::now(); +#else auto now = std::chrono::high_resolution_clock::now(); +#endif auto time_since_epoch = now.time_since_epoch(); // Extract seconds and milliseconds @@ -5014,6 +5133,9 @@ struct log_printf_t { } }; +BENCHMARK(logging)->Name("log_printf")->MinTime(2); + +#if !defined(_MSC_VER) #if defined(__cpp_lib_format) #include // `std::format_to_n` @@ -5047,8 +5169,9 @@ struct log_format_t { } }; -#endif // defined(__cpp_lib_format) +BENCHMARK(logging)->Name("log_format")->MinTime(2); +#endif // defined(__cpp_lib_format) #include // `std::format_to_n` #include // compile-time format strings #include // formatting for `std::chrono` types @@ -5083,36 +5206,10 @@ struct log_fmt_t { } }; -template -static void logging(bm::State &state) { - struct { - int code; - std::string_view message; - } errors[3] = { - {1, "Operation not permitted"sv}, - {12, "Cannot allocate memory"sv}, - {113, "No route to host"sv}, - }; - char buffer[1024]; - logger_type_ logger; - std::size_t iteration_index = 0; - std::size_t bytes_logged = 0; - for (auto _ : state) { - bytes_logged += logger( // - buffer, sizeof(buffer), // - std::source_location::current(), // - errors[iteration_index % 3].code, errors[iteration_index % 3].message); - iteration_index++; - } - state.SetBytesProcessed(bytes_logged); -} - -BENCHMARK(logging)->Name("log_printf")->MinTime(2); -#if defined(__cpp_lib_format) -BENCHMARK(logging)->Name("log_format")->MinTime(2); -#endif BENCHMARK(logging)->Name("log_fmt")->MinTime(2); + + /** * The results for the logging benchmarks are as follows: * - `log_printf`: @b 321ns @@ -5129,6 +5226,7 @@ BENCHMARK(logging)->Name("log_fmt")->MinTime(2); * https://youtu.be/ptba_AqFYCM */ +#endif // !defined(_MSC_VER) #endif // defined(__cpp_lib_source_location) #pragma endregion // Logs