diff --git a/.clang-format b/.clang-format index 42912faec..ad9f4bb18 100644 --- a/.clang-format +++ b/.clang-format @@ -2,19 +2,35 @@ Language: Cpp ColumnLimit: 110 IndentPPDirectives: BeforeHash -AlwaysBreakTemplateDeclarations : true -PackConstructorInitializers : CurrentLine +AlwaysBreakTemplateDeclarations: Yes +BreakAfterAttributes: Always +PackConstructorInitializers: Never AccessModifierOffset: -1 -IndentCaseLabels : true +IndentCaseLabels: true AllowShortLambdasOnASingleLine: Empty RequiresExpressionIndentation: OuterScope -BinPackArguments : false -BinPackParameters : false -LambdaBodyIndentation : Signature -PenaltyReturnTypeOnItsOwnLine : 1 +LambdaBodyIndentation: Signature +PenaltyReturnTypeOnItsOwnLine: 1 + +# TODO: update for clang-format 23 +BinPackArguments: false +BinPackParameters: OnePerLine + +BreakBeforeConceptDeclarations: Always + +Macros: + - LF_TRY=if + - LF_CATCH_ALL=else + - LF_CATCH(x)=else + - LF_HOF(x)={x;} + - LF_HOF(x,y)={x,y;} + - LF_HOF(x,y,z)={x,y,z;} + - LF_HOF(x,y,z,w)={x,y,z,w;} + - LF_HOF(a,b,c,d,e)={a;} + - LF_HOF(a,b,c,d,e,f)={a,b,c,d,e,f;} SpaceBeforeParens: Custom SpaceBeforeParensOptions: - AfterRequiresInClause: true - AfterRequiresInExpression : true + AfterRequiresInClause: true + AfterRequiresInExpression: true ... 
diff --git a/.clang-tidy b/.clang-tidy index 5d813fd55..d05681165 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -10,148 +10,150 @@ Checks: "*,\ -llvm-header-guard,\ -llvm-include-order,\ -llvmlibc-*,\ - -modernize-use-nodiscard,\ + -readability-identifier-length,\ -misc-non-private-member-variables-in-classes" -WarningsAsErrors: '' +WarningsAsErrors: "" CheckOptions: - key: readability-function-cognitive-complexity.IgnoreMacros - value: 'true' - - key: 'bugprone-argument-comment.StrictMode' - value: 'true' -# Prefer using enum classes with 2 values for parameters instead of bools - - key: 'bugprone-argument-comment.CommentBoolLiterals' - value: 'true' - - key: 'bugprone-misplaced-widening-cast.CheckImplicitCasts' - value: 'true' - - key: 'bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression' - value: 'true' - - key: 'bugprone-suspicious-string-compare.WarnOnLogicalNotComparison' - value: 'true' - - key: 'readability-simplify-boolean-expr.ChainedConditionalReturn' - value: 'true' - - key: 'readability-simplify-boolean-expr.ChainedConditionalAssignment' - value: 'true' - - key: 'readability-uniqueptr-delete-release.PreferResetCall' - value: 'true' - - key: 'cppcoreguidelines-init-variables.MathHeader' - value: '' - - key: 'cppcoreguidelines-narrowing-conversions.PedanticMode' - value: 'true' - - key: 'readability-else-after-return.WarnOnUnfixable' - value: 'true' - - key: 'readability-else-after-return.WarnOnConditionVariables' - value: 'true' - - key: 'readability-inconsistent-declaration-parameter-name.Strict' - value: 'true' - - key: 'readability-qualified-auto.AddConstToQualified' - value: 'true' - - key: 'readability-redundant-access-specifiers.CheckFirstDeclaration' - value: 'true' -# These seem to be the most common identifier styles - - key: 'readability-identifier-naming.AbstractClassCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassConstantCase' - value: 
'lower_case' - - key: 'readability-identifier-naming.ClassMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantPointerParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprFunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.EnumCase' - value: 'lower_case' - - key: 'readability-identifier-naming.EnumConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.FunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalConstantPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalFunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.InlineNamespaceCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalConstantPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.MacroDefinitionCase' - value: 'UPPER_CASE' - - key: 'readability-identifier-naming.MemberCase' - value: 'lower_case' - - key: 
'readability-identifier-naming.MethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.NamespaceCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ParameterPackCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PointerParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PrivateMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PrivateMemberPrefix' - value: 'm_' - - key: 'readability-identifier-naming.PrivateMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ProtectedMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ProtectedMemberPrefix' - value: 'm_' - - key: 'readability-identifier-naming.ProtectedMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PublicMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PublicMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ScopedEnumConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StaticConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StaticVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StructCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.TemplateTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.TypeAliasCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TypedefCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TypeTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.UnionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ValueTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.VariableCase' - value: 
'lower_case' - - key: 'readability-identifier-naming.VirtualMethodCase' - value: 'lower_case' + value: "true" + - key: "cppcoreguidelines-avoid-do-while.IgnoreMacros" + value: "true" + - key: "bugprone-argument-comment.StrictMode" + value: "true" + # Prefer using enum classes with 2 values for parameters instead of bools + - key: "bugprone-argument-comment.CommentBoolLiterals" + value: "true" + - key: "bugprone-misplaced-widening-cast.CheckImplicitCasts" + value: "true" + - key: "bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression" + value: "true" + - key: "bugprone-suspicious-string-compare.WarnOnLogicalNotComparison" + value: "true" + - key: "readability-simplify-boolean-expr.ChainedConditionalReturn" + value: "true" + - key: "readability-simplify-boolean-expr.ChainedConditionalAssignment" + value: "true" + - key: "readability-uniqueptr-delete-release.PreferResetCall" + value: "true" + - key: "cppcoreguidelines-init-variables.MathHeader" + value: "" + - key: "cppcoreguidelines-narrowing-conversions.PedanticMode" + value: "true" + - key: "readability-else-after-return.WarnOnUnfixable" + value: "true" + - key: "readability-else-after-return.WarnOnConditionVariables" + value: "true" + - key: "readability-inconsistent-declaration-parameter-name.Strict" + value: "true" + - key: "readability-qualified-auto.AddConstToQualified" + value: "true" + - key: "readability-redundant-access-specifiers.CheckFirstDeclaration" + value: "true" + # These seem to be the most common identifier styles + - key: "readability-identifier-naming.AbstractClassCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantCase" + value: "lower_case" + - key: 
"readability-identifier-naming.ConstantMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantPointerParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprFunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.EnumCase" + value: "lower_case" + - key: "readability-identifier-naming.EnumConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.FunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalConstantPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalFunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.InlineNamespaceCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalConstantPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.MacroDefinitionCase" + value: "UPPER_CASE" + - key: "readability-identifier-naming.MemberCase" + value: "lower_case" + - key: "readability-identifier-naming.MethodCase" + value: "lower_case" + - key: "readability-identifier-naming.NamespaceCase" + value: "lower_case" + - key: "readability-identifier-naming.ParameterCase" + value: "lower_case" + - key: 
"readability-identifier-naming.ParameterPackCase" + value: "lower_case" + - key: "readability-identifier-naming.PointerParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.PrivateMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.PrivateMemberPrefix" + value: "m_" + - key: "readability-identifier-naming.PrivateMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ProtectedMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ProtectedMemberPrefix" + value: "m_" + - key: "readability-identifier-naming.ProtectedMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.PublicMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.PublicMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ScopedEnumConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.StaticConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.StaticVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.StructCase" + value: "lower_case" + - key: "readability-identifier-naming.TemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.TemplateTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.TypeAliasCase" + value: "lower_case" + - key: "readability-identifier-naming.TypedefCase" + value: "lower_case" + - key: "readability-identifier-naming.TypeTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.UnionCase" + value: "lower_case" + - key: "readability-identifier-naming.ValueTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.VariableCase" + value: "lower_case" + - key: "readability-identifier-naming.VirtualMethodCase" + value: "lower_case" ... 
diff --git a/.clangd b/.clangd index ef86cb6b0..fd3d2a8f4 100644 --- a/.clangd +++ b/.clangd @@ -1,2 +1,2 @@ CompileFlags: - CompilationDatabase: build/dev \ No newline at end of file + CompilationDatabase: build/dev diff --git a/.codespellrc b/.codespellrc index c3920f351..86730201c 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,7 +1,7 @@ [codespell] -builtin = clear,rare,en-GB_to_en-US,names,informal,code +builtin = clear,rare,names,informal,code check-filenames = check-hidden = ignore-words-list = deque,warmup,stdio,copyable,combinate -skip = */.git,*/build,*/prefix,*/vcpkg,*/_build,*/bench +skip = */.git,*/build,*/.legacy quiet-level = 2 diff --git a/.gemini/settings.json b/.gemini/settings.json new file mode 100644 index 000000000..b8dce87f3 --- /dev/null +++ b/.gemini/settings.json @@ -0,0 +1,8 @@ +{ + "context": { + "fileName": "AGENTS.md" + }, + "ui": { + "hideBanner": true + } +} \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..b45ddfb1d --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,31 @@ +name: Documentation +on: + push: + branches: + - main + +permissions: + contents: read + pages: write + id-token: write + +jobs: + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + runs-on: ubuntu-latest + steps: + - uses: actions/configure-pages@v6 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 + with: + python-version: 3.x + - run: pip install zensical + - run: zensical build --clean + - uses: actions/upload-pages-artifact@v5 + with: + path: build/site + - uses: actions/deploy-pages@v5 + id: deployment diff --git a/.github/workflows/linear.yml b/.github/workflows/linear.yml new file mode 100644 index 000000000..8d1ba0975 --- /dev/null +++ b/.github/workflows/linear.yml @@ -0,0 +1,33 @@ +name: Linear History + +on: + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + check-linear-history: 
+ runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ github.event.pull_request.head.sha || github.sha }} + fetch-depth: 0 + + - name: Check for merge commits + run: | + BASE_REF=${{ github.base_ref || 'modules' }} + echo "Comparing against base: $BASE_REF" + git fetch origin $BASE_REF:$BASE_REF + MERGE_COMMITS=$(git rev-list --merges $BASE_REF..HEAD) + if [ -n "$MERGE_COMMITS" ]; then + echo "Error: Merge commits detected. libfork requires a linear history." + echo "Please rebase your branch onto $BASE_REF to remove merge commits." + echo "" + echo "Merge commits found:" + git log --merges --oneline $BASE_REF..HEAD + exit 1 + else + echo "No merge commits detected. Linear history check passed." + fi diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..82cfdc9a7 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,28 @@ +name: Lint + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + lint: + runs-on: macos-latest + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install clang-format codespell + + - name: Run codespell + run: codespell + + - name: Run clang-format + run: | + find src include test benchmark/src -name "*.cpp" -o -name "*.hpp" -o -name "*.cxx" | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml new file mode 100644 index 000000000..4904ae1e5 --- /dev/null +++ b/.github/workflows/linux.yml @@ -0,0 +1,35 @@ +name: Linux + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + preset: [ci-hardened, ci-release, ci-no-except-rtti] + + steps: + - uses: actions/checkout@v6 + + - name: Set up 
Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install cmake ninja gcc binutils catch2 google-benchmark + + - name: Configure + run: cmake --preset ${{ matrix.preset }} + -DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake + + - name: Build + run: cmake --build --preset ${{ matrix.preset }} + + - name: Test + run: ctest --preset ${{ matrix.preset }} diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 000000000..17eb53779 --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,35 @@ +name: MacOS + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + preset: [ci-hardened, ci-release, ci-no-except-rtti, ci-sanitize] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install cmake ninja llvm catch2 google-benchmark + + - name: Configure + run: cmake --preset ${{ matrix.preset }} + -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake + + - name: Build + run: cmake --build --preset ${{ matrix.preset }} + + - name: Test + run: ctest --preset ${{ matrix.preset }} diff --git a/.gitignore b/.gitignore index 47a6b5d9d..2e43ca3a3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,27 +4,7 @@ .cache/ build/ -_build/ -cmake-build-*/ -prefix/ -old/ -Testing/ - -docs/index.rst - -bench/data/ -*.svg - -mem.log -memory.csv -memory.*.csv -test.pdf -gmon.out -out.png -output.png **/.DS_Store compile_commands.json -CMakeLists.txt.user -CMakeUserPresets.json diff --git a/.legacy/docs/_static/android-chrome-192x192.png b/.legacy/docs/_static/android-chrome-192x192.png deleted file mode 100644 index 20d2ba4eb..000000000 Binary files a/.legacy/docs/_static/android-chrome-192x192.png and /dev/null differ diff --git 
a/.legacy/docs/_static/android-chrome-512x512.png b/.legacy/docs/_static/android-chrome-512x512.png deleted file mode 100644 index de0ebe9ef..000000000 Binary files a/.legacy/docs/_static/android-chrome-512x512.png and /dev/null differ diff --git a/.legacy/docs/_static/apple-touch-icon.png b/.legacy/docs/_static/apple-touch-icon.png deleted file mode 100644 index 39e94423b..000000000 Binary files a/.legacy/docs/_static/apple-touch-icon.png and /dev/null differ diff --git a/.legacy/docs/_static/favicon-16x16.png b/.legacy/docs/_static/favicon-16x16.png deleted file mode 100644 index a3930b408..000000000 Binary files a/.legacy/docs/_static/favicon-16x16.png and /dev/null differ diff --git a/.legacy/docs/_static/favicon-32x32.png b/.legacy/docs/_static/favicon-32x32.png deleted file mode 100644 index 2d85ce719..000000000 Binary files a/.legacy/docs/_static/favicon-32x32.png and /dev/null differ diff --git a/.legacy/docs/_static/favicon.ico b/.legacy/docs/_static/favicon.ico deleted file mode 100644 index 6824cb2cc..000000000 Binary files a/.legacy/docs/_static/favicon.ico and /dev/null differ diff --git a/.legacy/include/libfork/core/macro.hpp b/.legacy/include/libfork/core/macro.hpp index 0944c1644..e42fcfc18 100644 --- a/.legacy/include/libfork/core/macro.hpp +++ b/.legacy/include/libfork/core/macro.hpp @@ -61,17 +61,6 @@ #define LF_STATIC_CONST const #endif -// clang-format off - -/** - * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/requires/decltype specifiers. - * - * This macro is not truly variadic but the ``...`` allows commas in the macro argument. - */ -#define LF_HOF_RETURNS(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) requires requires { __VA_ARGS__; } { return __VA_ARGS__;} - -// clang-format on - /** * @brief __[public]__ Detects if the compiler has exceptions enabled. 
* @@ -192,28 +181,6 @@ using std::unreachable; #define LF_ASSERT(expr) LF_ASSUME(expr) #endif -/** - * @brief Macro to prevent a function to be inlined. - */ -#if !defined(LF_NOINLINE) - #if defined(_MSC_VER) && !defined(__clang__) - #define LF_NOINLINE __declspec(noinline) - #elif defined(__GNUC__) && __GNUC__ > 3 - // Clang also defines __GNUC__ (as 4) - #if defined(__CUDACC__) - // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392 - #define LF_NOINLINE __attribute__((noinline)) - #elif defined(__HIP__) - // See https://github.com/boostorg/config/issues/392 - #define LF_NOINLINE __attribute__((noinline)) - #else - #define LF_NOINLINE __attribute__((__noinline__)) - #endif - #else - #define LF_NOINLINE - #endif -#endif - /** * @brief Force no-inline for clang, works-around https://github.com/llvm/llvm-project/issues/63022. * @@ -229,28 +196,6 @@ using std::unreachable; #define LF_CLANG_TLS_NOINLINE #endif -/** - * @brief Macro to use next to 'inline' to force a function to be inlined. - * - * \rst - * - * .. note:: - * - * This does not imply the c++'s `inline` keyword which also has an effect on linkage. - * - * \endrst - */ -#if !defined(LF_FORCEINLINE) - #if defined(_MSC_VER) && !defined(__clang__) - #define LF_FORCEINLINE __forceinline - #elif defined(__GNUC__) && __GNUC__ > 3 - // Clang also defines __GNUC__ (as 4) - #define LF_FORCEINLINE __attribute__((__always_inline__)) - #else - #define LF_FORCEINLINE - #endif -#endif - #if defined(__clang__) && defined(__has_attribute) /** * @brief Compiler specific attribute. 
diff --git a/.python-version b/.python-version new file mode 100644 index 000000000..24ee5b1be --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.13 diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..cb1e1f47d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,206 @@ +# Libfork Copilot Instructions + +## Project Overview + +**libfork** is a continuation-stealing coroutine-tasking library implementing +strict fork-join parallelism using C++20 coroutines. + +- **Type**: C++ library with module/`import std` support +- **Languages**: C++26 + +## Critical Build Requirements + +### Compiler & Module Support + +This project **requires C++23's `import std`** and **MUST** use the appropriate +toolchain file: + +- **MacOS**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` +- **Linux**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake` + +**Common Error**: Without the toolchain file, CMake will fail. + +**Always include the toolchain file** in configure commands. + +### Dependencies (Homebrew) + +Make sure Homebrew is installed and `brew` is in your `PATH`: + +```bash +brew --version +``` + +**Required for building/testing:** + +- `cmake` +- `ninja` +- `catch2` +- `google-benchmark` +- `clang-format` +- `codespell` + +If on MacOS, also require: + +- `llvm` + +If on Linux, also require: + +- `gcc` +- `binutils` + +Install all at once (MacOS): + +```bash +brew install cmake ninja catch2 google-benchmark clang-format codespell llvm +``` + +Install all at once (Linux): + +```bash +brew install cmake ninja catch2 google-benchmark clang-format codespell gcc binutils +``` + +## Build & Test Workflow + +### 1. 
Configure + +Always use presets with the toolchain file: + +```bash +cmake --preset <preset> -DCMAKE_TOOLCHAIN_FILE=cmake/<toolchain>.cmake +``` + +**Relevant available presets** (from `CMakePresets.json`): + +- `ci-hardened` - Debug build with warnings and hardening flags +- `ci-release` - Optimized release build + +All presets enable developer mode (`libfork_DEV_MODE=ON`) and use the Ninja generator. + +You should use the `ci-hardened` preset for development/testing and +`ci-release` for benchmarking. + +### 2. Build + +```bash +cmake --build --preset <preset> +``` + +**Build warnings** (expected and safe): + +- "It is recommended to build benchmarks in Release mode" - only relevant for `ci-hardened` +- CMake experimental `import std;` warning - expected for C++23's `import std` + +### 3. Test + +```bash +ctest --preset <preset> +``` + +All tests should pass. If tests fail, check that: + +- Configuration used the correct toolchain file +- Build completed without errors +- Any changes you have made are correct + +## Project Structure + +### Source Layout + +```sh +libfork/ +├── cmake/ # CMake utilities +├── include/libfork/**/*.hpp # Public headers (macros, version) +├── src/ # C++26 module source files (.cxx) and impl (.cpp) +│ ├── libfork.cxx # libfork — meta-module, re-exports all public modules +│ ├── utils/ # libfork.utils — internal utilities (not public API) +│ │ ├── utils.cxx # aggregator +│ │ └── *.cxx # :partitions +│ ├── core/ # libfork.core — core task/scheduler primitives +│ │ ├── core.cxx # aggregator +│ │ └── *.cxx # :partitions +│ ├── batteries/ # libfork.batteries — stacks, contexts, adaptors +│ │ ├── batteries.cxx # aggregator +│ │ └── *.cxx # :partitions +│ └── schedulers/ # libfork.schedulers — concrete schedulers +│ │ ├── schedulers.cxx # aggregator +│ │ └── *.cxx # :partitions +├── test/src/**/ # Test suite (Catch2) — uses `import libfork;` +│ └── *.cpp +├── benchmark/ # Benchmarking suite (google-benchmark) +│ ├── lib/ # Shared benchmark utilities and definitions +│ │ ├── *.hpp #
headers +│ │ └── *.cpp # common source +│ ├── src/ # Implementation-specific benchmarks +│ │ ├── libfork/ # libfork-based benchmarks +│ │ ├── serial/ # serial benchmarks +│ │ └── */ # Other library benchmarks (e.g. OpenMP, TBB, Cilk Plus) +│ └── external/ # External benchmark code (e.g. UTS) +├── .github/workflows/ # CI workflows +│ ├── linux.yml # Linux builds +│ ├── macos.yml # MacOS builds +│ ├── lint.yml # Linting +│ └── linear.yml # Enforces linear history (no merge commits) +└── CMakeLists.txt # Main build configuration +``` + +## Workflows + +### Workflow Command Pattern + +All workflows follow this pattern: + +```yaml +- Install Dependencies: brew install ... +- Configure: cmake --preset <preset> -DCMAKE_TOOLCHAIN_FILE=<toolchain>.cmake +- Build: cmake --build --preset <preset> +- Test: ctest --preset <preset> +``` + +## Common Development Tasks + +### Making Code Changes + +1. **Modify source files** in `src/`, `include/`, `test/`, or `benchmark/` +2. **Rebuild**: `cmake --build --preset <preset>` +3. **Test**: `ctest --preset <preset>` + +#### Adding/removing files from `src/` or `include/` + +- Update the root `CMakeLists.txt` with new/removed files. + +#### Adding/removing files from benchmarks + +- Update the relevant `CMakeLists.txt` in `benchmark/lib/` or `benchmark/src/<library>/`. + +### Adding Tests + +Strive to add tests for new features/bug fixes. + +- Add `.cpp` files to `test/src/` +- Tests auto-discovered by CMake (GLOB_RECURSE) +- Links against `libfork::libfork` and `Catch2::Catch2WithMain` + +### Modifying Build Configuration + +**Warning**: Module-related changes are complex. Test thoroughly with clean builds.
+ +## Troubleshooting + +### Build Failures + +**Problem**: Configuration/Build fails after adding/removing files or modifying CMakeLists.txt +**Solution**: Try a clean build directory: + +```bash +rm -rf build/ +``` + +**Problem**: "compiler does not provide a way to discover the import graph" +**Solution**: Add `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` to configure + +**Problem**: "Could not find 'brew' executable" +**Solution**: Install Homebrew + +**Problem**: "Could not automatically find libc++.modules.json" +**Solution**: Ensure LLVM is installed via Homebrew; toolchain auto-detects the path diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..68490f0c1 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,3 @@ +# In ./CLAUDE.md + +@AGENTS.md diff --git a/CMakeLists.txt b/CMakeLists.txt index d193bc879..107cc5dd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,38 +1,131 @@ -cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) +cmake_minimum_required(VERSION 4.3 FATAL_ERROR) # See `Help/dev/experimental.rst` -set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "d0edc3af-4c50-42ea-a356-e2862fe7a444") +set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "451f2fe2-a8a2-47c3-bc32-94786d8fc91b") include(cmake/read_version.cmake) -read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/core/macro.hpp) +read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/version.hpp) project( libfork VERSION ${version_major}.${version_minor}.${version_patch} - DESCRIPTION "A bleeding-edge, lock-free, wait-free, continuation-stealing fork-join library built on C++20's coroutines." LANGUAGES CXX ) +# ---- Project options ---- + +option(libfork_DEV_MODE "Enable developer build (tests/benchmarks/etc) for libfork" OFF) + +# ---- System dependencies ---- + +find_package(Threads REQUIRED) + +# =========================== + # Tell CMake that we explicitly want `import std`. 
This will initialize the # property on all targets declared after this to 1 # TODO: set property per target set(CMAKE_CXX_MODULE_STD 1) -# Make a library. -add_library(uses_std STATIC) +add_library(libfork_libfork) +add_library(libfork::libfork ALIAS libfork_libfork) + +target_link_libraries(libfork_libfork PUBLIC Threads::Threads) -# Add sources. -target_sources(uses_std PRIVATE uses_std.cxx) +set_property(TARGET libfork_libfork PROPERTY EXPORT_NAME libfork) -# Tell CMake we're using C++23 but only C++20 is needed to consume it. -target_compile_features(uses_std INTERFACE cxx_std_23) +target_compile_features(libfork_libfork PUBLIC cxx_std_26) -# Make an executable. -add_executable(main) +# Public headers, __impl must be public because consumers need +# them to build the module BMI +target_sources(libfork_libfork + PUBLIC + FILE_SET HEADERS FILES + include/libfork/version.hpp + include/libfork/__impl/compiler.hpp + include/libfork/__impl/exception.hpp + include/libfork/__impl/utils.hpp + include/libfork/__impl/assume.hpp + BASE_DIRS + include +) -target_sources(main PRIVATE main.cxx) -target_link_libraries(main PRIVATE uses_std) +# Add the module files to the library, must be public because +# consumers will need bo build the BMI +target_sources(libfork_libfork + PUBLIC + FILE_SET CXX_MODULES FILES + # libfork (meta) + src/libfork.cxx + # libfork.utils + src/utils/utils.cxx + src/utils/utility.cxx + src/utils/constants.cxx + src/utils/tuple.cxx + src/utils/concepts.cxx + src/utils/defer.cxx + src/utils/uninitialized.cxx + # libfork.core + src/core/core.cxx + src/core/exception.cxx + src/core/concepts/stack.cxx + src/core/concepts/context.cxx + src/core/concepts/scheduler.cxx + src/core/concepts/invocable.cxx + src/core/concepts/awaitable.cxx + src/core/concepts/indirect.cxx + src/core/concepts/semigroup.cxx + src/core/frame.cxx + src/core/task.cxx + src/core/ops.cxx + src/core/poly_context.cxx + src/core/thread_locals.cxx + src/core/schedule.cxx + 
src/core/handles.cxx + src/core/root.cxx + src/core/execute.cxx + src/core/receiver.cxx + src/core/final_suspend.cxx + src/core/awaitables.cxx + src/core/promise.cxx + src/core/stop.cxx + src/core/projected.cxx + src/core/lift.cxx + # libfork.batteries + src/batteries/batteries.cxx + src/batteries/deque.cxx + src/batteries/adaptors.cxx + src/batteries/contexts.cxx + src/batteries/geometric_stack.cxx + src/batteries/adaptor_stack.cxx + src/batteries/slab_stack.cxx + # libfork.schedulers + src/schedulers/schedulers.cxx + src/schedulers/inline.cxx + src/schedulers/busy.cxx + # libfork.algorithm + src/algorithm/algorithm.cxx + src/algorithm/for_each.cxx + src/algorithm/fold.cxx + src/algorithm/concepts.cxx + PRIVATE + src/exception.cpp +) + +# ====================== + +if(libfork_DEV_MODE) + + include(CTest) # Enables the BUILD_TESTING option + + if(BUILD_TESTING) + add_subdirectory(test) + endif() + + add_subdirectory(benchmark) + +endif() # list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # @@ -43,9 +136,6 @@ target_link_libraries(main PRIVATE uses_std) # # message(STATUS "CMAKE_BUILD_TYPE is set to '${CMAKE_BUILD_TYPE}'") # -# # ---- System dependencies ---- -# -# find_package(Threads REQUIRED) # # # ------ Declare library ------ # diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 000000000..ac23b1e37 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,126 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "cmake-pedantic", + "hidden": true, + "warnings": { + "dev": true, + "deprecated": true, + "uninitialized": true, + "unusedCli": true, + "systemVars": false + }, + "errors": { + "deprecated": true + } + }, + { + "name": "ci-base", + "inherits": "cmake-pedantic", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "libfork_DEV_MODE": "ON" + } + }, + { + "name": "ci-hardened", + "inherits": "ci-base", + 
"displayName": "Debug with warnings and hardening", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_CXX_FLAGS": "-O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -Wcast-qual -Wformat -Wformat=2 -Wundef -Werror=float-equal -Wshadow -Wcast-align -Wunused -Wnull-dereference -Wdouble-promotion -Wimplicit-fallthrough -Wextra-semi -Woverloaded-virtual -Wnon-virtual-dtor -Wold-style-cast -Werror=format-security -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS -fstrict-flex-arrays=3 -fstack-protector-strong -Wno-missing-braces -Wno-missing-field-initializers -Wno-c2y-extensions" + } + }, + { + "name": "ci-release", + "inherits": "ci-base", + "displayName": "Release", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -falign-functions=64" + } + }, + { + "name": "ci-no-except-rtti", + "inherits": "ci-base", + "displayName": "Release no RTTI or exceptions", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -fno-exceptions -fno-rtti -falign-functions=64" + } + }, + { + "name": "ci-sanitize", + "inherits": "ci-base", + "displayName": "Debug with sanitizers", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Sanitize", + "CMAKE_CXX_FLAGS": "-O2 -g -fsanitize=address,undefined -fno-omit-frame-pointer -fno-common -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS" + } + } + ], + "buildPresets": [ + { + "name": "ci-hardened", + "configurePreset": "ci-hardened" + }, + { + "name": "ci-release", + "configurePreset": "ci-release" + }, + { + "name": "ci-no-except-rtti", + "configurePreset": "ci-no-except-rtti" + }, + { + "name": "ci-sanitize", + "configurePreset": "ci-sanitize" + } + ], + "testPresets": [ + { + "name": "ci-hardened", + "configurePreset": "ci-hardened", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-release", + "configurePreset": 
"ci-release", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-no-except-rtti", + "configurePreset": "ci-no-except-rtti", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-sanitize", + "configurePreset": "ci-sanitize", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + } + ] +} diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json new file mode 100644 index 000000000..26e650168 --- /dev/null +++ b/CMakeUserPresets.json @@ -0,0 +1,79 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "dev", + "inherits": "ci-hardened", + "displayName": "Hardened development build", + "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake", + "cacheVariables": { + "CMAKE_COLOR_DIAGNOSTICS": "ON" + } + }, + { + "name": "bench", + "inherits": "ci-release", + "displayName": "Release build for benchmarks", + "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake", + "cacheVariables": { + "CMAKE_COLOR_DIAGNOSTICS": "ON" + } + } + ], + "buildPresets": [ + { + "name": "dev", + "configurePreset": "dev" + }, + { + "name": "bench", + "configurePreset": "bench" + } + ], + "testPresets": [ + { + "name": "dev", + "configurePreset": "dev", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + } + ], + "workflowPresets": [ + { + "name": "dev", + "displayName": "Development Debug Hardened Workflow", + "steps": [ + { + "type": "configure", + "name": "dev" + }, + { + "type": "build", + "name": "dev" + }, + { + "type": "test", + "name": "dev" + } + ] + }, + { + "name": "bench", + "displayName": "Release Build (including Benchmarks)", + "steps": [ + { + "type": "configure", + "name": "bench" + }, + { + "type": "build", + "name": "bench" + } + ] + } + ] +} diff --git a/.legacy/LICENSE.md b/LICENSE.md similarity index 100% rename from .legacy/LICENSE.md rename to 
LICENSE.md diff --git a/actions/setup/action.yaml b/actions/setup/action.yaml deleted file mode 100644 index cd451a14b..000000000 --- a/actions/setup/action.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: 'setup' -description: 'setup vcpkg/cmake/ninja and caching' - -runs: - using: "composite" - - steps: - # Set env vars needed for vcpkg to leverage the GitHub Action cache as a storage for Binary Caching. - - uses: actions/github-script@v6 - with: - script: | - core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); - core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - - - uses: actions/checkout@v3 - with: - submodules: true - - name: "Create directory '${{ env.VCPKG_DEFAULT_BINARY_CACHE }}'" - run: mkdir -p $VCPKG_DEFAULT_BINARY_CACHE - shell: bash - - # Setup the build machine with the most recent versions of CMake and Ninja. - # Both are cached if not already: on subsequent runs both will be quickly restored from GitHub cache service. - - uses: lukka/get-cmake@latest - - # Restore vcpkg from the GitHub Action cache service. - # Note that packages are restored by vcpkg's binary caching when it is being run afterwards by CMake. - - name: Restore vcpkg - uses: actions/cache@v3 - with: - # The first path is the location of vcpkg: it contains the vcpkg executable and data files, as long as the - # built package archives (aka binary cache) which are located by VCPKG_DEFAULT_BINARY_CACHE env var. - # The other paths starting with '!' are exclusions: they contain temporary files generated - # during the build of the installed packages. - path: | - ${{ env.VCPKG_ROOT_DIR }} - !${{ env.VCPKG_ROOT_DIR }}/buildtrees - !${{ env.VCPKG_ROOT_DIR }}/packages - !${{ env.VCPKG_ROOT_DIR }}/downloads - !${{ env.VCPKG_ROOT_DIR }}/installed - # The key is composed in a way that it gets properly invalidated whenever a different version of vcpkg is being used. 
- key: | - ${{ hashFiles( '.git/modules/vcpkg/HEAD' )}} - - # On Windows runners, let's ensure to have the Developer Command Prompt environment setup correctly. - # As used here the Developer Command Prompt created is targeting x64 and using the default the Windows SDK. - - uses: ilammy/msvc-dev-cmd@v1 - - - name: Setup xcode - if: matrix.os == 'macos-13' - shell: bash - run: sudo xcode-select --switch /Applications/Xcode_15.0.app/Contents/Developer \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 000000000..5ce1a7480 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) + +project(libfork_benchmark LANGUAGES CXX) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") + message(WARNING "It is recommended to build benchmarks in Release mode for accurate results.") +endif() + +# ---- Dependencies ---- + +find_package(benchmark REQUIRED) + +# ---- Benchmarks ---- + +add_subdirectory(lib) + +add_subdirectory(src/serial) +add_subdirectory(src/baremetal) +add_subdirectory(src/libfork) + +# WHOLE_ARCHIVE ensures benchmark registrations (global initialisers) are not +# dropped by the linker when pulling objects from the static libraries above. 
+add_executable(libfork_benchmark src/benchmarks.cpp) + +target_link_libraries(libfork_benchmark + PRIVATE + $ + benchmark::benchmark_main +) + +if(BUILD_TESTING) + add_test(NAME Benchmark + COMMAND libfork_benchmark --benchmark_dry_run --benchmark_filter=^test/ + ) +endif() + +# ---- OpenMP Benchmarks ---- + +find_package(OpenMP REQUIRED) + +if(OpenMP_CXX_FOUND) + + add_subdirectory(src/openmp) + + target_link_libraries(libfork_benchmark + PRIVATE + $ + ) +endif() + + diff --git a/benchmark/external/uts/CMakeLists.txt b/benchmark/external/uts/CMakeLists.txt new file mode 100644 index 000000000..89c8b5e6e --- /dev/null +++ b/benchmark/external/uts/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) + +project(uts_external LANGUAGES C) + +add_library(uts_c) + +target_sources(uts_c + PRIVATE + src/uts.c + src/rng/brg_sha1.c + PUBLIC + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include + FILES + include/uts/uts.h + include/uts/rng/rng.h + include/uts/rng/brg_sha1.h + include/uts/rng/brg_types.h +) diff --git a/benchmark/external/uts/include/uts/rng/brg_sha1.h b/benchmark/external/uts/include/uts/rng/brg_sha1.h new file mode 100644 index 000000000..d30f12c0d --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/brg_sha1.h @@ -0,0 +1,100 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. 
the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 01/08/2005 +*/ + +#ifndef _SHA1_H +#define _SHA1_H + +#include "uts/rng/brg_types.h" + +#define SHA1_BLOCK_SIZE 64 +#define SHA1_DIGEST_SIZE 20 + +#if defined(__cplusplus) +extern "C" { +#endif + +/** BEGIN: UTS RNG Harness **/ + +#define POS_MASK 0x7fffffff +#define HIGH_BITS 0x80000000 + +#define sha1_context sha1_ctx_s + +/**********************************/ +/* random number generator state */ +/**********************************/ +struct state_t { + uint_8t state[20]; +}; + +typedef uint_8t RNG_state; + +/***************************************/ +/* random number generator operations */ +/***************************************/ +void rng_init(RNG_state *state, int seed); +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnNumber); +int rng_rand(RNG_state *mystate); +int rng_nextrand(RNG_state *mystate); +char *rng_showstate(RNG_state *state, char *s); +int rng_showtype(char *strBuf, int ind); + +/** END: UTS RNG Harness **/ +/* type to hold the SHA1 context */ + +struct sha1_ctx_s { + uint_32t count[2]; + uint_32t hash[5]; + uint_32t wbuf[16]; +}; + +typedef struct sha1_ctx_s sha1_ctx; + +/* Note that these prototypes are the same for both bit and */ +/* byte oriented implementations.
However the length fields */ +/* are in bytes or bits as appropriate for the version used */ +/* and bit sequences are input as arrays of bytes in which */ +/* bit sequences run from the most to the least significant */ +/* end of each byte */ + +VOID_RETURN sha1_compile(sha1_ctx ctx[1]); + +VOID_RETURN sha1_begin(sha1_ctx ctx[1]); +VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]); +VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]); +VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len); + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/rng/brg_types.h b/benchmark/external/uts/include/uts/rng/brg_types.h new file mode 100644 index 000000000..9532acce6 --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/brg_types.h @@ -0,0 +1,214 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. 
+ + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 09/09/2006 + + The unsigned integer types defined here are of the form uint_<nn>t where + <nn> is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form uint<nn>_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef BRG_TYPES_H +#define BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include + +/* Try one of these if things don't work automatically */ +#ifdef BRG_C99_TYPES + #include + #include + #define BRG_UI8 +typedef uint8_t uint_8t; + #define BRG_UI16 +typedef uint16_t uint_16t; + #define BRG_UI32 + #define li_32(h) 0x##h##u +typedef uint32_t uint_32t; + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef uint64_t uint_64t; + +#elif defined(BRG_STD_TYPES) + #include + #define BRG_UI8 +typedef u_int8_t uint_8t; + #define BRG_UI16 +typedef u_int16_t uint_16t; + #define BRG_UI32 + #define li_32(h) 0x##h##u +typedef u_int32_t uint_32t; + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef u_int64_t uint_64t; + +#endif + +#ifndef BRG_UI8 + #define BRG_UI8 + #if UCHAR_MAX == 255u +typedef unsigned char uint_8t; + #else + #error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h + #endif +#endif + +#ifndef BRG_UI16 + #define BRG_UI16 + #if USHRT_MAX == 65535u +typedef unsigned short uint_16t; + #else + #error Please define uint_16t as a 16-bit unsigned short
type in brg_types.h + #endif +#endif + +#ifndef BRG_UI32 + #define BRG_UI32 + #if UINT_MAX == 4294967295u + #define li_32(h) 0x##h##u +typedef unsigned int uint_32t; + #elif ULONG_MAX == 4294967295u + #define li_32(h) 0x##h##ul +typedef unsigned long uint_32t; + #elif defined(_CRAY) + #error This code needs 32-bit data types, which Cray machines do not provide + #else + #error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h + #endif +#endif + +#ifndef BRG_UI64 + #if defined(__BORLANDC__) && !defined(__MSDOS__) + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned __int64 uint_64t; + #elif defined(_MSC_VER) && (_MSC_VER < 1300) /* 1300 == VC++ 7.0 */ + #define BRG_UI64 + #define li_64(h) 0x##h##ui64 +typedef unsigned __int64 uint_64t; + #elif defined(__sun) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #elif defined(UINT_MAX) && UINT_MAX > 4294967295u + #if UINT_MAX == 18446744073709551615u + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef unsigned int uint_64t; + #endif + #elif defined(ULONG_MAX) && ULONG_MAX > 4294967295u + #if ULONG_MAX == 18446744073709551615ul + #define BRG_UI64 + #define li_64(h) 0x##h##ul +typedef unsigned long uint_64t; + #endif + #elif defined(ULLONG_MAX) && ULLONG_MAX > 4294967295u + #if ULLONG_MAX == 18446744073709551615ull + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #endif + #elif defined(ULONG_LONG_MAX) && ULONG_LONG_MAX > 4294967295u + #if ULONG_LONG_MAX == 18446744073709551615ull + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #endif + #endif +#endif + +#if defined(NEED_UINT_64T) && !defined(BRG_UI64) + #error Please define uint_64t as an unsigned 64 bit type in brg_types.h +#endif + +#ifndef RETURN_VALUES + #define RETURN_VALUES + #if defined(DLL_EXPORT) + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + 
#define VOID_RETURN __declspec(dllexport) void __stdcall + #define INT_RETURN __declspec(dllexport) int __stdcall + #elif defined(__GNUC__) + #define VOID_RETURN __declspec(__dllexport__) void + #define INT_RETURN __declspec(__dllexport__) int + #else + #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers + #endif + #elif defined(DLL_IMPORT) + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define VOID_RETURN __declspec(dllimport) void __stdcall + #define INT_RETURN __declspec(dllimport) int __stdcall + #elif defined(__GNUC__) + #define VOID_RETURN __declspec(__dllimport__) void + #define INT_RETURN __declspec(__dllimport__) int + #else + #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers + #endif + #elif defined(__WATCOMC__) + #define VOID_RETURN void __cdecl + #define INT_RETURN int __cdecl + #else + #define VOID_RETURN void + #define INT_RETURN int + #endif +#endif + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. 
In all these + defines 'size' must be a power of 2 and >= 8 + + dec_unit_type(size,x) declares a variable 'x' of length + 'size' bits + + dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + ptr_cast(x,size) casts a pointer to a pointer to a + variable of length 'size' bits +*/ + +#define ui_type(size) uint_##size##t +#define dec_unit_type(size, x) typedef ui_type(size) x +#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x, size) ((ui_type(size) *)(x)) + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/rng/rng.h b/benchmark/external/uts/include/uts/rng/rng.h new file mode 100644 index 000000000..105c40466 --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/rng.h @@ -0,0 +1,6 @@ +#ifndef _RNG_H +#define _RNG_H + +#include "uts/rng/brg_sha1.h" + +#endif /* _RNG_H */ \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/uts.h b/benchmark/external/uts/include/uts/uts.h new file mode 100644 index 000000000..e86e68f3e --- /dev/null +++ b/benchmark/external/uts/include/uts/uts.h @@ -0,0 +1,120 @@ +#ifndef A0179FFF_4078_4EEB_BB6E_1E8C75CC694C +#define A0179FFF_4078_4EEB_BB6E_1E8C75CC694C +/* + * ---- The Unbalanced Tree Search (UTS) Benchmark ---- + * + * Copyright (c) 2010 See AUTHORS file for copyright holders + * + * This file is part of the unbalanced tree search benchmark. This + * project is licensed under the MIT Open Source license. See the LICENSE + * file for copyright and licensing information. + * + * UTS is a collaborative project between researchers at the University of + * Maryland, the University of North Carolina at Chapel Hill, and the Ohio + * State University. See AUTHORS file for more information. 
+ * + */ + +#ifndef _UTS_H + #define _UTS_H + + #ifdef __cplusplus +extern "C" { + #endif + + #include "uts/rng/rng.h" + + #define UTS_VERSION "2.1" + + /*********************************************************** + * Tree node descriptor and statistics * + ***********************************************************/ + + #define MAXNUMCHILDREN 100 // cap on children (BIN root is exempt) + +struct node_t { + int type; // distribution governing number of children + int height; // depth of this node in the tree + int numChildren; // number of children, -1 => not yet determined + + /* for RNG state associated with this node */ + struct state_t state; +}; + +typedef struct node_t Node; + +/* Tree type + * Trees are generated using a Galton-Watson process, in + * which the branching factor of each node is a random + * variable. + * + * The random variable can follow a binomial distribution + * or a geometric distribution. Hybrid tree are + * generated with geometric distributions near the + * root and binomial distributions towards the leaves. + */ +enum uts_trees_e { BIN = 0, GEO, HYBRID, BALANCED }; +enum uts_geoshape_e { LINEAR = 0, EXPDEC, CYCLIC, FIXED }; + +typedef enum uts_trees_e tree_t; +typedef enum uts_geoshape_e geoshape_t; + +/* Strings for the above enums */ +extern char *uts_trees_str[]; +extern char *uts_geoshapes_str[]; + +/* Tree parameters */ +extern tree_t type; +extern double b_0; +extern int rootId; +extern int nonLeafBF; +extern double nonLeafProb; +extern int gen_mx; +extern geoshape_t shape_fn; +extern double shiftDepth; + +/* Benchmark parameters */ +extern int computeGranularity; +extern int debug; +extern int verbose; + +/* For stats generation: */ +typedef unsigned long long counter_t; + + /* Utility Functions */ + #define max(a, b) (((a) > (b)) ? (a) : (b)) + #define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +void uts_error(char *str); +void uts_parseParams(int argc, char **argv); +int uts_paramsToStr(char *strBuf, int ind); +void uts_printParams(); +void uts_helpMessage(); + +void uts_showStats( + int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth); +double uts_wctime(); + +double rng_toProb(int n); + +/* Common tree routines */ +void uts_initRoot(Node *root, int type); +int uts_numChildren(Node *parent); +int uts_numChildren_bin(Node *parent); +int uts_numChildren_geo(Node *parent); +int uts_childType(Node *parent); + +/* Implementation Specific Functions */ +char *impl_getName(); +int impl_paramsToStr(char *strBuf, int ind); +int impl_parseParam(char *param, char *value); +void impl_helpMessage(); +void impl_abort(int err); + + #ifdef __cplusplus +} + #endif + +#endif /* _UTS_H */ + +#endif /* A0179FFF_4078_4EEB_BB6E_1E8C75CC694C */ diff --git a/benchmark/external/uts/src/rng/brg_endian.h b/benchmark/external/uts/src/rng/brg_endian.h new file mode 100644 index 000000000..96082e57b --- /dev/null +++ b/benchmark/external/uts/src/rng/brg_endian.h @@ -0,0 +1,132 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 20/10/2006 +*/ + +#ifndef BRG_ENDIAN_H +#define BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + #include +#elif defined(BSD) && (BSD >= 199103) || defined(__APPLE__) || defined(__CYGWIN32__) || \ + defined(__DJGPP__) || defined(__osf__) + #include +#elif defined(__linux__) || defined(__GNUC__) || defined(__GNU_LIBRARY__) + #if !defined(__MINGW32__) && !defined(__sun__) + #include + #if !defined(__BEOS__) + #include + #endif + #endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN) + #if defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN) + #if defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN + #define PLATFORM_BYTE_ORDER 
IS_BIG_ENDIAN + #elif defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(_BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN) + #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(__BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN__) && defined(__LITTLE_ENDIAN__) + #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __BIG_ENDIAN__ + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(__BIG_ENDIAN__) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN__) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + + #if defined(__alpha__) || defined(__alpha) || defined(i386) || defined(__i386__) || defined(_M_I86) || \ + defined(_M_IX86) || defined(__OS2__) || defined(sun386) || defined(__TURBOC__) || defined(vax) || \ + defined(vms) || defined(VMS) || defined(__VMS) || defined(_M_X64) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + + #elif defined(AMIGA) || defined(applec) || defined(__AS400__) || defined(_CRAY) || defined(__hppa) || \ + defined(__hp9000) || defined(ibm370) || defined(mc68000) || defined(m68k) || defined(__MRC__) || \ + defined(__MVS__) || defined(__MWERKS__) || defined(sparc) || defined(__sparc) || \ + defined(SYMANTEC_C) || 
defined(__VOS__) || defined(__TIGCC__) || defined(__TANDEM) || \ + defined(THINK_C) || defined(__VMCMS__) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + + #elif 0 /* **** EDIT HERE IF NECESSARY **** */ + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #elif 0 /* **** EDIT HERE IF NECESSARY **** */ + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #else + #error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order + #endif + +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/src/rng/brg_sha1.c b/benchmark/external/uts/src/rng/brg_sha1.c new file mode 100644 index 000000000..f6757bafc --- /dev/null +++ b/benchmark/external/uts/src/rng/brg_sha1.c @@ -0,0 +1,340 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 01/08/2005 + + This is a byte oriented version of SHA1 that operates on arrays of bytes + stored in memory. +*/ + +#include +#include /* for memcpy() etc. */ + +#include "brg_endian.h" +#include "uts/rng/brg_sha1.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** BEGIN: UTS RNG Harness **/ + +void rng_init(RNG_state *newstate, int seed) { + struct sha1_context ctx; + struct state_t gen; + int i; + + for (i = 0; i < 16; i++) + gen.state[i] = 0; + gen.state[16] = 0xFF & (seed >> 24); + gen.state[17] = 0xFF & (seed >> 16); + gen.state[18] = 0xFF & (seed >> 8); + gen.state[19] = 0xFF & (seed >> 0); + + sha1_begin(&ctx); + sha1_hash(gen.state, 20, &ctx); + sha1_end(newstate, &ctx); +} + +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber) { + struct sha1_context ctx; + uint_8t bytes[4]; + + bytes[0] = 0xFF & (spawnnumber >> 24); + bytes[1] = 0xFF & (spawnnumber >> 16); + bytes[2] = 0xFF & (spawnnumber >> 8); + bytes[3] = 0xFF & spawnnumber; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_hash(bytes, 4, &ctx); + sha1_end(newstate, &ctx); +} + +int rng_rand(RNG_state *mystate) { + int r; + uint_32t b = (mystate[16] << 24) | (mystate[17] << 16) | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int)b; + // printf("b: %d\t, r: %d\n", b, r); + return r; +} + +int rng_nextrand(RNG_state *mystate) { + struct sha1_context ctx; + int r; + uint_32t b; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_end(mystate, &ctx); + b = (mystate[16] << 24) | (mystate[17] << 16) | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int)b; + return r; +} + +/* condense state into string to display during debugging */ +char *rng_showstate(RNG_state *state, char *s) { + sprintf(s, "%.2X%.2X...", state[0], state[1]); + return s; +} + +/* describe random number generator type into string */ +int rng_showtype(char 
*strBuf, int ind) { + ind += sprintf(strBuf + ind, "SHA-1 (state size = %uB)", (unsigned)sizeof(struct state_t)); + return ind; +} + +/** END: UTS RNG Harness **/ + +#if defined(_MSC_VER) && (_MSC_VER > 800) + #pragma intrinsic(memcpy) +#endif + +#if 0 && defined(_MSC_VER) + #define rotl32 _lrotl + #define rotr32 _lrotr +#else + #define rotl32(x, n) (((x) << n) | ((x) >> (32 - n))) + #define rotr32(x, n) (((x) >> n) | ((x) << (32 - n))) +#endif + +#if !defined(bswap_32) + #define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00)) +#endif + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + #define SWAP_BYTES +#else + #undef SWAP_BYTES +#endif + +#if defined(SWAP_BYTES) + #define bsw_32(p, n) \ + { \ + int _i = (n); \ + while (_i--) \ + ((uint_32t *)p)[_i] = bswap_32(((uint_32t *)p)[_i]); \ + } +#else + #define bsw_32(p, n) +#endif + +#define SHA1_MASK (SHA1_BLOCK_SIZE - 1) + +#if 0 + + #define ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) + #define parity(x, y, z) ((x) ^ (y) ^ (z)) + #define maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +#else /* Discovered by Rich Schroeppel and Colin Plumb */ + + #define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) + #define parity(x, y, z) ((x) ^ (y) ^ (z)) + #define maj(x, y, z) (((x) & (y)) | ((z) & ((x) ^ (y)))) + +#endif + +/* Compile 64 bytes of hash data into SHA1 context. 
Note */ +/* that this routine assumes that the byte order in the */ +/* ctx->wbuf[] at this point is in such an order that low */ +/* address bytes in the ORIGINAL byte stream will go in */ +/* this buffer to the high end of 32-bit words on BOTH big */ +/* and little endian systems */ + +#ifdef ARRAY + #define q(v, n) v[n] +#else + #define q(v, n) v##n +#endif + +#define one_cycle(v, a, b, c, d, e, f, k, h) \ + q(v, e) += rotr32(q(v, a), 27) + f(q(v, b), q(v, c), q(v, d)) + k + h; \ + q(v, b) = rotr32(q(v, b), 2) + +#define five_cycle(v, f, k, i) \ + one_cycle(v, 0, 1, 2, 3, 4, f, k, hf(i)); \ + one_cycle(v, 4, 0, 1, 2, 3, f, k, hf(i + 1)); \ + one_cycle(v, 3, 4, 0, 1, 2, f, k, hf(i + 2)); \ + one_cycle(v, 2, 3, 4, 0, 1, f, k, hf(i + 3)); \ + one_cycle(v, 1, 2, 3, 4, 0, f, k, hf(i + 4)) + +VOID_RETURN sha1_compile(sha1_ctx ctx[1]) { + uint_32t *w = ctx->wbuf; + +#ifdef ARRAY + uint_32t v[5]; + memcpy(v, ctx->hash, 5 * sizeof(uint_32t)); +#else + uint_32t v0, v1, v2, v3, v4; + v0 = ctx->hash[0]; + v1 = ctx->hash[1]; + v2 = ctx->hash[2]; + v3 = ctx->hash[3]; + v4 = ctx->hash[4]; +#endif + +#define hf(i) w[i] + + five_cycle(v, ch, 0x5a827999, 0); + five_cycle(v, ch, 0x5a827999, 5); + five_cycle(v, ch, 0x5a827999, 10); + one_cycle(v, 0, 1, 2, 3, 4, ch, 0x5a827999, hf(15)); + +#undef hf +#define hf(i) (w[(i)&15] = rotl32(w[((i) + 13) & 15] ^ w[((i) + 8) & 15] ^ w[((i) + 2) & 15] ^ w[(i)&15], 1)) + + one_cycle(v, 4, 0, 1, 2, 3, ch, 0x5a827999, hf(16)); + one_cycle(v, 3, 4, 0, 1, 2, ch, 0x5a827999, hf(17)); + one_cycle(v, 2, 3, 4, 0, 1, ch, 0x5a827999, hf(18)); + one_cycle(v, 1, 2, 3, 4, 0, ch, 0x5a827999, hf(19)); + + five_cycle(v, parity, 0x6ed9eba1, 20); + five_cycle(v, parity, 0x6ed9eba1, 25); + five_cycle(v, parity, 0x6ed9eba1, 30); + five_cycle(v, parity, 0x6ed9eba1, 35); + + five_cycle(v, maj, 0x8f1bbcdc, 40); + five_cycle(v, maj, 0x8f1bbcdc, 45); + five_cycle(v, maj, 0x8f1bbcdc, 50); + five_cycle(v, maj, 0x8f1bbcdc, 55); + + five_cycle(v, parity, 0xca62c1d6, 60); 
+ five_cycle(v, parity, 0xca62c1d6, 65); + five_cycle(v, parity, 0xca62c1d6, 70); + five_cycle(v, parity, 0xca62c1d6, 75); + +#ifdef ARRAY + ctx->hash[0] += v[0]; + ctx->hash[1] += v[1]; + ctx->hash[2] += v[2]; + ctx->hash[3] += v[3]; + ctx->hash[4] += v[4]; +#else + ctx->hash[0] += v0; + ctx->hash[1] += v1; + ctx->hash[2] += v2; + ctx->hash[3] += v3; + ctx->hash[4] += v4; +#endif +} + +VOID_RETURN sha1_begin(sha1_ctx ctx[1]) { + ctx->count[0] = ctx->count[1] = 0; + ctx->hash[0] = 0x67452301; + ctx->hash[1] = 0xefcdab89; + ctx->hash[2] = 0x98badcfe; + ctx->hash[3] = 0x10325476; + ctx->hash[4] = 0xc3d2e1f0; +} + +/* SHA1 hash data in an array of bytes into hash buffer and */ +/* call the hash_compile function as required. */ + +VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]) { + uint_32t pos = (uint_32t)(ctx->count[0] & SHA1_MASK), space = SHA1_BLOCK_SIZE - pos; + const unsigned char *sp = data; + + if ((ctx->count[0] += len) < len) + ++(ctx->count[1]); + + while (len >= space) /* transfer whole blocks if possible */ + { + memcpy(((unsigned char *)ctx->wbuf) + pos, sp, space); + sp += space; + len -= space; + space = SHA1_BLOCK_SIZE; + pos = 0; + bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2); + sha1_compile(ctx); + } + + memcpy(((unsigned char *)ctx->wbuf) + pos, sp, len); +} + +/* SHA1 final padding and digest calculation */ + +VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]) { + uint_32t i = (uint_32t)(ctx->count[0] & SHA1_MASK); + + /* put bytes in the buffer in an order in which references to */ + /* 32-bit words will put bytes with lower addresses into the */ + /* top of 32 bit words on BOTH big and little endian machines */ + bsw_32(ctx->wbuf, (i + 3) >> 2); + + /* we now need to mask valid bytes and add the padding which is */ + /* a single 1 bit and as many zero bits as necessary. 
Note that */ + /* we can always add the first padding byte here because the */ + /* buffer always has at least one empty slot */ + ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3); + ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3); + + /* we need 9 or more empty positions, one for the padding byte */ + /* (above) and eight for the length count. If there is not */ + /* enough space, pad and empty the buffer */ + if (i > SHA1_BLOCK_SIZE - 9) { + if (i < 60) + ctx->wbuf[15] = 0; + sha1_compile(ctx); + i = 0; + } else /* compute a word index for the empty buffer positions */ + i = (i >> 2) + 1; + + while (i < 14) /* and zero pad all but last two positions */ + ctx->wbuf[i++] = 0; + + /* the following 32-bit length fields are assembled in the */ + /* wrong byte order on little endian machines but this is */ + /* corrected later since they are only ever used as 32-bit */ + /* word values. */ + ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29); + ctx->wbuf[15] = ctx->count[0] << 3; + sha1_compile(ctx); + + /* extract the hash value as bytes in case the hash buffer is */ + /* misaligned for 32-bit words */ + for (i = 0; i < SHA1_DIGEST_SIZE; ++i) + hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3))); +} + +VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len) { + sha1_ctx cx[1]; + + sha1_begin(cx); + sha1_hash(data, len, cx); + sha1_end(hval, cx); +} + +#if defined(__cplusplus) +} +#endif \ No newline at end of file diff --git a/benchmark/external/uts/src/uts.c b/benchmark/external/uts/src/uts.c new file mode 100644 index 000000000..507915bea --- /dev/null +++ b/benchmark/external/uts/src/uts.c @@ -0,0 +1,474 @@ +/* + * ---- The Unbalanced Tree Search (UTS) Benchmark ---- + * + * Copyright (c) 2010 See AUTHORS file for copyright holders + * + * This file is part of the unbalanced tree search benchmark. This + * project is licensed under the MIT Open Source license. 
See the LICENSE + * file for copyright and licensing information. + * + * UTS is a collaborative project between researchers at the University of + * Maryland, the University of North Carolina at Chapel Hill, and the Ohio + * State University. See AUTHORS file for more information. + * + */ + +#include +#include +#include +#include +#include + +#include "uts/uts.h" + +/*********************************************************** + * tree generation and search parameters * + * * + * Tree generation strategy is controlled via various * + * parameters set from the command line. The parameters * + * and their default values are given below. * + ***********************************************************/ + +char *uts_trees_str[] = {"Binomial", "Geometric", "Hybrid", "Balanced"}; +char *uts_geoshapes_str[] = {"Linear decrease", "Exponential decrease", "Cyclic", "Fixed branching factor"}; + +/* Tree type + * Trees are generated using a Galton-Watson process, in + * which the branching factor of each node is a random + * variable. + * + * The random variable can follow a binomial distribution + * or a geometric distribution. Hybrid tree are + * generated with geometric distributions near the + * root and binomial distributions towards the leaves. + */ +tree_t type = GEO; // Default tree type +double b_0 = 4.0; // default branching factor at the root +int rootId = 0; // default seed for RNG state at root + +/* Tree type BIN (BINOMIAL) + * The branching factor at the root is specified by b_0. + * The branching factor below the root follows an + * identical binomial distribution at all nodes. + * A node has m children with prob q, or no children with + * prob (1-q). The expected branching factor is q * m. + * + * Default parameter values + */ +int nonLeafBF = 4; // m +double nonLeafProb = 15.0 / 64.0; // q + +/* Tree type GEO (GEOMETRIC) + * The branching factor follows a geometric distribution with + * expected value b. 
+ * The probability that a node has 0 <= n children is p(1-p)^n for + * 0 < p <= 1. The distribution is truncated at MAXNUMCHILDREN. + * The expected number of children b = (1-p)/p. Given b (the + * target branching factor) we can solve for p. + * + * A shape function computes a target branching factor b_i + * for nodes at depth i as a function of the root branching + * factor b_0 and a maximum depth gen_mx. + * + * Default parameter values + */ +int gen_mx = 6; // default depth of tree +geoshape_t shape_fn = LINEAR; // default shape function (b_i decr linearly) + +/* In type HYBRID trees, each node is either type BIN or type + * GEO, with the generation strategy changing from GEO to BIN + * at a fixed depth, expressed as a fraction of gen_mx + */ +double shiftDepth = 0.5; + +/* compute granularity - number of rng evaluations per tree node */ +int computeGranularity = 1; + +/* display parameters */ +int debug = 0; +int verbose = 1; + +/*********************************************************** + * * + * FUNCTIONS * + * * + ***********************************************************/ + +/* fatal error */ +void uts_error(char *str) { + printf("*** Error: %s\n", str); + impl_abort(1); +} + +/* + * wall clock time + * for detailed accounting of work, this needs + * high resolution + */ +double uts_wctime() { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return (tv.tv_sec + 1E-9 * tv.tv_nsec); +} + +// Interpret 32 bit positive integer as value on [0,1) +double rng_toProb(int n) { + if (n < 0) { + printf("*** toProb: rand n = %d out of range\n", n); + } + return ((n < 0) ? 
0.0 : ((double)n) / 2147483648.0); +} + +void uts_initRoot(Node *root, int type) { + root->type = type; + root->height = 0; + root->numChildren = -1; // means not yet determined + rng_init(root->state.state, rootId); + + if (debug & 1) + printf("root node of type %d at %p\n", type, root); +} + +int uts_numChildren_bin(Node *parent) { + // distribution is identical everywhere below root + int v = rng_rand(parent->state.state); + double d = rng_toProb(v); + + return (d < nonLeafProb) ? nonLeafBF : 0; +} + +int uts_numChildren_geo(Node *parent) { + double b_i = b_0; + int depth = parent->height; + int numChildren, h; + double p, u; + + // use shape function to compute target b_i + if (depth > 0) { + switch (shape_fn) { + + // expected size polynomial in depth + case EXPDEC: + b_i = b_0 * pow((double)depth, -log(b_0) / log((double)gen_mx)); + break; + + // cyclic tree size + case CYCLIC: + if (depth > 5 * gen_mx) { + b_i = 0.0; + break; + } + b_i = pow(b_0, sin(2.0 * 3.141592653589793 * (double)depth / (double)gen_mx)); + break; + + // identical distribution at all nodes up to max depth + case FIXED: + b_i = (depth < gen_mx) ? b_0 : 0; + break; + + // linear decrease in b_i + case LINEAR: + default: + b_i = b_0 * (1.0 - (double)depth / (double)gen_mx); + break; + } + } + + // given target b_i, find prob p so expected value of + // geometric distribution is b_i. 
+ p = 1.0 / (1.0 + b_i); + + // get uniform random number on [0,1) + h = rng_rand(parent->state.state); + u = rng_toProb(h); + + // max number of children at this cumulative probability + // (from inverse geometric cumulative density function) + numChildren = (int)floor(log(1 - u) / log(1 - p)); + + return numChildren; +} + +int uts_numChildren(Node *parent) { + int numChildren = 0; + + /* Determine the number of children */ + switch (type) { + case BIN: + if (parent->height == 0) + numChildren = (int)floor(b_0); + else + numChildren = uts_numChildren_bin(parent); + break; + + case GEO: + numChildren = uts_numChildren_geo(parent); + break; + + case HYBRID: + if (parent->height < shiftDepth * gen_mx) + numChildren = uts_numChildren_geo(parent); + else + numChildren = uts_numChildren_bin(parent); + break; + case BALANCED: + if (parent->height < gen_mx) + numChildren = (int)b_0; + break; + default: + uts_error("parTreeSearch(): Unknown tree type"); + } + + // limit number of children + // only a BIN root can have more than MAXNUMCHILDREN + if (parent->height == 0 && parent->type == BIN) { + int rootBF = (int)ceil(b_0); + if (numChildren > rootBF) { + printf("*** Number of children of root truncated from %d to %d\n", numChildren, rootBF); + numChildren = rootBF; + } + } else if (type != BALANCED) { + if (numChildren > MAXNUMCHILDREN) { + printf("*** Number of children truncated from %d to %d\n", numChildren, MAXNUMCHILDREN); + numChildren = MAXNUMCHILDREN; + } + } + + return numChildren; +} + +int uts_childType(Node *parent) { + switch (type) { + case BIN: + return BIN; + case GEO: + return GEO; + case HYBRID: + if (parent->height < shiftDepth * gen_mx) + return GEO; + else + return BIN; + case BALANCED: + return BALANCED; + default: + uts_error("uts_get_childtype(): Unknown tree type"); + return -1; + } +} + +// construct string with all parameter settings +int uts_paramsToStr(char *strBuf, int ind) { + // version + execution model + ind += sprintf(strBuf + ind, "UTS 
- Unbalanced Tree Search %s (%s)\n", UTS_VERSION, impl_getName()); + + // tree type + ind += sprintf(strBuf + ind, "Tree type: %d (%s)\n", type, uts_trees_str[type]); + + // tree shape parameters + ind += sprintf(strBuf + ind, "Tree shape parameters:\n"); + ind += sprintf(strBuf + ind, " root branching factor b_0 = %.1f, root seed = %d\n", b_0, rootId); + + if (type == GEO || type == HYBRID) { + ind += sprintf(strBuf + ind, + " GEO parameters: gen_mx = %d, shape function = %d (%s)\n", + gen_mx, + shape_fn, + uts_geoshapes_str[shape_fn]); + } + + if (type == BIN || type == HYBRID) { + double q = nonLeafProb; + int m = nonLeafBF; + double es = (1.0 / (1.0 - q * m)); + ind += + sprintf(strBuf + ind, " BIN parameters: q = %f, m = %d, E(n) = %f, E(s) = %.2f\n", q, m, q * m, es); + } + + if (type == HYBRID) { + ind += sprintf( + strBuf + ind, " HYBRID: GEO from root to depth %d, then BIN\n", (int)ceil(shiftDepth * gen_mx)); + } + + if (type == BALANCED) { + ind += sprintf(strBuf + ind, " BALANCED parameters: gen_mx = %d\n", gen_mx); + ind += sprintf(strBuf + ind, + " Expected size: %llu nodes, %llu leaves\n", + (counter_t)((pow(b_0, gen_mx + 1) - 1.0) / (b_0 - 1.0)) /* geometric series */, + (counter_t)pow(b_0, gen_mx)); + } + + // random number generator + ind += sprintf(strBuf + ind, "Random number generator: "); + ind = rng_showtype(strBuf, ind); + ind += sprintf(strBuf + ind, "\nCompute granularity: %d\n", computeGranularity); + + return ind; +} + +// show parameter settings +void uts_printParams() { + char strBuf[5000] = ""; + int ind = 0; + + if (verbose > 0) { + ind = uts_paramsToStr(strBuf, ind); + ind = impl_paramsToStr(strBuf, ind); + printf("%s\n", strBuf); + } +} + +void uts_parseParams(int argc, char *argv[]) { + int i = 1; + int err = -1; + while (i < argc && err == -1) { + if (argv[i][0] == '-' && argv[i][1] == 'h') { + uts_helpMessage(); + impl_abort(0); + + } else if (argv[i][0] != '-' || strlen(argv[i]) != 2 || argc <= i + 1) { + err = i; + break; + } + 
+ // Matched by implementation -- return 0 on success + // This is fragile, don't override parameters in impl_parseParam()! + if (!impl_parseParam(argv[i], argv[i + 1])) { + i += 2; + continue; + } + + switch (argv[i][1]) { + case 'q': + nonLeafProb = atof(argv[i + 1]); + break; + case 'm': + nonLeafBF = atoi(argv[i + 1]); + break; + case 'r': + rootId = atoi(argv[i + 1]); + break; + case 'x': + debug = atoi(argv[i + 1]); + break; + case 'v': + verbose = atoi(argv[i + 1]); + break; + case 't': + type = (tree_t)atoi(argv[i + 1]); + if (type != BIN && type != GEO && type != HYBRID && type != BALANCED) + err = i; + break; + case 'a': + shape_fn = (geoshape_t)atoi(argv[i + 1]); + if (shape_fn > FIXED) + err = i; + break; + case 'b': + b_0 = atof(argv[i + 1]); + break; + case 'd': + gen_mx = atoi(argv[i + 1]); + break; + case 'f': + shiftDepth = atof(argv[i + 1]); + break; + case 'g': + computeGranularity = max(1, atoi(argv[i + 1])); + break; + default: + err = i; + } + + if (err != -1) + break; + + i += 2; + } + + if (err != -1) { + printf("Unrecognized parameter or incorrect/missing value: '%s %s'\n", + argv[i], + (i + 1 < argc) ? 
argv[i + 1] : "[none]"); + printf("Try -h for help.\n"); + impl_abort(4); + } +} + +void uts_helpMessage() { + printf(" UTS - Unbalanced Tree Search %s (%s)\n\n", UTS_VERSION, impl_getName()); + printf(" usage: uts-bin [parameter value] ...\n\n"); + printf(" parameter type description\n"); + printf(" ==== ==== =========================================\n"); + printf("\n Benchmark Parameters:\n"); + printf(" -t int tree type (0: BIN, 1: GEO, 2: HYBRID, 3: BALANCED)\n"); + printf(" -b dble root branching factor\n"); + printf(" -r int root seed 0 <= r < 2^31 \n"); + printf(" -a int GEO: tree shape function \n"); + printf(" -d int GEO, BALANCED: tree depth\n"); + printf(" -q dble BIN: probability of non-leaf node\n"); + printf(" -m int BIN: number of children for non-leaf node\n"); + printf(" -f dble HYBRID: fraction of depth for GEO -> BIN transition\n"); + printf(" -g int compute granularity: number of rng_spawns per node\n"); + printf(" -v int nonzero to set verbose output\n"); + printf(" -x int debug level\n"); + + // Get help message from the implementation + printf("\n Additional Implementation Parameters:\n"); + impl_helpMessage(); + printf("\n"); +} + +void uts_showStats( + int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth) { + // summarize execution info for machine consumption + if (verbose == 0) { + printf("%4d %7.3f %9llu %7.0llu %7.0llu %d %d %.2f %d %d %1d %f %3d\n", + nPes, + walltime, + nNodes, + (long long)(nNodes / walltime), + (long long)((nNodes / walltime) / nPes), + chunkSize, + type, + b_0, + rootId, + gen_mx, + shape_fn, + nonLeafProb, + nonLeafBF); + } + + // summarize execution info for human consumption + else { + printf("Tree size = %llu, tree depth = %llu, num leaves = %llu (%.2f%%)\n", + nNodes, + maxDepth, + nLeaves, + nLeaves / (float)nNodes * 100.0); + printf("Wallclock time = %.3f sec, performance = %.0f nodes/sec (%.0f nodes/sec per PE)\n\n", + walltime, + (nNodes / walltime), + (nNodes / 
walltime / nPes));
+  }
+}
+
+// --------------------------------------------------------------------- //
+
+// The name of this implementation
+char *impl_getName() { return "Sequential Recursive Search"; }
+
+int impl_paramsToStr(char *strBuf, int ind) {
+  ind += sprintf(strBuf + ind, "Execution strategy: %s\n", impl_getName());
+  return ind;
+}
+
+// Not using UTS command line params: ignore both arguments, return non-success
+int impl_parseParam(char *param, char *value) {
+  (void)param;
+  (void)value;
+  return 1;
+}
+
+void impl_helpMessage() { printf(" none.\n"); }
+
+void impl_abort(int err) { exit(err); }
\ No newline at end of file
diff --git a/benchmark/lib/CMakeLists.txt b/benchmark/lib/CMakeLists.txt
new file mode 100644
index 000000000..c745bd439
--- /dev/null
+++ b/benchmark/lib/CMakeLists.txt
@@ -0,0 +1,35 @@
+add_library(benchmark_common)
+
+target_compile_features(benchmark_common PUBLIC cxx_std_26)
+
+target_sources(benchmark_common
+  PRIVATE
+    uts.cpp
+  PUBLIC
+    FILE_SET HEADERS
+    BASE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}"
+    FILES
+      bench.hpp
+      fib.hpp
+      fold.hpp
+      heat.hpp
+      integrate.hpp
+      knapsack.hpp
+      macros.hpp
+      mandelbrot.hpp
+      matmul.hpp
+      nqueens.hpp
+      primes.hpp
+      quicksort.hpp
+      scan.hpp
+      skynet.hpp
+      uts.hpp
+)
+
+add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../external/uts" external/uts)
+
+target_link_libraries(benchmark_common
+  PUBLIC
+    benchmark::benchmark
+    uts_c
+)
diff --git a/benchmark/lib/bench.hpp b/benchmark/lib/bench.hpp
new file mode 100644
index 000000000..360c79be1
--- /dev/null
+++ b/benchmark/lib/bench.hpp
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstdint>
+  #include <format>
+  #include <functional>
+#else
+import std;
+#endif
+
+namespace lf_bench {
+
+inline constexpr std::int64_t no_threads = 0;
+
+inline auto inverse_complexity(benchmark::IterationCount n) -> double { return 1.0 / static_cast<double>(n); }
+
+inline void report_threads(benchmark::State &state, std::int64_t threads) {
+  if (threads == no_threads) {
+    return;
+  }
+
+  
state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); +} + +// `bench` reports mismatches with a `std::format` call that formats both +// `result` and `expected`, so `Expected` and `std::invoke_result_t` must be +// formattable. +template +void bench(benchmark::State &state, std::int64_t threads, const Expected &expected, Check check, Fn fn) { + report_threads(state, threads); + + for (auto _ : state) { + auto result = std::invoke(fn); + + if (!std::invoke(check, result, expected)) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expected)); + break; + } + + benchmark::DoNotOptimize(result); + } +} + +template +void bench(benchmark::State &state, std::int64_t threads, const Expected &expected, Fn fn) { + bench(state, threads, expected, std::equal_to<>{}, fn); +} + +template +void bench(benchmark::State &state, const Expected &expected, Check check, Fn fn) { + bench(state, no_threads, expected, check, fn); +} + +template +void bench(benchmark::State &state, const Expected &expected, Fn fn) { + bench(state, no_threads, expected, fn); +} + +} // namespace lf_bench diff --git a/benchmark/lib/fib.hpp b/benchmark/lib/fib.hpp new file mode 100644 index 000000000..f21b9ce4c --- /dev/null +++ b/benchmark/lib/fib.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include +#else +import std; +#endif + +inline constexpr int fib_test = 8; +inline constexpr int fib_base = 37; + +/** + * @brief Non-recursive Fibonacci calculation + */ +constexpr auto fib_ref(std::int64_t n) -> std::int64_t { + + if (n < 2) { + return n; + } + + std::int64_t prev = 0; + std::int64_t curr = 1; + + for (std::int64_t i = 2; i <= n; ++i) { + std::int64_t next = prev + curr; + prev = curr; + curr = next; + } + + return curr; +} + +template +void run_fib(benchmark::State &state, std::int64_t threads, Fn fn) { + std::int64_t n = state.range(0); + std::int64_t expect = 
fib_ref(n); + + state.counters["n"] = static_cast(n); + + lf_bench::bench(state, threads, expect, [n, fn]() -> std::int64_t { + return std::invoke(fn, n); + }); +} + +template +void run_fib(benchmark::State &state, Fn fn) { + run_fib(state, lf_bench::no_threads, fn); +} diff --git a/benchmark/lib/fold.hpp b/benchmark/lib/fold.hpp new file mode 100644 index 000000000..64bb8b61a --- /dev/null +++ b/benchmark/lib/fold.hpp @@ -0,0 +1,111 @@ +#pragma once + +#include + +#include "macros.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include + #include + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr std::int64_t fold_test = 10; + +inline constexpr std::int64_t fold_1024 = 1'024; +inline constexpr std::int64_t fold_1024_base = fold_1024; +inline constexpr std::int64_t fold_1024_sq_base = fold_1024 * fold_1024; +inline constexpr std::int64_t fold_1024_cu_base = fold_1024 * fold_1024 * fold_1024; + +enum class fold_data_mode : char { memory, lazy }; +enum class fold_chunk_mode : char { explicit_one, deduced, fixed }; +enum class fold_projection_mode : char { sync, async }; + +template +constexpr auto fold_value(std::size_t index) -> T { + return static_cast(index % 4UZ); +} + +template +constexpr auto make_fold_range(std::size_t count) { + return std::views::iota(std::size_t{}, count) | std::views::transform([](std::size_t index) -> T { + return fold_value(index); + }); +} + +template +using fold_accum_t = std::conditional_t, double, std::int64_t>; + +template +constexpr auto expected_fold_result(std::size_t count) -> fold_accum_t { + auto groups = count / 4UZ; + auto remainder = count % 4UZ; + return static_cast>((groups * 6UZ) + ((remainder * (remainder - 1UZ)) / 2UZ)); +} + +template +auto fold_result_is_correct(fold_accum_t result, fold_accum_t expect) -> bool { + if constexpr (std::floating_point>) { + return std::abs(result - expect) <= 1e-6; + } else { + return result == expect; + } +} + +template 
+void run_fold_input(benchmark::State &state, std::int64_t threads, Fn fn) { + auto n = static_cast(state.range(0)); + auto expect = expected_fold_result(n); + + auto run = [&](auto const &range) -> void { + lf_bench::bench(state, threads, expect, fold_result_is_correct, [&]() -> fold_accum_t { + return std::invoke(fn, range); + }); + }; + + if constexpr (Data == fold_data_mode::memory) { + run(make_fold_range(n) | std::ranges::to>()); + } else { + run(make_fold_range(n)); + } + + state.SetItemsProcessed(state.iterations() * static_cast(n)); +} + +template +void run_fold_input(benchmark::State &state, Fn fn) { + run_fold_input(state, lf_bench::no_threads, fn); +} + +// Use alias for shorted names. +inline constexpr auto memory = fold_data_mode::memory; +inline constexpr auto lazy = fold_data_mode::lazy; +inline constexpr auto chunk_1 = fold_chunk_mode::explicit_one; +inline constexpr auto chunk_deduced = fold_chunk_mode::deduced; +inline constexpr auto chunk_fixed = fold_chunk_mode::fixed; +inline constexpr auto sync_proj = fold_projection_mode::sync; +inline constexpr auto async_proj = fold_projection_mode::async; + +using int32 = std::int32_t; +using float32 = float; + +#define LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name, ...) \ + BENCH_ONE(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE(bench_fn, category, name, base, fold_1024 __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE(bench_fn, category, name, base, fold_1024_sq __VA_OPT__(, ) __VA_ARGS__) + +#define LF_FOLD_BENCH_SIZES(bench_fn, category, name, ...) \ + LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__) + +#define LF_FOLD_BENCH_SIZES_MT(bench_fn, category, name, ...) 
\ + BENCH_ONE_MT(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE_MT(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/lib/heat.hpp b/benchmark/lib/heat.hpp new file mode 100644 index 000000000..c50c4ec13 --- /dev/null +++ b/benchmark/lib/heat.hpp @@ -0,0 +1,93 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr std::size_t heat_test = 64; +inline constexpr std::size_t heat_base = 1024; + +inline constexpr std::size_t heat_iters = 16; + +// Initialise grid with a fixed analytic profile (boundaries clamped). +inline auto heat_make_grid(std::size_t n) -> std::vector { + std::vector g(n * n); + for (std::size_t y = 0; y < n; ++y) { + for (std::size_t x = 0; x < n; ++x) { + double dx = static_cast(x) / static_cast(n - 1) - 0.5; + double dy = static_cast(y) / static_cast(n - 1) - 0.5; + g[y * n + x] = std::exp(-8.0 * (dx * dx + dy * dy)); + } + } + return g; +} + +inline auto heat_matches(std::vector const &actual, std::vector const &expected) -> bool { + for (std::size_t i = 0; i < actual.size(); ++i) { + if (std::abs(actual[i] - expected[i]) > 1e-12) { + return false; + } + } + return true; +} + +inline void heat_jacobi_step(double const *src, double *dst, std::size_t n) { + for (std::size_t y = 1; y < n - 1; ++y) { + for (std::size_t x = 1; x < n - 1; ++x) { + std::size_t i = y * n + x; + dst[i] = 0.25 * (src[i - 1] + src[i + 1] + src[i - n] + src[i + n]); + } + } + for (std::size_t x = 0; x < n; ++x) { + dst[x] = src[x]; + dst[(n - 1) * n + x] = src[(n - 1) * n + x]; + } + for (std::size_t y = 0; y < n; ++y) { + dst[y * n] = src[y * n]; + dst[y * n + (n - 1)] = src[y * n + (n - 1)]; + } +} + +inline auto +heat_reference(std::vector initial, std::size_t n, std::size_t iters) -> std::vector { + std::vector scratch(initial.size()); + double *src = initial.data(); + double 
*dst = scratch.data(); + + for (std::size_t t = 0; t < iters; ++t) { + heat_jacobi_step(src, dst, n); + std::swap(src, dst); + } + + if (src == initial.data()) { + return initial; + } + return scratch; +} + +template +void run_heat(benchmark::State &state, Fn fn) { + auto n = static_cast(state.range(0)); + state.counters["n"] = static_cast(n); + state.counters["iters"] = static_cast(heat_iters); + + std::vector initial = heat_make_grid(n); + std::vector a(initial.size()); + std::vector b(initial.size()); + std::vector reference = heat_reference(initial, n, heat_iters); + + lf_bench::bench(state, true, [&]() -> bool { + a = initial; + std::invoke(fn, a.data(), b.data(), n, heat_iters); + benchmark::DoNotOptimize(a.data()); + return heat_matches((heat_iters % 2 == 0) ? a : b, reference); + }); +} diff --git a/benchmark/lib/integrate.hpp b/benchmark/lib/integrate.hpp new file mode 100644 index 000000000..836b4004f --- /dev/null +++ b/benchmark/lib/integrate.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include +#else +import std; +#endif + +inline constexpr std::int64_t integrate_test = 100; +inline constexpr std::int64_t integrate_base = 10'000; + +inline constexpr double integrate_epsilon = 1.0e-9; + +inline constexpr auto integrate_fn(double x) -> double { return (x * x + 1.0) * x; } + +inline constexpr auto integrate_exact(double a, double b) -> double { + auto indefinite = [](double x) { + return 0.25 * x * x * (x * x + 2); + }; + return indefinite(b) - indefinite(a); +} + +inline auto integrate_is_close(double result, double expect) -> bool { + return std::abs(result - expect) <= 1e-3 * std::abs(expect); +} + +template +void run_integrate(benchmark::State &state, Fn fn) { + std::int64_t n = state.range(0); + double upper = static_cast(n); + double expect = integrate_exact(0, upper); + + state.counters["n"] = static_cast(n); + + lf_bench::bench(state, expect, integrate_is_close, [upper, fn]() 
-> double { + return std::invoke(fn, upper); + }); +} diff --git a/benchmark/lib/knapsack.hpp b/benchmark/lib/knapsack.hpp new file mode 100644 index 000000000..dc5077b1f --- /dev/null +++ b/benchmark/lib/knapsack.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr std::size_t knapsack_test = 16; +inline constexpr std::size_t knapsack_base = 28; + +struct knapsack_item { + int weight; + int value; +}; + +struct knapsack_problem { + std::vector items; // sorted by value/weight desc + int capacity; +}; + +inline auto knapsack_make(std::size_t n, std::uint64_t seed = 0xCAFEBABE) -> knapsack_problem { + std::mt19937_64 rng{seed}; + std::uniform_int_distribution dw(1, 100); + std::uniform_int_distribution dv(1, 100); + + std::vector items(n); + int total = 0; + for (auto &it : items) { + it.weight = dw(rng); + it.value = dv(rng); + total += it.weight; + } + + // Sort by value-density, descending, for a tight relaxation bound. + std::sort(items.begin(), items.end(), [](knapsack_item a, knapsack_item b) { + return static_cast(a.value) * b.weight > static_cast(b.value) * a.weight; + }); + + return knapsack_problem{std::move(items), total / 2}; +} + +// Exact optimum via O(n * capacity) DP, used as oracle. 
+inline auto knapsack_dp_optimum(knapsack_problem const &p) -> int { + std::vector dp(static_cast(p.capacity) + 1, 0); + for (auto const &it : p.items) { + for (int c = p.capacity; c >= it.weight; --c) { + auto idx = static_cast(c); + auto idx_prev = static_cast(c - it.weight); + int cand = dp[idx_prev] + it.value; + if (cand > dp[idx]) { + dp[idx] = cand; + } + } + } + return dp[static_cast(p.capacity)]; +} + +template +void run_knapsack(benchmark::State &state, Fn fn) { + auto n = static_cast(state.range(0)); + auto problem = knapsack_make(n); + int expect = knapsack_dp_optimum(problem); + + state.counters["n"] = static_cast(n); + state.counters["capacity"] = problem.capacity; + + lf_bench::bench(state, expect, [problem = std::move(problem), fn]() -> int { + return std::invoke(fn, problem); + }); +} diff --git a/benchmark/lib/macros.hpp b/benchmark/lib/macros.hpp new file mode 100644 index 000000000..3e561c70e --- /dev/null +++ b/benchmark/lib/macros.hpp @@ -0,0 +1,172 @@ +#pragma once + +#include + +#include "bench.hpp" + +// Use `import std;` by default. Textually `#include ` drags in +// ``, which triggers a libc++ 22 link-time bug (undefined +// `__atomic_unique_lock::__set_locked_bit`) in TUs that later instantiate +// anything touching std::stop_*. Targets that can't use modules (e.g. the +// openmp benchmarks, see benchmark/src/openmp/CMakeLists.txt) define +// LF_BENCH_NO_IMPORT_STD and get textual includes instead. +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include +#else +import std; +#endif + +#define BENCH_GET_FN(bench_fn, ...) 
bench_fn __VA_OPT__(<__VA_ARGS__>) + +namespace lf_bench { + +inline void bench_thread_args(benchmark::Benchmark *bench, auto make_args) { + unsigned hw = std::max(1U, std::thread::hardware_concurrency()); + for (unsigned t : {1U, 2U, 4U, 6U, 8U, 12U, 16U, 24U, 32U, 48U, 64U, 96U}) { + if (t > hw) { + return; + } + make_args(bench, t); + } +} + +inline auto sanitize(std::string s) -> std::string { + s.erase(std::remove(s.begin(), s.end(), ' '), s.end()); + return s; +} + +inline auto +format_name(std::string mode, std::string category, std::string name, std::string args) -> std::string { + std::string res = sanitize(mode) + "/" + sanitize(category) + "/" + sanitize(name); + std::string s_args = sanitize(args); + if (!s_args.empty()) { + res += "/" + s_args; + } + return res; +} + +inline void setup_single(benchmark::Benchmark *b, std::int64_t size) { b->Arg(size)->UseRealTime(); } + +inline void setup_mt(benchmark::Benchmark *b, std::int64_t size) { + b->Apply([size](benchmark::Benchmark *bm) { + bench_thread_args(bm, [size](benchmark::Benchmark *inner_b, unsigned t) { + inner_b->Args({size, static_cast(t)}); + }); + }) + ->Complexity(inverse_complexity) + ->UseRealTime(); +} + +inline void setup_uts_mt(benchmark::Benchmark *b) { + b->Apply([](benchmark::Benchmark *bm) { + bench_thread_args(bm, [](benchmark::Benchmark *inner_b, unsigned t) { + inner_b->Arg(static_cast(t)); + }); + }) + ->Complexity(inverse_complexity) + ->UseRealTime(); +} + +} // namespace lf_bench + +// --- Standard Benchmarks --- + +#define BENCH_ONE_WITH_ID(id, bench_fn, category, name, mode, prefix, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__), \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)); \ + lf_bench::setup_single(b, prefix##_##mode); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define BENCH_ONE_HIDDEN(id, ...) 
BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define BENCH_ONE(bench_fn, category, name, mode, prefix, ...) \ + BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define BENCH_ALL(bench_fn, category, name, prefix, ...) \ + BENCH_ONE(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__) + +// --- Multi-Threaded Benchmarks --- + +#define BENCH_ONE_MT_WITH_ID(id, bench_fn, category, name, mode, prefix, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__), \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)); \ + lf_bench::setup_mt(b, prefix##_##mode); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define BENCH_ONE_MT_HIDDEN(id, ...) BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define BENCH_ONE_MT(bench_fn, category, name, mode, prefix, ...) \ + BENCH_ONE_MT_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define BENCH_ALL_MT(bench_fn, category, name, prefix, ...) \ + BENCH_ONE_MT(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE_MT(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__) + +// --- UTS Benchmarks --- + +#define UTS_BENCH_ONE_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark( \ + lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__), \ + [=](benchmark::State &state) { \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id); \ + }); \ + b->UseRealTime(); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define UTS_BENCH_ONE_HIDDEN(id, ...) 
UTS_BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define UTS_BENCH_ONE(bench_fn, category, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define UTS_BENCH_ALL(bench_fn, category, ...) \ + UTS_BENCH_ONE(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__) + +// --- UTS Multi-Threaded Benchmarks --- + +#define UTS_BENCH_ONE_MT_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark( \ + lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__), \ + [=](benchmark::State &state) { \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id); \ + }); \ + lf_bench::setup_uts_mt(b); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define UTS_BENCH_ONE_MT_HIDDEN(id, ...) UTS_BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define UTS_BENCH_ONE_MT(bench_fn, category, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_MT_HIDDEN( \ + __COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define UTS_BENCH_ALL_MT(bench_fn, category, ...) 
\ + UTS_BENCH_ONE_MT(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/lib/mandelbrot.hpp b/benchmark/lib/mandelbrot.hpp new file mode 100644 index 000000000..bf312f172 --- /dev/null +++ b/benchmark/lib/mandelbrot.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include +#else +import std; +#endif + +inline constexpr int mandelbrot_test = 128; +inline constexpr int mandelbrot_base = 1024; + +inline constexpr int mandelbrot_max_iter = 256; + +inline constexpr double mandelbrot_x_min = -2.0; +inline constexpr double mandelbrot_x_max = 1.0; +inline constexpr double mandelbrot_y_min = -1.5; +inline constexpr double mandelbrot_y_max = 1.5; + +inline constexpr auto mandelbrot_pixel(int px, int py, int n) -> int { + double cr = mandelbrot_x_min + (mandelbrot_x_max - mandelbrot_x_min) * px / n; + double ci = mandelbrot_y_min + (mandelbrot_y_max - mandelbrot_y_min) * py / n; + + double zr = 0; + double zi = 0; + int iter = 0; + + while (iter < mandelbrot_max_iter && zr * zr + zi * zi <= 4.0) { + double zr_new = zr * zr - zi * zi + cr; + zi = 2 * zr * zi + ci; + zr = zr_new; + ++iter; + } + + return iter; +} + +inline auto mandelbrot_checksum(int n) -> std::uint64_t { + std::uint64_t checksum = 0; + for (int py = 0; py < n; ++py) { + for (int px = 0; px < n; ++px) { + checksum += static_cast(mandelbrot_pixel(px, py, n)); + } + } + return checksum; +} + +template +void run_mandelbrot(benchmark::State &state, Fn 
fn) { + int n = static_cast(state.range(0)); + std::uint64_t expect = mandelbrot_checksum(n); + + state.counters["n"] = n; + + lf_bench::bench(state, expect, [n, fn]() -> std::uint64_t { + return std::invoke(fn, n); + }); +} diff --git a/benchmark/lib/matmul.hpp b/benchmark/lib/matmul.hpp new file mode 100644 index 000000000..e51e07e06 --- /dev/null +++ b/benchmark/lib/matmul.hpp @@ -0,0 +1,123 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr unsigned matmul_test = 64; +inline constexpr unsigned matmul_base = 1024; + +inline constexpr unsigned strassen_test = 64; +inline constexpr unsigned strassen_base = 1024; + +inline constexpr unsigned matmul_basecase = 32; + +static_assert(std::has_single_bit(matmul_test)); +static_assert(std::has_single_bit(matmul_base)); + +struct matmul_args { + std::unique_ptr A; + std::unique_ptr B; + std::unique_ptr C; + std::unique_ptr ref; + unsigned n; +}; + +inline auto matmul_init(unsigned n, std::uint64_t seed = 0xC0FFEE) -> matmul_args { + + matmul_args args{ + std::make_unique(static_cast(n) * n), + std::make_unique(static_cast(n) * n), + std::make_unique(static_cast(n) * n), + std::make_unique(static_cast(n) * n), + n, + }; + + std::mt19937_64 rng{seed}; + std::uniform_real_distribution dist{0, 1}; + + for (std::size_t i = 0; i < static_cast(n) * n; ++i) { + args.A[i] = dist(rng); + args.B[i] = dist(rng); + args.C[i] = 0; + args.ref[i] = 0; + } + + return args; +} + +inline void matmul_zero(float *C, unsigned n) { + for (std::size_t i = 0; i < static_cast(n) * n; ++i) { + C[i] = 0; + } +} + +inline auto matmul_max_relative_error(float const *A, float const *B, unsigned n) -> float { + constexpr float epsilon = 1e-8F; + float error = 0; + for (std::size_t i = 0; i < static_cast(n) * n; ++i) { + float diff = std::abs(A[i] - B[i]) / std::max(std::abs(A[i]), epsilon); + if 
(diff > error) { + error = diff; + } + } + return error; +} + +inline void matmul_iter(float const *A, float const *B, float *C, unsigned n) { + for (unsigned i = 0; i < n; ++i) { + for (unsigned k = 0; k < n; ++k) { + float c = 0; + for (unsigned j = 0; j < n; ++j) { + c += A[i * n + j] * B[j * n + k]; + } + C[i * n + k] = c; + } + } +} + +template +inline void matmul_basecase_multiply(float const *A, float const *B, float *R, unsigned n, unsigned s) { + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + float sum = 0; + for (unsigned k = 0; k < n; ++k) { + sum += A[i * s + k] * B[k * s + j]; + } + if constexpr (Add) { + R[i * s + j] += sum; + } else { + R[i * s + j] = sum; + } + } + } +} + +inline auto matmul_error_is_acceptable(float err, float max_err) -> bool { return err <= max_err; } + +template +void run_matmul(benchmark::State &state, float max_relative_error, Fn fn) { + auto n = static_cast(state.range(0)); + state.counters["n"] = n; + + auto args = matmul_init(n); + matmul_iter(args.A.get(), args.B.get(), args.ref.get(), n); + + lf_bench::bench(state, max_relative_error, matmul_error_is_acceptable, [&]() -> float { + matmul_zero(args.C.get(), n); + std::invoke(fn, args.A.get(), args.B.get(), args.C.get(), n); + benchmark::DoNotOptimize(args.C.get()); + return matmul_max_relative_error(args.ref.get(), args.C.get(), n); + }); +} diff --git a/benchmark/lib/nqueens.hpp b/benchmark/lib/nqueens.hpp new file mode 100644 index 000000000..088d1b973 --- /dev/null +++ b/benchmark/lib/nqueens.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr int nqueens_test = 8; +inline constexpr int nqueens_base = 14; + +inline constexpr std::array nqueens_answers = { + 0, + 1, + 0, + 0, + 2, + 10, + 4, + 40, + 92, + 352, + 724, + 2'680, + 14'200, + 73'712, + 365'596, + 2'279'184, + 14'772'512, + 95'815'104, + 666'090'624, + 
4'968'057'848, + 39'029'188'884, +}; + +inline auto queens_ok(int n, char const *a) -> bool { + for (int i = 0; i < n; ++i) { + char p = a[i]; + for (int j = i + 1; j < n; ++j) { + char q = a[j]; + if (q == p || q == p - (j - i) || q == p + (j - i)) { + return false; + } + } + } + return true; +} + +template +void run_nqueens(benchmark::State &state, Fn fn) { + int n = static_cast(state.range(0)); + std::int64_t expect = nqueens_answers.at(static_cast(n)); + + state.counters["n"] = n; + + std::vector board(static_cast(n)); + + lf_bench::bench(state, expect, [n, &board, fn]() -> std::int64_t { + return std::invoke(fn, n, board.data()); + }); +} diff --git a/benchmark/lib/primes.hpp b/benchmark/lib/primes.hpp new file mode 100644 index 000000000..b0154f8ba --- /dev/null +++ b/benchmark/lib/primes.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include +#else +import std; +#endif + +inline constexpr std::int64_t primes_test = 100'000; +inline constexpr std::int64_t primes_base = 10'000'000; + +// 6k +/- 1 trial division, see https://en.wikipedia.org/wiki/Primality_test +inline constexpr auto is_prime(std::int64_t n) -> bool { + if (n == 2 || n == 3) { + return true; + } + if (n <= 1 || n % 2 == 0 || n % 3 == 0) { + return false; + } + for (std::int64_t i = 5; i * i <= n; i += 6) { + if (n % i == 0 || n % (i + 2) == 0) { + return false; + } + } + return true; +} + +// Prime-counting function pi(n) reference values for the configured sizes. 
+inline constexpr auto primes_expected(std::int64_t n) -> std::int64_t { + if (n == primes_test) { + return 9592; // pi(1e5) + } + if (n == primes_base) { + return 664'579; // pi(1e7) + } + return -1; +} + +inline auto primes_count_is_correct(std::int64_t result, std::int64_t expect) -> bool { + return expect < 0 || result == expect; +} + +template +void run_primes(benchmark::State &state, Fn fn) { + std::int64_t n = state.range(0); + std::int64_t expect = primes_expected(n); + + state.counters["n"] = static_cast(n); + + lf_bench::bench(state, lf_bench::no_threads, expect, primes_count_is_correct, [n, fn]() -> std::int64_t { + return std::invoke(fn, n); + }); +} diff --git a/benchmark/lib/quicksort.hpp b/benchmark/lib/quicksort.hpp new file mode 100644 index 000000000..afaeb04af --- /dev/null +++ b/benchmark/lib/quicksort.hpp @@ -0,0 +1,49 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr std::size_t quicksort_test = 10'000; +inline constexpr std::size_t quicksort_base = 10'000'000; + +inline constexpr std::size_t quicksort_basecase = 32; + +inline auto +quicksort_make_input(std::size_t n, std::uint64_t seed = 0xDEADBEEF) -> std::vector { + std::vector out(n); + std::mt19937_64 rng{seed}; + std::uniform_int_distribution dist; + for (auto &v : out) { + v = dist(rng); + } + return out; +} + +template +void run_quicksort(benchmark::State &state, Fn fn) { + auto n = static_cast(state.range(0)); + state.counters["n"] = static_cast(n); + + std::vector source = quicksort_make_input(n); + std::vector reference = source; + std::sort(reference.begin(), reference.end()); + + std::vector work(n); + + lf_bench::bench(state, true, [&]() -> bool { + work = source; + std::invoke(fn, work.data(), work.data() + work.size()); + benchmark::DoNotOptimize(work.data()); + return work == reference; + }); +} diff --git a/benchmark/lib/scan.hpp 
b/benchmark/lib/scan.hpp new file mode 100644 index 000000000..5d6d471dc --- /dev/null +++ b/benchmark/lib/scan.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include +#else +import std; +#endif + +inline constexpr std::size_t scan_test = 1'000; +inline constexpr std::size_t scan_base = 8'000; + +inline constexpr std::size_t scan_reps = 1'000; + +inline auto scan_make_vec(std::size_t n) -> std::vector { + std::vector out(n); + unsigned count = 0; + for (auto &elem : out) { + elem = ++count; + } + return out; +} + +template +void run_scan(benchmark::State &state, Fn fn) { + auto n = static_cast(state.range(0)); + state.counters["n"] = static_cast(n); + state.counters["reps"] = static_cast(scan_reps); + + std::vector in = scan_make_vec(n); + std::vector out(n); + + // For 1..n the inclusive scan's last element equals n*(n+1)/2 (mod 2^32). + unsigned expect = static_cast(static_cast(n) * (n + 1) / 2); + + lf_bench::bench(state, expect, [&]() -> unsigned { + std::invoke(fn, in, out, scan_reps); + benchmark::DoNotOptimize(out.data()); + return out.back(); + }); +} diff --git a/benchmark/lib/skynet.hpp b/benchmark/lib/skynet.hpp new file mode 100644 index 000000000..faa7a1b86 --- /dev/null +++ b/benchmark/lib/skynet.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "bench.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include +#else +import std; +#endif + +inline constexpr int skynet_branching = 10; + +// Tree depth: total leaves = branching ** depth. 
+inline constexpr int skynet_test = 4; // 10^4 = 10'000 leaves +inline constexpr int skynet_base = 6; // 10^6 = 1'000'000 leaves + +inline constexpr auto skynet_leaves(int depth) -> std::int64_t { + std::int64_t out = 1; + for (int i = 0; i < depth; ++i) { + out *= skynet_branching; + } + return out; +} + +inline constexpr auto skynet_expected(int depth) -> std::int64_t { + std::int64_t leaves = skynet_leaves(depth); + return leaves * (leaves - 1) / 2; +} + +template +void run_skynet(benchmark::State &state, Fn fn) { + int depth = static_cast(state.range(0)); + std::int64_t expect = skynet_expected(depth); + + state.counters["depth"] = depth; + state.counters["leaves"] = static_cast(skynet_leaves(depth)); + + lf_bench::bench(state, expect, [depth, fn]() -> std::int64_t { + return std::invoke(fn, 0, depth); + }); +} diff --git a/benchmark/lib/uts.cpp b/benchmark/lib/uts.cpp new file mode 100644 index 000000000..90964d48c --- /dev/null +++ b/benchmark/lib/uts.cpp @@ -0,0 +1,159 @@ +#include "uts.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include +#else +import std; +#endif + +namespace { + +void reset_uts() { + type = GEO; + b_0 = 4.0; + rootId = 0; + nonLeafBF = 4; + nonLeafProb = 15.0 / 64.0; + gen_mx = 6; + shape_fn = LINEAR; + shiftDepth = 0.5; + computeGranularity = 1; + debug = 0; + verbose = 1; +} + +// (T1 mini) Geometric +void setup_t1_mini() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 7; + b_0 = 4; + rootId = 19; +} + +// (T1) Geometric +void setup_t1() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 10; + b_0 = 4; + rootId = 19; +} + +// (T1L) Geometric +void setup_t1l() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 13; + b_0 = 4; + rootId = 29; +} + +// (T1XXL) +void setup_t1xxl() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 15; + b_0 = 4; + rootId = 19; +} + +// (T3 mini) +void setup_t3_mini() { + reset_uts(); + 
type = static_cast(0); + b_0 = 20; + nonLeafBF = 8; + nonLeafProb = 0.124875; + rootId = 42; +} + +// (T3) Binomial +void setup_t3() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 8; + nonLeafProb = 0.124875; + rootId = 42; +} + +// (T3L) Binomial +void setup_t3l() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 5; + nonLeafProb = 0.200014; + rootId = 7; +} + +// (T3XXL) Binomial +void setup_t3xxl() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 2; + nonLeafProb = 0.499995; + rootId = 316; +} + +} // namespace + +void setup_tree(uts_tree tree) { + switch (tree) { + case uts_t1_mini: + setup_t1_mini(); + break; + case uts_t1: + setup_t1(); + break; + case uts_t1l: + setup_t1l(); + break; + case uts_t1xxl: + setup_t1xxl(); + break; + case uts_t3_mini: + setup_t3_mini(); + break; + case uts_t3: + setup_t3(); + break; + case uts_t3l: + setup_t3l(); + break; + case uts_t3xxl: + setup_t3xxl(); + break; + default: + std::terminate(); + } +} + +auto expected_result(uts_tree tree) -> result { + switch (tree) { + case uts_t1_mini: + return {.maxdepth = 7, .size = 63914, .leaves = 51124}; + case uts_t1: + return {.maxdepth = 10, .size = 4130071, .leaves = 3305118}; + case uts_t1l: + return {.maxdepth = 13, .size = 102181082, .leaves = 81746377}; + case uts_t1xxl: + return {.maxdepth = 15, .size = 4230646601, .leaves = 3384495738}; + case uts_t3_mini: + return {.maxdepth = 67, .size = 6213, .leaves = 5438}; + case uts_t3: + return {.maxdepth = 1572, .size = 4112897, .leaves = 3599034}; + case uts_t3l: + return {.maxdepth = 17844, .size = 111345631, .leaves = 89076904}; + case uts_t3xxl: + return {.maxdepth = 99049, .size = 2793220501, .leaves = 1396611250}; + default: + std::terminate(); + } +} diff --git a/benchmark/lib/uts.hpp b/benchmark/lib/uts.hpp new file mode 100644 index 000000000..61e4361dd --- /dev/null +++ b/benchmark/lib/uts.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include + +#include "bench.hpp" + 
+// Include the C UTS library header first (it defines max/min macros that would +// clash with std::max/std::min after import std). +#include "uts/uts.h" + +#undef max +#undef min + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include +#else +import std; +#endif + +struct result { + counter_t maxdepth; + counter_t size; + counter_t leaves; + auto operator<=>(const result &) const = default; +}; + +template <> +struct std::formatter : std::formatter { + auto format(const result &r, auto &ctx) const { + return std::formatter::format( + std::format("{{maxdepth={}, size={}, leaves={}}}", r.maxdepth, r.size, r.leaves), ctx); + } +}; + +struct pair { + result res; + Node child; +}; + +enum uts_tree : char { + uts_t1_mini, // Geometric [fixed], ~64K nodes (test only) + uts_t1, // Geometric [fixed], ~4M nodes + uts_t1l, // Geometric [fixed], ~102M nodes + uts_t1xxl, // Geometric [fixed], ~4.2B nodes + uts_t3_mini, // Binomial, ~6K nodes (test only) + uts_t3, // Binomial, ~4M nodes + uts_t3l, // Binomial, ~111M nodes + uts_t3xxl, // Binomial, ~2.8B nodes +}; + +void setup_tree(uts_tree tree); + +auto expected_result(uts_tree tree) -> result; + +template +void run_uts(benchmark::State &state, uts_tree tree, std::int64_t threads, Fn fn) { + setup_tree(tree); + auto expect = expected_result(tree); + + lf_bench::bench(state, threads, expect, [fn]() -> result { + Node root; + uts_initRoot(&root, type); + return std::invoke(fn, &root); + }); +} + +template +void run_uts(benchmark::State &state, uts_tree tree, Fn fn) { + run_uts(state, tree, lf_bench::no_threads, fn); +} diff --git a/benchmark/src/baremetal/CMakeLists.txt b/benchmark/src/baremetal/CMakeLists.txt new file mode 100644 index 000000000..ea6f1e69f --- /dev/null +++ b/benchmark/src/baremetal/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(baremetal_benchmarks) + +target_sources(baremetal_benchmarks PRIVATE fib.cpp) + +target_link_libraries(baremetal_benchmarks PUBLIC benchmark_common libfork::libfork) diff --git 
a/benchmark/src/baremetal/fib.cpp b/benchmark/src/baremetal/fib.cpp new file mode 100644 index 000000000..b2bd9798a --- /dev/null +++ b/benchmark/src/baremetal/fib.cpp @@ -0,0 +1,148 @@ +#include + +#include "fib.hpp" +#include "macros.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +// ==== Allocators ==== // + +[[nodiscard]] +inline auto fib_align_size(std::size_t n) -> std::size_t { + constexpr std::size_t align = __STDCPP_DEFAULT_NEW_ALIGNMENT__; + return (n + align - 1) & ~(align - 1); +} + +constinit inline thread_local std::byte *tls_bump_ptr = nullptr; + +struct task { + struct promise_type { + + static auto operator new(std::size_t sz) -> void * { + auto *prev = tls_bump_ptr; + tls_bump_ptr += fib_align_size(sz); + return prev; + } + + static auto operator delete(void *p, [[maybe_unused]] std::size_t sz) noexcept -> void { + tls_bump_ptr = std::bit_cast(p); + } + + auto get_return_object() -> task { return {std::coroutine_handle::from_promise(*this)}; } + + auto initial_suspend() -> std::suspend_always { return {}; } + + auto final_suspend() noexcept { + struct final_awaitable : std::suspend_always { + auto await_suspend(std::coroutine_handle h) noexcept -> std::coroutine_handle<> { + + std::coroutine_handle<> cont = h.promise().continuation; + + h.destroy(); + + if (cont) { + return cont; + } + + return std::noop_coroutine(); + } + }; + + return final_awaitable{}; + } + + void return_value(std::int64_t val) { *value = val; } + void unhandled_exception() { std::terminate(); } + + std::int64_t *value = nullptr; + std::coroutine_handle<> continuation = nullptr; + }; + + std::coroutine_handle coro; + + auto set(std::int64_t &out) -> task & { + coro.promise().value = &out; + return *this; + } + + auto await_ready() noexcept -> bool { return false; } + + auto await_suspend(std::coroutine_handle<> h) -> std::coroutine_handle { + coro.promise().continuation = h; + return coro; + } + + void await_resume() noexcept {} +}; + +auto 
fib(std::int64_t n) -> task { + if (n <= 1) { + co_return n; + } + std::int64_t a = 0; + std::int64_t b = 0; + co_await fib(n - 2).set(a); + co_await fib(n - 1).set(b); + co_return a + b; +} + +template +void fib_coro_no_queue(benchmark::State &state) { + // 8MB stack + std::unique_ptr buffer = std::make_unique(1024 * 1024 * 8); + tls_bump_ptr = buffer.get(); + + run_fib(state, [](std::int64_t n) { + std::int64_t result = 0; + fib(n).set(result).coro.resume(); + return result; + }); + + if (tls_bump_ptr != buffer.get()) { + std::terminate(); // Stack leak + } +} + +// === Recursive with Deque overhead + +constinit inline thread_local lf::deque *tls_deque = nullptr; + +auto deque() -> lf::deque & { return *tls_deque; } + +auto fib_recursive_deque_impl(std::int64_t n) -> std::int64_t { + if (n <= 1) { + return n; + } + + // Emulate work item creation/scheduling overhead + deque().push(n); + std::int64_t a = fib_recursive_deque_impl(n - 2); + deque().pop(); + + std::int64_t b = fib_recursive_deque_impl(n - 1); + + return a + b; +} + +template +void fib_recursive_deque(benchmark::State &state) { + lf::deque deque{64}; + tls_deque = &deque; + + run_fib(state, fib_recursive_deque_impl); + + tls_deque = nullptr; +} + +} // namespace + +// Minimal coroutine, bump allocated (thread-local) stack +BENCH_ALL(fib_coro_no_queue, baremetal, coro, fib) + +BENCH_ALL(fib_recursive_deque, baremetal, deque, fib) diff --git a/benchmark/src/benchmarks.cpp b/benchmark/src/benchmarks.cpp new file mode 100644 index 000000000..d6cf26f54 --- /dev/null +++ b/benchmark/src/benchmarks.cpp @@ -0,0 +1 @@ +// Benchmarks are registered in the linked sub-libraries. 
diff --git a/benchmark/src/libfork/CMakeLists.txt b/benchmark/src/libfork/CMakeLists.txt new file mode 100644 index 000000000..1c444c60e --- /dev/null +++ b/benchmark/src/libfork/CMakeLists.txt @@ -0,0 +1,21 @@ +add_library(libfork_benchmarks) + +target_sources(libfork_benchmarks + PRIVATE + fib.cpp + fold.cpp + uts.cpp + switch_io_pool.cpp + switch_random.cpp + PRIVATE + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + helpers.hpp +) + +target_link_libraries(libfork_benchmarks + PUBLIC + benchmark_common + libfork::libfork +) diff --git a/benchmark/src/libfork/fib.cpp b/benchmark/src/libfork/fib.cpp new file mode 100644 index 000000000..c3bf502be --- /dev/null +++ b/benchmark/src/libfork/fib.cpp @@ -0,0 +1,79 @@ +#include + +#include "fib.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +struct fib { + template + static auto operator()(lf::env, std::int64_t n) -> lf::task { + if (n < 2) { + co_return n; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + auto sc = co_await lf::scope(); + + co_await sc.fork(&rhs, fib{}, n - 2); + co_await sc.call(&lhs, fib{}, n - 1); + + co_await sc.join(); + + co_return lhs + rhs; + } +}; + +template +void run(benchmark::State &state) { + + auto threads = static_cast(thread_count(state)); + Sch scheduler = make_scheduler(state); + + run_fib(state, threads, [&](std::int64_t n) -> std::int64_t { + return lf::schedule(scheduler, fib{}, n).get(); + }); +} + +} // namespace + +using lf::adapt_deque; +using lf::adapt_vector; + +using lf::adaptor_stack; +using lf::geometric_stack; +using lf::slab_stack; + +// -- Vector + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +LIBFORK_BENCH_ALL(run, fib, 
fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +// -- Deque + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL_MT(run, fib, fib, mono_busy_pool) +LIBFORK_BENCH_ALL_MT(run, fib, fib, poly_busy_pool) diff --git a/benchmark/src/libfork/fold.cpp b/benchmark/src/libfork/fold.cpp new file mode 100644 index 000000000..194968b32 --- /dev/null +++ b/benchmark/src/libfork/fold.cpp @@ -0,0 +1,102 @@ +#include + +#include "fold.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +namespace { + +template +struct sync_projection { + static constexpr auto operator()(T value) -> fold_accum_t { return static_cast>(value); } +}; + +template +struct async_projection { + template + static auto operator()(lf::env, T value) -> lf::task, Context> { + co_return static_cast>(value); + } +}; + +template +constexpr auto make_projection() { + if constexpr (Projection == fold_projection_mode::sync) { + return sync_projection{}; + } else { + return async_projection{}; + } +} + +template +auto run_fold(Sch &pool, Range &&range) -> fold_accum_t { + + auto projection = make_projection(); + + using std::ranges::begin; + using std::ranges::end; + + using diff_t = std::ranges::range_difference_t; + + if constexpr (Chunk == fold_chunk_mode::deduced) { + return *lf::schedule(pool, lf::fold, begin(range), end(range), std::plus<>{}, projection).get(); + } else { + constexpr diff_t chunk = Chunk == fold_chunk_mode::explicit_one ? 
diff_t{1} : diff_t{4096}; + return *lf::schedule(pool, lf::fold, begin(range), end(range), chunk, std::plus<>{}, projection).get(); + } +} + +template +void run(benchmark::State &state) { + + mono_busy_pool pool{1}; + + run_fold_input(state, [&](auto const &values) -> fold_accum_t { + return run_fold(pool, values); + }); +} + +template +void run_mt(benchmark::State &state) { + + auto threads = static_cast(thread_count(state)); + Sch pool = make_scheduler(state); + + run_fold_input(state, threads, [&](auto const &values) -> fold_accum_t { + return run_fold(pool, values); + }); +} + +} // namespace + +// Chunked/sync/sync versions to mirror serial benchmarks. +LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, int32) +LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32) +LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, int32) +LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32) + +// Compare specialised for sync/async (no largest size) +LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, sync_proj, float32) +LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, sync_proj, float32) +LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, async_proj, float32) +LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, async_proj, float32) + +#define MT(...) LF_FOLD_BENCH_SIZES_MT(__VA_ARGS__) + +// Multi-threaded float32/sync projection. 
+MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, mono_busy_pool) +MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, mono_busy_pool) +MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, poly_busy_pool) +MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, poly_busy_pool) diff --git a/benchmark/src/libfork/helpers.hpp b/benchmark/src/libfork/helpers.hpp new file mode 100644 index 000000000..8fbef45dc --- /dev/null +++ b/benchmark/src/libfork/helpers.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include + +#include "macros.hpp" + +import std; + +import libfork; + +template +auto thread_count(benchmark::State &state) -> std::size_t { + if constexpr (std::constructible_from) { + return static_cast(state.range(1)); + } else { + return 1; + } +} + +template +auto make_scheduler(benchmark::State &state) -> Sch { + if constexpr (std::constructible_from) { + return Sch{static_cast(state.range(1))}; + } else { + return Sch{}; + } +} + +using mono_busy_pool = lf::mono_busy_pool>; +using poly_busy_pool = lf::poly_busy_pool>; + +#define LIBFORK_BENCH_ALL(bench_fn, name, prefix, ...) \ + BENCH_ALL(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_BENCH_ALL_MT(bench_fn, name, prefix, ...) \ + BENCH_ALL_MT(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_UTS_BENCH_ONE_MT(bench_fn, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_MT(bench_fn, libfork, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_UTS_BENCH_ALL_MT(bench_fn, ...) 
UTS_BENCH_ALL_MT(bench_fn, libfork __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/src/libfork/switch_io_pool.cpp b/benchmark/src/libfork/switch_io_pool.cpp new file mode 100644 index 000000000..d14199aa0 --- /dev/null +++ b/benchmark/src/libfork/switch_io_pool.cpp @@ -0,0 +1,185 @@ +#include + +#include "helpers.hpp" + +import std; + +import libfork; + +// Constants must be at file scope (outside any namespace) so the macro +// expansion can paste `requests_test` / `requests_base` from any position +// in the translation unit. +inline constexpr std::int64_t requests_test = 64; +inline constexpr std::int64_t requests_base = (1 << 16) - 2; + +namespace { + +inline constexpr std::int64_t k_compute_units = 256; +inline constexpr std::int64_t k_io_units = 32; + +inline auto k_io_workers() -> unsigned { return std::max(2u, std::thread::hardware_concurrency() / 8u); } + +// Busy-loop work that the optimizer cannot elide. +auto do_work(std::int64_t n) -> std::int64_t { + std::int64_t acc = 0; + for (std::int64_t i = 0; i < n; ++i) { + acc += i ^ (acc >> 1); + } + return acc; +} + +// Generic awaitable that posts the task's continuation to an arbitrary pool +// whose context_type matches the current task's Context. +template +struct switch_to { + + using context_type = Sch::context_type; + + Sch *target; + + auto await_ready() noexcept -> bool { return false; } + + auto await_suspend(lf::sched_handle h, context_type & /*context*/) -> void { + target->post(h); + } + + auto await_resume() noexcept -> void {} +}; + +// One "request": CPU work, hop to IO pool, IO work, hop back, more CPU work. 
+template +struct request_with_io { + + using context_type = Sch::context_type; + + static auto operator()(Sch *compute_pool, Sch *io_pool) -> lf::task { + + std::int64_t acc = do_work(k_compute_units / 2); + + co_await switch_to{io_pool}; + + acc += do_work(k_io_units); + + co_await switch_to{compute_pool}; + + acc += do_work(k_compute_units / 2); + + co_return acc; + } +}; + +// Baseline: same total work but no pool hops. +template +struct request_baseline { + + using context_type = Sch::context_type; + + static auto operator()() -> lf::task { + std::int64_t acc = do_work(k_compute_units / 2); + acc += do_work(k_io_units); + acc += do_work(k_compute_units / 2); + co_return acc; + } +}; + +// Fan-out: fork M request_with_io tasks and sum the results. +template +struct fan_out_with_io { + + using context_type = Sch::context_type; + + static auto + operator()(std::int64_t m, Sch *compute_pool, Sch *io_pool) -> lf::task { + + std::vector results(static_cast(m), 0); + + auto sc = co_await lf::scope(); + + // TODO: use for_each algorithm + + for (std::int64_t i = 0; i < m; ++i) { + co_await sc.fork(&results[static_cast(i)], request_with_io{}, compute_pool, io_pool); + } + + co_await sc.join(); + + std::int64_t total = 0; + for (auto v : results) { + total += v; + } + co_return total; + } +}; + +// Fan-out: fork M request_baseline tasks and sum. +template +struct fan_out_baseline { + + using context_type = Sch::context_type; + + static auto operator()(std::int64_t m) -> lf::task { + std::vector results(static_cast(m), 0); + + auto sc = co_await lf::scope(); + + for (std::int64_t i = 0; i < m; ++i) { + co_await sc.fork(&results[static_cast(i)], request_baseline{}); + } + + co_await sc.join(); + + std::int64_t total = 0; + for (auto v : results) { + total += v; + } + co_return total; + } +}; + +// Compute expected result per request once at startup. 
+auto expected_per_request() -> std::int64_t { + return do_work(k_compute_units / 2) + do_work(k_io_units) + do_work(k_compute_units / 2); +} + +template +void run_with_io(benchmark::State &state) { + std::int64_t m = state.range(0); + + state.counters["requests"] = static_cast(m); + state.counters["compute_threads"] = static_cast(thread_count(state)); + state.counters["io_threads"] = static_cast(k_io_workers()); + + std::int64_t expect = m * expected_per_request(); + + Sch compute_pool = make_scheduler(state); + Sch io_pool{static_cast(k_io_workers())}; + + lf_bench::bench(state, static_cast(thread_count(state)), expect, [&]() -> std::int64_t { + return lf::schedule(compute_pool, fan_out_with_io{}, m, &compute_pool, &io_pool).get(); + }); +} + +template +void run_baseline(benchmark::State &state) { + std::int64_t m = state.range(0); + + state.counters["requests"] = static_cast(m); + state.counters["compute_threads"] = static_cast(thread_count(state)); + + std::int64_t expect = m * expected_per_request(); + + Sch compute_pool = make_scheduler(state); + + lf_bench::bench(state, static_cast(thread_count(state)), expect, [&]() -> std::int64_t { + return lf::schedule(compute_pool, fan_out_baseline{}, m).get(); + }); +} + +} // namespace + +// prefix = requests → macro uses requests_test / requests_base +LIBFORK_BENCH_ALL_MT(run_with_io, request_io, requests, mono_busy_pool) +LIBFORK_BENCH_ALL_MT(run_baseline, request_baseline, requests, mono_busy_pool) + +LIBFORK_BENCH_ALL_MT(run_with_io, request_io, requests, poly_busy_pool) +LIBFORK_BENCH_ALL_MT(run_baseline, request_baseline, requests, poly_busy_pool) diff --git a/benchmark/src/libfork/switch_random.cpp b/benchmark/src/libfork/switch_random.cpp new file mode 100644 index 000000000..ecd5d42b9 --- /dev/null +++ b/benchmark/src/libfork/switch_random.cpp @@ -0,0 +1,116 @@ +#include + +#include "fib.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Awaitable and helpers + +namespace { + +template 
+struct switch_to_other { + + Sch *target; + + using context_type = Sch::context_type; + + auto await_ready() noexcept -> bool { return false; } + + void await_suspend(lf::sched_handle h, context_type &) { target->post(h); } + + auto await_resume() noexcept -> void {} +}; + +template +struct pool_pair { + Sch *pools[2]; +}; + +// SplitMix64 +struct rng { + + std::uint64_t state; + + auto next() -> rng { + state += 0x9e3779b97f4a7c15ULL; + std::uint64_t z = state; + z = (z ^ (z >> 30ULL)) * 0xbf58476d1ce4e5b9ULL; + z = (z ^ (z >> 27ULL)) * 0x94d049bb133111ebULL; + return {.state = z ^ (z >> 31ULL)}; + } +}; + +// ~10% switch probability: threshold / 256 ≈ 0.10 +inline constexpr std::uint64_t k_switch_threshold = 25; + +template +struct random_switch_fib { + + using context_type = Sch::context_type; + + using task = lf::task; + + static auto operator()(std::int64_t n, pool_pair *pp, rng seed, unsigned current) -> task { + + if (n < 2) { + co_return n; + } + + if ((seed.state & 0xffULL) < k_switch_threshold) { + current = 1U - current; + co_await switch_to_other{pp->pools[current]}; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + auto sc = co_await lf::scope(); + + co_await sc.fork(&rhs, random_switch_fib{}, n - 2, pp, seed.next(), current); + co_await sc.call(&lhs, random_switch_fib{}, n - 1, pp, seed.next(), current); + + co_await sc.join(); + + co_return lhs + rhs; + } +}; + +template +void run(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + auto threads_total = static_cast(state.range(1)); + + if (threads_total < 2) { + state.SkipWithMessage("switch_random requires at least 2 total workers"); + return; + } + + auto threads_a = (threads_total + 1) / 2; + auto threads_b = threads_total - threads_a; + + state.counters["n"] = static_cast(n); + state.counters["p_a"] = static_cast(threads_a); + state.counters["p_b"] = static_cast(threads_b); + + Sch pool_a{threads_a}; + Sch pool_b{threads_b}; + + 
pool_pair pp{&pool_a, &pool_b}; + + lf_bench::bench(state, static_cast(threads_total), expect, [&]() -> std::int64_t { + return lf::schedule(pool_a, random_switch_fib{}, n, &pp, rng{1}, 0U).get(); + }); +} + +} // namespace + +LIBFORK_BENCH_ALL_MT(run, switch_random, fib, mono_busy_pool) +LIBFORK_BENCH_ALL_MT(run, switch_random, fib, poly_busy_pool) diff --git a/benchmark/src/libfork/uts.cpp b/benchmark/src/libfork/uts.cpp new file mode 100644 index 000000000..2a6e1fdfe --- /dev/null +++ b/benchmark/src/libfork/uts.cpp @@ -0,0 +1,78 @@ +#include + +#include "uts.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +// TODO: try a version that uses try_fork + +struct uts_fn { + template + static auto operator()(lf::env, int depth, Node *parent) -> lf::task { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + auto sc = co_await lf::scope(); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + if (i + 1 == static_cast(num_children)) { + co_await sc.call(&cs[i].res, uts_fn{}, depth + 1, &cs[i].child); + } else { + co_await sc.fork(&cs[i].res, uts_fn{}, depth + 1, &cs[i].child); + } + } + + co_await sc.join(); + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + co_return r; + } +}; + +template +void run(benchmark::State &state, uts_tree tree) { + + auto threads = static_cast(state.range(0)); + Sch 
scheduler = Sch{threads}; + + run_uts(state, tree, static_cast(threads), [&](Node *root) -> result { + return lf::schedule(scheduler, uts_fn{}, 0, root).get(); + }); +} + +} // namespace + +LIBFORK_UTS_BENCH_ALL_MT(run, mono_busy_pool) +LIBFORK_UTS_BENCH_ALL_MT(run, poly_busy_pool) diff --git a/benchmark/src/openmp/CMakeLists.txt b/benchmark/src/openmp/CMakeLists.txt new file mode 100644 index 000000000..035d5326f --- /dev/null +++ b/benchmark/src/openmp/CMakeLists.txt @@ -0,0 +1,23 @@ +add_library(openmp_benchmarks) + +# OpenMP compiles with -fopenmp which conflicts with the shared std.pcm (built +# without OpenMP). Disable module scanning so CMake doesn't inject the +# incompatible modmap for this target. +set_target_properties(openmp_benchmarks PROPERTIES CXX_SCAN_FOR_MODULES OFF) + +# TODO: remove this hack when we have LLVM 23 + +# Signal to shared benchmark headers that this target cannot `import std;` +# and must use textual standard headers instead. +target_compile_definitions(openmp_benchmarks PRIVATE LF_BENCH_NO_IMPORT_STD) + +target_sources(openmp_benchmarks + PRIVATE + fib.cpp uts.cpp +) + +target_link_libraries(openmp_benchmarks + PUBLIC + benchmark_common + OpenMP::OpenMP_CXX +) diff --git a/benchmark/src/openmp/fib.cpp b/benchmark/src/openmp/fib.cpp new file mode 100644 index 000000000..54afd0f3c --- /dev/null +++ b/benchmark/src/openmp/fib.cpp @@ -0,0 +1,47 @@ +#include +#include + +#include + +#include "fib.hpp" +#include "macros.hpp" + +namespace { + +auto fib(std::int64_t n) -> std::int64_t { + if (n < 2) { + return n; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + +#pragma omp task untied shared(lhs) firstprivate(n) default(none) + lhs = fib(n - 2); + + rhs = fib(n - 1); + +#pragma omp taskwait + return lhs + rhs; +} + +template +void fib_run(benchmark::State &state) { + int threads = static_cast(state.range(1)); + + run_fib(state, threads, [threads](std::int64_t n) -> std::int64_t { + std::int64_t return_value = 0; + +#pragma omp 
parallel num_threads(threads) default(shared) +#pragma omp single nowait + { + return_value = fib(n); + } + + return return_value; + }); +} + +} // namespace + +BENCH_ALL_MT(fib_run, openmp, fib, fib) diff --git a/benchmark/src/openmp/uts.cpp b/benchmark/src/openmp/uts.cpp new file mode 100644 index 000000000..e574b72f4 --- /dev/null +++ b/benchmark/src/openmp/uts.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +#include + +#include "macros.hpp" +#include "uts.hpp" + +namespace { + +auto uts(int depth, Node *parent) -> result { + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + if (i + 1 == static_cast(num_children)) { + cs[i].res = uts(depth + 1, &cs[i].child); + } else { +#pragma omp task untied shared(cs) firstprivate(depth, i) default(none) + cs[i].res = uts(depth + 1, &cs[i].child); + } + } + +#pragma omp taskwait + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_run(benchmark::State &state, uts_tree tree) { + int threads = static_cast(state.range(0)); + + run_uts(state, tree, threads, [threads](Node *root) -> result { + result r; + +#pragma omp parallel num_threads(threads) default(shared) +#pragma omp single nowait + { + r = uts(0, root); + } + + return r; + }); +} + +} // namespace + +UTS_BENCH_ALL_MT(uts_run, openmp) diff --git 
a/benchmark/src/serial/CMakeLists.txt b/benchmark/src/serial/CMakeLists.txt new file mode 100644 index 000000000..1d215bd92 --- /dev/null +++ b/benchmark/src/serial/CMakeLists.txt @@ -0,0 +1,20 @@ +add_library(serial_benchmarks) + +target_sources(serial_benchmarks PRIVATE + fib.cpp + fold.cpp + heat.cpp + integrate.cpp + knapsack.cpp + mandelbrot.cpp + matmul.cpp + nqueens.cpp + primes.cpp + quicksort.cpp + scan.cpp + skynet.cpp + strassen.cpp + uts.cpp +) + +target_link_libraries(serial_benchmarks PUBLIC benchmark_common) diff --git a/benchmark/src/serial/fib.cpp b/benchmark/src/serial/fib.cpp new file mode 100644 index 000000000..a13cf5f47 --- /dev/null +++ b/benchmark/src/serial/fib.cpp @@ -0,0 +1,53 @@ +#include + +#include "fib.hpp" +#include "macros.hpp" + +import std; + +namespace { + +auto fib_impl(std::int64_t &ret, std::int64_t n) -> void { + if (n < 2) { + ret = n; + return; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + fib_impl(lhs, n - 2); + fib_impl(rhs, n - 1); + + ret = lhs + rhs; +} + +template +void fib_serial(benchmark::State &state) { + run_fib(state, [](std::int64_t n) -> std::int64_t { + std::int64_t result = 0; + fib_impl(result, n); + return result; + }); +} + +auto fib_ret_impl(std::int64_t n) -> std::int64_t { + if (n < 2) { + return n; + } + + std::int64_t lhs = fib_ret_impl(n - 1); + std::int64_t rhs = fib_ret_impl(n - 2); + + return lhs + rhs; +} + +template +void fib_serial_return(benchmark::State &state) { + run_fib(state, fib_ret_impl); +} + +} // namespace + +BENCH_ALL(fib_serial, serial, fib, fib) +BENCH_ALL(fib_serial_return, serial, fib / return, fib) diff --git a/benchmark/src/serial/fold.cpp b/benchmark/src/serial/fold.cpp new file mode 100644 index 000000000..f7c7f58ec --- /dev/null +++ b/benchmark/src/serial/fold.cpp @@ -0,0 +1,27 @@ +#include + +#include "fold.hpp" + +import std; + +namespace { + +template +void fold_reduce(benchmark::State &state) { + run_fold_input(state, [](auto &&values) -> fold_accum_t { + 
return std::reduce( + std::ranges::begin(values), std::ranges::end(values), fold_accum_t{}, [](auto a, auto b) static { + return fold_accum_t(a) + fold_accum_t(b); + }); + }); +} + +} // namespace + +#define LF_REGISTER_FOLD_REDUCE(data, dtype) \ + LF_FOLD_BENCH_SIZES(fold_reduce, serial, fold / std_reduce, data, dtype) + +LF_REGISTER_FOLD_REDUCE(memory, int32) +LF_REGISTER_FOLD_REDUCE(memory, float32) +LF_REGISTER_FOLD_REDUCE(lazy, int32) +LF_REGISTER_FOLD_REDUCE(lazy, float32) diff --git a/benchmark/src/serial/heat.cpp b/benchmark/src/serial/heat.cpp new file mode 100644 index 000000000..1a6f2e4af --- /dev/null +++ b/benchmark/src/serial/heat.cpp @@ -0,0 +1,24 @@ +#include + +#include "heat.hpp" +#include "macros.hpp" + +import std; + +namespace { + +void heat_run(double *a, double *b, std::size_t n, std::size_t iters) { + for (std::size_t t = 0; t < iters; ++t) { + heat_jacobi_step(a, b, n); + std::swap(a, b); + } +} + +template +void heat_serial(benchmark::State &state) { + run_heat(state, heat_run); +} + +} // namespace + +BENCH_ALL(heat_serial, serial, heat, heat) diff --git a/benchmark/src/serial/integrate.cpp b/benchmark/src/serial/integrate.cpp new file mode 100644 index 000000000..31e4bf4dc --- /dev/null +++ b/benchmark/src/serial/integrate.cpp @@ -0,0 +1,39 @@ +#include + +#include "integrate.hpp" +#include "macros.hpp" + +import std; + +namespace { + +auto integrate_recurse(double x1, double y1, double x2, double y2, double area) -> double { + + double half = (x2 - x1) / 2; + double x0 = x1 + half; + double y0 = integrate_fn(x0); + + double area_x1x0 = (y1 + y0) / 2 * half; + double area_x0x2 = (y0 + y2) / 2 * half; + double area_x1x2 = area_x1x0 + area_x0x2; + + if (area_x1x2 - area < integrate_epsilon && area - area_x1x2 < integrate_epsilon) { + return area_x1x2; + } + + area_x1x0 = integrate_recurse(x1, y1, x0, y0, area_x1x0); + area_x0x2 = integrate_recurse(x0, y0, x2, y2, area_x0x2); + + return area_x1x0 + area_x0x2; +} + +template +void 
integrate_serial(benchmark::State &state) { + run_integrate(state, [](double upper) { + return integrate_recurse(0, integrate_fn(0), upper, integrate_fn(upper), 0); + }); +} + +} // namespace + +BENCH_ALL(integrate_serial, serial, integrate, integrate) diff --git a/benchmark/src/serial/knapsack.cpp b/benchmark/src/serial/knapsack.cpp new file mode 100644 index 000000000..3e9ed8f14 --- /dev/null +++ b/benchmark/src/serial/knapsack.cpp @@ -0,0 +1,65 @@ +#include + +#include "knapsack.hpp" +#include "macros.hpp" + +import std; + +namespace { + +// Linear-relaxation bound: greedily fill remaining capacity with the densest +// items, taking a fractional piece of the last one. +auto upper_bound(std::vector const &items, + std::size_t idx, + int remaining_cap, + int current_value) -> double { + double bound = current_value; + int cap = remaining_cap; + for (std::size_t i = idx; i < items.size(); ++i) { + if (items[i].weight <= cap) { + cap -= items[i].weight; + bound += items[i].value; + } else { + bound += static_cast(items[i].value) * cap / items[i].weight; + break; + } + } + return bound; +} + +void knapsack_bb(std::vector const &items, + std::size_t idx, + int remaining_cap, + int current_value, + int &best) { + + if (current_value > best) { + best = current_value; + } + + if (idx == items.size()) { + return; + } + + if (upper_bound(items, idx, remaining_cap, current_value) <= best) { + return; + } + + if (items[idx].weight <= remaining_cap) { + knapsack_bb(items, idx + 1, remaining_cap - items[idx].weight, current_value + items[idx].value, best); + } + knapsack_bb(items, idx + 1, remaining_cap, current_value, best); +} + +template +void knapsack_serial(benchmark::State &state) { + run_knapsack(state, [](knapsack_problem const &problem) { + int best = 0; + knapsack_bb(problem.items, 0, problem.capacity, 0, best); + return best; + }); +} + +} // namespace + +BENCH_ALL(knapsack_serial, serial, knapsack, knapsack) diff --git a/benchmark/src/serial/mandelbrot.cpp 
b/benchmark/src/serial/mandelbrot.cpp new file mode 100644 index 000000000..518d547cf --- /dev/null +++ b/benchmark/src/serial/mandelbrot.cpp @@ -0,0 +1,17 @@ +#include + +#include "macros.hpp" +#include "mandelbrot.hpp" + +import std; + +namespace { + +template +void mandelbrot_serial(benchmark::State &state) { + run_mandelbrot(state, mandelbrot_checksum); +} + +} // namespace + +BENCH_ALL(mandelbrot_serial, serial, mandelbrot, mandelbrot) diff --git a/benchmark/src/serial/matmul.cpp b/benchmark/src/serial/matmul.cpp new file mode 100644 index 000000000..d9753c5c0 --- /dev/null +++ b/benchmark/src/serial/matmul.cpp @@ -0,0 +1,44 @@ +#include + +#include "macros.hpp" +#include "matmul.hpp" + +import std; + +namespace { + +template +void matmul_dc(float const *A, float const *B, float *R, unsigned n, unsigned s) { + if (n <= matmul_basecase) { + matmul_basecase_multiply(A, B, R, n, s); + return; + } + + unsigned m = n / 2; + + unsigned o00 = 0; + unsigned o01 = m; + unsigned o10 = m * s; + unsigned o11 = m * s + m; + + matmul_dc(A + o00, B + o00, R + o00, m, s); + matmul_dc(A + o00, B + o01, R + o01, m, s); + matmul_dc(A + o10, B + o00, R + o10, m, s); + matmul_dc(A + o10, B + o01, R + o11, m, s); + + matmul_dc(A + o01, B + o10, R + o00, m, s); + matmul_dc(A + o01, B + o11, R + o01, m, s); + matmul_dc(A + o11, B + o10, R + o10, m, s); + matmul_dc(A + o11, B + o11, R + o11, m, s); +} + +template +void matmul_serial(benchmark::State &state) { + run_matmul(state, 1e-5f, [](float const *A, float const *B, float *C, unsigned n) { + matmul_dc(A, B, C, n, n); + }); +} + +} // namespace + +BENCH_ALL(matmul_serial, serial, matmul, matmul) diff --git a/benchmark/src/serial/nqueens.cpp b/benchmark/src/serial/nqueens.cpp new file mode 100644 index 000000000..9d8bb715d --- /dev/null +++ b/benchmark/src/serial/nqueens.cpp @@ -0,0 +1,36 @@ +#include + +#include "macros.hpp" +#include "nqueens.hpp" + +import std; + +namespace { + +auto nqueens(int j, int n, char *a) -> std::int64_t 
{ + if (j == n) { + return 1; + } + + std::int64_t res = 0; + + for (int i = 0; i < n; ++i) { + a[j] = static_cast(i); + if (queens_ok(j + 1, a)) { + res += nqueens(j + 1, n, a); + } + } + + return res; +} + +template +void nqueens_serial(benchmark::State &state) { + run_nqueens(state, [](int n, char *board) { + return nqueens(0, n, board); + }); +} + +} // namespace + +BENCH_ALL(nqueens_serial, serial, nqueens, nqueens) diff --git a/benchmark/src/serial/primes.cpp b/benchmark/src/serial/primes.cpp new file mode 100644 index 000000000..cb0f05005 --- /dev/null +++ b/benchmark/src/serial/primes.cpp @@ -0,0 +1,23 @@ +#include + +#include "macros.hpp" +#include "primes.hpp" + +import std; + +namespace { + +template +void primes_serial(benchmark::State &state) { + run_primes(state, [](std::int64_t lim) { + std::int64_t count = 0; + for (std::int64_t i = 2; i < lim; ++i) { + count += is_prime(i) ? 1 : 0; + } + return count; + }); +} + +} // namespace + +BENCH_ALL(primes_serial, serial, primes, primes) diff --git a/benchmark/src/serial/quicksort.cpp b/benchmark/src/serial/quicksort.cpp new file mode 100644 index 000000000..58fa023cc --- /dev/null +++ b/benchmark/src/serial/quicksort.cpp @@ -0,0 +1,54 @@ +#include + +#include "macros.hpp" +#include "quicksort.hpp" + +import std; + +namespace { + +void insertion_sort(std::uint32_t *first, std::uint32_t *last) { + for (auto *it = first + 1; it < last; ++it) { + std::uint32_t v = *it; + auto *j = it; + while (j > first && *(j - 1) > v) { + *j = *(j - 1); + --j; + } + *j = v; + } +} + +auto partition(std::uint32_t *first, std::uint32_t *last) -> std::uint32_t * { + std::uint32_t *mid = first + (last - first) / 2; + std::uint32_t pivot = *mid; + std::swap(*mid, *(last - 1)); + + auto *store = first; + for (auto *it = first; it < last - 1; ++it) { + if (*it < pivot) { + std::swap(*it, *store); + ++store; + } + } + std::swap(*store, *(last - 1)); + return store; +} + +void quicksort(std::uint32_t *first, std::uint32_t *last) { + 
while (last - first > static_cast(quicksort_basecase)) { + auto *pivot = partition(first, last); + quicksort(pivot + 1, last); + last = pivot; + } + insertion_sort(first, last); +} + +template +void quicksort_serial(benchmark::State &state) { + run_quicksort(state, quicksort); +} + +} // namespace + +BENCH_ALL(quicksort_serial, serial, quicksort, quicksort) diff --git a/benchmark/src/serial/scan.cpp b/benchmark/src/serial/scan.cpp new file mode 100644 index 000000000..298cab7d1 --- /dev/null +++ b/benchmark/src/serial/scan.cpp @@ -0,0 +1,21 @@ +#include + +#include "macros.hpp" +#include "scan.hpp" + +import std; + +namespace { + +template +void scan_serial(benchmark::State &state) { + run_scan(state, [](std::vector const &in, std::vector &out, std::size_t reps) { + for (std::size_t i = 0; i < reps; ++i) { + std::inclusive_scan(in.begin(), in.end(), out.begin(), std::plus<>{}); + } + }); +} + +} // namespace + +BENCH_ALL(scan_serial, serial, scan, scan) diff --git a/benchmark/src/serial/skynet.cpp b/benchmark/src/serial/skynet.cpp new file mode 100644 index 000000000..37079b444 --- /dev/null +++ b/benchmark/src/serial/skynet.cpp @@ -0,0 +1,30 @@ +#include + +#include "macros.hpp" +#include "skynet.hpp" + +import std; + +namespace { + +auto skynet_recurse(std::int64_t num, int depth) -> std::int64_t { + if (depth == 0) { + return num; + } + + std::int64_t sub = skynet_leaves(depth - 1); + std::int64_t sum = 0; + for (int i = 0; i < skynet_branching; ++i) { + sum += skynet_recurse(num + i * sub, depth - 1); + } + return sum; +} + +template +void skynet_serial(benchmark::State &state) { + run_skynet(state, skynet_recurse); +} + +} // namespace + +BENCH_ALL(skynet_serial, serial, skynet, skynet) diff --git a/benchmark/src/serial/strassen.cpp b/benchmark/src/serial/strassen.cpp new file mode 100644 index 000000000..c5918e7af --- /dev/null +++ b/benchmark/src/serial/strassen.cpp @@ -0,0 +1,131 @@ +#include + +#include "macros.hpp" +#include "matmul.hpp" + +import std; + 
+namespace { + +inline constexpr unsigned strassen_cutoff = 64; + +// Out[i][j] = A[i][j] + B[i][j], all m x m with respective row strides. +void mat_add(float const *A, unsigned sa, float const *B, unsigned sb, float *Out, unsigned so, unsigned m) { + for (unsigned i = 0; i < m; ++i) { + for (unsigned j = 0; j < m; ++j) { + Out[i * so + j] = A[i * sa + j] + B[i * sb + j]; + } + } +} + +void mat_sub(float const *A, unsigned sa, float const *B, unsigned sb, float *Out, unsigned so, unsigned m) { + for (unsigned i = 0; i < m; ++i) { + for (unsigned j = 0; j < m; ++j) { + Out[i * so + j] = A[i * sa + j] - B[i * sb + j]; + } + } +} + +void naive_multiply( + float const *A, unsigned sa, float const *B, unsigned sb, float *C, unsigned sc, unsigned n) { + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + float sum = 0; + for (unsigned k = 0; k < n; ++k) { + sum += A[i * sa + k] * B[k * sb + j]; + } + C[i * sc + j] = sum; + } + } +} + +void strassen(float const *A, unsigned sa, float const *B, unsigned sb, float *C, unsigned sc, unsigned n) { + + if (n <= strassen_cutoff) { + naive_multiply(A, sa, B, sb, C, sc, n); + return; + } + + unsigned m = n / 2; + + auto block = [m](auto *p, unsigned s, unsigned i, unsigned j) { + return p + i * m * s + j * m; + }; + + float const *A11 = block(A, sa, 0, 0); + float const *A12 = block(A, sa, 0, 1); + float const *A21 = block(A, sa, 1, 0); + float const *A22 = block(A, sa, 1, 1); + float const *B11 = block(B, sb, 0, 0); + float const *B12 = block(B, sb, 0, 1); + float const *B21 = block(B, sb, 1, 0); + float const *B22 = block(B, sb, 1, 1); + float *C11 = block(C, sc, 0, 0); + float *C12 = block(C, sc, 0, 1); + float *C21 = block(C, sc, 1, 0); + float *C22 = block(C, sc, 1, 1); + + std::vector buf(static_cast(m) * m * 9); + float *T1 = buf.data(); + float *T2 = T1 + static_cast(m) * m; + float *M1 = T2 + static_cast(m) * m; + float *M2 = M1 + static_cast(m) * m; + float *M3 = M2 + static_cast(m) * m; + float 
*M4 = M3 + static_cast(m) * m; + float *M5 = M4 + static_cast(m) * m; + float *M6 = M5 + static_cast(m) * m; + float *M7 = M6 + static_cast(m) * m; + + // M1 = (A11 + A22)(B11 + B22) + mat_add(A11, sa, A22, sa, T1, m, m); + mat_add(B11, sb, B22, sb, T2, m, m); + strassen(T1, m, T2, m, M1, m, m); + + // M2 = (A21 + A22) B11 + mat_add(A21, sa, A22, sa, T1, m, m); + strassen(T1, m, B11, sb, M2, m, m); + + // M3 = A11 (B12 - B22) + mat_sub(B12, sb, B22, sb, T2, m, m); + strassen(A11, sa, T2, m, M3, m, m); + + // M4 = A22 (B21 - B11) + mat_sub(B21, sb, B11, sb, T2, m, m); + strassen(A22, sa, T2, m, M4, m, m); + + // M5 = (A11 + A12) B22 + mat_add(A11, sa, A12, sa, T1, m, m); + strassen(T1, m, B22, sb, M5, m, m); + + // M6 = (A21 - A11)(B11 + B12) + mat_sub(A21, sa, A11, sa, T1, m, m); + mat_add(B11, sb, B12, sb, T2, m, m); + strassen(T1, m, T2, m, M6, m, m); + + // M7 = (A12 - A22)(B21 + B22) + mat_sub(A12, sa, A22, sa, T1, m, m); + mat_add(B21, sb, B22, sb, T2, m, m); + strassen(T1, m, T2, m, M7, m, m); + + // Combine. 
+ for (unsigned i = 0; i < m; ++i) { + for (unsigned j = 0; j < m; ++j) { + std::size_t k = static_cast(i) * m + j; + C11[i * sc + j] = M1[k] + M4[k] - M5[k] + M7[k]; + C12[i * sc + j] = M3[k] + M5[k]; + C21[i * sc + j] = M2[k] + M4[k]; + C22[i * sc + j] = M1[k] - M2[k] + M3[k] + M6[k]; + } + } +} + +template +void strassen_serial(benchmark::State &state) { + run_matmul(state, 1e-3f, [](float const *A, float const *B, float *C, unsigned n) { + strassen(A, n, B, n, C, n, n); + }); +} + +} // namespace + +BENCH_ALL(strassen_serial, serial, strassen, strassen) diff --git a/benchmark/src/serial/uts.cpp b/benchmark/src/serial/uts.cpp new file mode 100644 index 000000000..f404e806a --- /dev/null +++ b/benchmark/src/serial/uts.cpp @@ -0,0 +1,101 @@ +#include + +#include "macros.hpp" +#include "uts.hpp" + +import std; + +namespace { + +auto uts_traverse(int depth, Node *parent) -> result { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + cs[i].res = uts_traverse(depth + 1, &cs[i].child); + } + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_serial(benchmark::State &state, uts_tree tree) { + run_uts(state, tree, [](Node *root) { + return uts_traverse(0, root); + }); +} + +} // namespace + +UTS_BENCH_ALL(uts_serial, serial) + +namespace { + +auto uts_traverse_no_alloc(int 
depth, Node *parent) -> result { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + + pair cs; + + cs.child.type = child_type; + cs.child.height = parent->height + 1; + cs.child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs.child.state.state, static_cast(i)); + } + + cs.res = uts_traverse_no_alloc(depth + 1, &cs.child); + + r.maxdepth = std::max(r.maxdepth, cs.res.maxdepth); + r.size += cs.res.size; + r.leaves += cs.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_serial_no_alloc(benchmark::State &state, uts_tree tree) { + run_uts(state, tree, [](Node *root) { + return uts_traverse_no_alloc(0, root); + }); +} + +} // namespace + +UTS_BENCH_ALL(uts_serial_no_alloc, serial / no_alloc) diff --git a/cmake/gcc-brew-toolchain.cmake b/cmake/gcc-brew-toolchain.cmake new file mode 100644 index 000000000..aa67ccaaf --- /dev/null +++ b/cmake/gcc-brew-toolchain.cmake @@ -0,0 +1,92 @@ +cmake_minimum_required(VERSION 4.2.1) + +# Set up Homebrew GCC@15 & Ninja toolchain for CMake + +find_program(BREW_EXE brew) + +if(NOT BREW_EXE) + message(FATAL_ERROR "Could not find 'brew' executable. 
Please install Homebrew.") +endif() + +# --- Ninja + +execute_process( + COMMAND ${BREW_EXE} --prefix ninja + OUTPUT_VARIABLE NINJA_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_MAKE_PROGRAM + NAMES ninja + HINTS "${NINJA_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- GCC + +execute_process( + COMMAND ${BREW_EXE} --prefix gcc + OUTPUT_VARIABLE GCC_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_C_COMPILER + NAMES gcc-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_CXX_COMPILER + NAMES g++-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_AR + NAMES gcc-ar-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_RANLIB + NAMES gcc-ranlib-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_NM + NAMES gcc-nm-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- Binutils + +execute_process( + COMMAND ${BREW_EXE} --prefix binutils + OUTPUT_VARIABLE BINUTILS_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +set(CMAKE_CXX_FLAGS_INIT "-B${BINUTILS_PREFIX}/bin/") +set(CMAKE_C_FLAGS_INIT "-B${BINUTILS_PREFIX}/bin/") + + +# Get macOS SDK path (only on macOS) +if(APPLE) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY + ) +endif() diff --git a/cmake/llvm-brew-toolchain.cmake b/cmake/llvm-brew-toolchain.cmake new file mode 100644 index 000000000..199bdae34 --- /dev/null +++ b/cmake/llvm-brew-toolchain.cmake @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 4.2.1) + +# Set up Homebrew LLVM & Ninja toolchain for CMake + +find_program(BREW_EXE brew) + +if(NOT BREW_EXE) + message(FATAL_ERROR "Could not find 'brew' executable. 
Please install Homebrew.") +endif() + +# --- Ninja + +execute_process( + COMMAND ${BREW_EXE} --prefix ninja + OUTPUT_VARIABLE NINJA_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_MAKE_PROGRAM + NAMES ninja + HINTS "${NINJA_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- LLVM + +execute_process( + COMMAND ${BREW_EXE} --prefix llvm + OUTPUT_VARIABLE LLVM_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_C_COMPILER + NAMES clang + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_CXX_COMPILER + NAMES clang++ + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_AR + NAMES llvm-ar + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_RANLIB + NAMES llvm-ranlib + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_NM + NAMES llvm-nm + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# Dynamically find the standard library modules JSON, brew puts it in the wrong place +file(GLOB_RECURSE LIBCXX_MODULES_JSON "${LLVM_PREFIX}/lib/**/libc++.modules.json") + +if(LIBCXX_MODULES_JSON) + set(CMAKE_CXX_STDLIB_MODULES_JSON "${LIBCXX_MODULES_JSON}") +else() + message(FATAL_ERROR "Could not automatically find libc++.modules.json in ${LLVM_PREFIX}") +endif() + +# Get macOS SDK path (only on macOS) +if(APPLE) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY + ) +endif() diff --git a/.legacy/ChangeLog.md b/docs/ChangeLog.md similarity index 98% rename from .legacy/ChangeLog.md rename to docs/ChangeLog.md index 90e1a3606..c186137ab 100644 --- a/.legacy/ChangeLog.md +++ b/docs/ChangeLog.md @@ -1,3 +1,7 @@ +--- +icon: lucide/notebook-pen +--- + # Changelog