diff --git a/.clang-format b/.clang-format
index 42912faec..ad9f4bb18 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,19 +2,35 @@
 Language: Cpp
 ColumnLimit: 110
 IndentPPDirectives: BeforeHash
-AlwaysBreakTemplateDeclarations : true
-PackConstructorInitializers : CurrentLine
+AlwaysBreakTemplateDeclarations: Yes
+BreakAfterAttributes: Always
+PackConstructorInitializers: Never
 AccessModifierOffset: -1
-IndentCaseLabels : true
+IndentCaseLabels: true
 AllowShortLambdasOnASingleLine: Empty
 RequiresExpressionIndentation: OuterScope
-BinPackArguments : false
-BinPackParameters : false
-LambdaBodyIndentation : Signature
-PenaltyReturnTypeOnItsOwnLine : 1
+LambdaBodyIndentation: Signature
+PenaltyReturnTypeOnItsOwnLine: 1
+
+# TODO: update for clang-format 23
+BinPackArguments: false
+BinPackParameters: OnePerLine
+
+BreakBeforeConceptDeclarations: Always
+
+Macros: # clang-format parses each macro as if expanded to its right-hand side
+  - LF_TRY=if
+  - LF_CATCH_ALL=else
+  - LF_CATCH(x)=else
+  - LF_HOF(x)={x;}
+  - LF_HOF(x,y)={x,y;}
+  - LF_HOF(x,y,z)={x,y,z;}
+  - LF_HOF(x,y,z,w)={x,y,z,w;}
+  - LF_HOF(a,b,c,d,e)={a,b,c,d,e;}
+  - LF_HOF(a,b,c,d,e,f)={a,b,c,d,e,f;}
 
 SpaceBeforeParens: Custom
 SpaceBeforeParensOptions:
-    AfterRequiresInClause: true
-    AfterRequiresInExpression : true
+  AfterRequiresInClause: true
+  AfterRequiresInExpression: true
 ...
diff --git a/.clang-tidy b/.clang-tidy
index 5d813fd55..d05681165 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -10,148 +10,150 @@ Checks: "*,\
   -llvm-header-guard,\
   -llvm-include-order,\
   -llvmlibc-*,\
-  -modernize-use-nodiscard,\
+  -readability-identifier-length,\
   -misc-non-private-member-variables-in-classes"
-WarningsAsErrors: ''
+WarningsAsErrors: ""
 CheckOptions:
   - key: readability-function-cognitive-complexity.IgnoreMacros
-    value: 'true'
-  - key: 'bugprone-argument-comment.StrictMode'
-    value: 'true'
-# Prefer using enum classes with 2 values for parameters instead of bools
-  - key: 'bugprone-argument-comment.CommentBoolLiterals'
-    value: 'true'
-  - key: 'bugprone-misplaced-widening-cast.CheckImplicitCasts'
-    value: 'true'
-  - key: 'bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression'
-    value: 'true'
-  - key: 'bugprone-suspicious-string-compare.WarnOnLogicalNotComparison'
-    value: 'true'
-  - key: 'readability-simplify-boolean-expr.ChainedConditionalReturn'
-    value: 'true'
-  - key: 'readability-simplify-boolean-expr.ChainedConditionalAssignment'
-    value: 'true'
-  - key: 'readability-uniqueptr-delete-release.PreferResetCall'
-    value: 'true'
-  - key: 'cppcoreguidelines-init-variables.MathHeader'
-    value: '<cmath>'
-  - key: 'cppcoreguidelines-narrowing-conversions.PedanticMode'
-    value: 'true'
-  - key: 'readability-else-after-return.WarnOnUnfixable'
-    value: 'true'
-  - key: 'readability-else-after-return.WarnOnConditionVariables'
-    value: 'true'
-  - key: 'readability-inconsistent-declaration-parameter-name.Strict'
-    value: 'true'
-  - key: 'readability-qualified-auto.AddConstToQualified'
-    value: 'true'
-  - key: 'readability-redundant-access-specifiers.CheckFirstDeclaration'
-    value: 'true'
-# These seem to be the most common identifier styles
-  - key: 'readability-identifier-naming.AbstractClassCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ClassCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ClassConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ClassMemberCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ClassMethodCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstantMemberCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstantParameterCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstantPointerParameterCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstexprFunctionCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstexprMethodCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ConstexprVariableCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.EnumCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.EnumConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.FunctionCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.GlobalConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.GlobalConstantPointerCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.GlobalFunctionCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.GlobalPointerCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.GlobalVariableCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.InlineNamespaceCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.LocalConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.LocalConstantPointerCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.LocalPointerCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.LocalVariableCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.MacroDefinitionCase'
-    value: 'UPPER_CASE'
-  - key: 'readability-identifier-naming.MemberCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.MethodCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.NamespaceCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ParameterCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ParameterPackCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.PointerParameterCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.PrivateMemberCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.PrivateMemberPrefix'
-    value: 'm_'
-  - key: 'readability-identifier-naming.PrivateMethodCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ProtectedMemberCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ProtectedMemberPrefix'
-    value: 'm_'
-  - key: 'readability-identifier-naming.ProtectedMethodCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.PublicMemberCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.PublicMethodCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ScopedEnumConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.StaticConstantCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.StaticVariableCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.StructCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.TemplateParameterCase'
-    value: 'CamelCase'
-  - key: 'readability-identifier-naming.TemplateTemplateParameterCase'
-    value: 'CamelCase'
-  - key: 'readability-identifier-naming.TypeAliasCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.TypedefCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.TypeTemplateParameterCase'
-    value: 'CamelCase'
-  - key: 'readability-identifier-naming.UnionCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.ValueTemplateParameterCase'
-    value: 'CamelCase'
-  - key: 'readability-identifier-naming.VariableCase'
-    value: 'lower_case'
-  - key: 'readability-identifier-naming.VirtualMethodCase'
-    value: 'lower_case'
+    value: "true"
+  - key: "cppcoreguidelines-avoid-do-while.IgnoreMacros"
+    value: "true"
+  - key: "bugprone-argument-comment.StrictMode"
+    value: "true"
+  # Prefer using enum classes with 2 values for parameters instead of bools
+  - key: "bugprone-argument-comment.CommentBoolLiterals"
+    value: "true"
+  - key: "bugprone-misplaced-widening-cast.CheckImplicitCasts"
+    value: "true"
+  - key: "bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression"
+    value: "true"
+  - key: "bugprone-suspicious-string-compare.WarnOnLogicalNotComparison"
+    value: "true"
+  - key: "readability-simplify-boolean-expr.ChainedConditionalReturn"
+    value: "true"
+  - key: "readability-simplify-boolean-expr.ChainedConditionalAssignment"
+    value: "true"
+  - key: "readability-uniqueptr-delete-release.PreferResetCall"
+    value: "true"
+  - key: "cppcoreguidelines-init-variables.MathHeader"
+    value: "<cmath>"
+  - key: "cppcoreguidelines-narrowing-conversions.PedanticMode"
+    value: "true"
+  - key: "readability-else-after-return.WarnOnUnfixable"
+    value: "true"
+  - key: "readability-else-after-return.WarnOnConditionVariables"
+    value: "true"
+  - key: "readability-inconsistent-declaration-parameter-name.Strict"
+    value: "true"
+  - key: "readability-qualified-auto.AddConstToQualified"
+    value: "true"
+  - key: "readability-redundant-access-specifiers.CheckFirstDeclaration"
+    value: "true"
+  # These seem to be the most common identifier styles
+  - key: "readability-identifier-naming.AbstractClassCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ClassCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ClassConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ClassMemberCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ClassMethodCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstantMemberCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstantParameterCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstantPointerParameterCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstexprFunctionCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstexprMethodCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ConstexprVariableCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.EnumCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.EnumConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.FunctionCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.GlobalConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.GlobalConstantPointerCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.GlobalFunctionCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.GlobalPointerCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.GlobalVariableCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.InlineNamespaceCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.LocalConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.LocalConstantPointerCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.LocalPointerCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.LocalVariableCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.MacroDefinitionCase"
+    value: "UPPER_CASE"
+  - key: "readability-identifier-naming.MemberCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.MethodCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.NamespaceCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ParameterCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ParameterPackCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.PointerParameterCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.PrivateMemberCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.PrivateMemberPrefix"
+    value: "m_"
+  - key: "readability-identifier-naming.PrivateMethodCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ProtectedMemberCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ProtectedMemberPrefix"
+    value: "m_"
+  - key: "readability-identifier-naming.ProtectedMethodCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.PublicMemberCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.PublicMethodCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ScopedEnumConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.StaticConstantCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.StaticVariableCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.StructCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.TemplateParameterCase"
+    value: "CamelCase"
+  - key: "readability-identifier-naming.TemplateTemplateParameterCase"
+    value: "CamelCase"
+  - key: "readability-identifier-naming.TypeAliasCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.TypedefCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.TypeTemplateParameterCase"
+    value: "CamelCase"
+  - key: "readability-identifier-naming.UnionCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.ValueTemplateParameterCase"
+    value: "CamelCase"
+  - key: "readability-identifier-naming.VariableCase"
+    value: "lower_case"
+  - key: "readability-identifier-naming.VirtualMethodCase"
+    value: "lower_case"
 ...
diff --git a/.clangd b/.clangd
index ef86cb6b0..fd3d2a8f4 100644
--- a/.clangd
+++ b/.clangd
@@ -1,2 +1,2 @@
 CompileFlags:
-  CompilationDatabase: build/dev
\ No newline at end of file
+  CompilationDatabase: build/dev
diff --git a/.codespellrc b/.codespellrc
index c3920f351..86730201c 100644
--- a/.codespellrc
+++ b/.codespellrc
@@ -1,7 +1,7 @@
 [codespell]
-builtin = clear,rare,en-GB_to_en-US,names,informal,code
+builtin = clear,rare,names,informal,code
 check-filenames =
 check-hidden =
 ignore-words-list = deque,warmup,stdio,copyable,combinate
-skip = */.git,*/build,*/prefix,*/vcpkg,*/_build,*/bench
+skip = */.git,*/build,*/.legacy
 quiet-level = 2
diff --git a/.gemini/settings.json b/.gemini/settings.json
new file mode 100644
index 000000000..b8dce87f3
--- /dev/null
+++ b/.gemini/settings.json
@@ -0,0 +1,8 @@
+{
+  "context": {
+    "fileName": "AGENTS.md"
+  },
+  "ui": {
+    "hideBanner": true
+  }
+}
\ No newline at end of file
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 000000000..b45ddfb1d
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,31 @@
+name: Documentation
+on:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+jobs:
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/configure-pages@v6
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v6
+        with:
+          python-version: 3.x
+      - run: pip install zensical
+      - run: zensical build --clean
+      - uses: actions/upload-pages-artifact@v5
+        with:
+          path: build/site
+      - uses: actions/deploy-pages@v5
+        id: deployment
diff --git a/.github/workflows/linear.yml b/.github/workflows/linear.yml
new file mode 100644
index 000000000..8d1ba0975
--- /dev/null
+++ b/.github/workflows/linear.yml
@@ -0,0 +1,33 @@
+name: Linear History
+
+on:
+  pull_request:
+    branches: ["modules"]
+  workflow_dispatch:
+
+jobs:
+  check-linear-history:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          fetch-depth: 0
+
+      - name: Check for merge commits # fails the job when base..HEAD contains a merge commit
+        run: |
+          BASE_REF="${{ github.base_ref || 'modules' }}"
+          echo "Comparing against base: $BASE_REF"
+          git fetch origin "$BASE_REF:$BASE_REF"
+          MERGE_COMMITS=$(git rev-list --merges "$BASE_REF..HEAD")
+          if [ -n "$MERGE_COMMITS" ]; then
+            echo "Error: Merge commits detected. libfork requires a linear history."
+            echo "Please rebase your branch onto $BASE_REF to remove merge commits."
+            echo ""
+            echo "Merge commits found:"
+            git log --merges --oneline "$BASE_REF..HEAD"
+            exit 1
+          else
+            echo "No merge commits detected. Linear history check passed."
+          fi
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000..82cfdc9a7
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,28 @@
+name: Lint
+
+on:
+  push:
+    branches: ["modules"]
+  pull_request:
+    branches: ["modules"]
+  workflow_dispatch:
+
+jobs:
+  lint:
+    runs-on: macos-latest
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Homebrew
+        uses: Homebrew/actions/setup-homebrew@main
+
+      - name: Install Dependencies
+        run: brew install clang-format codespell
+
+      - name: Run codespell
+        run: codespell
+
+      - name: Run clang-format # check-only: --dry-run reports diffs, --Werror turns them into failures
+        run: |
+          find src include test benchmark/src \( -name "*.cpp" -o -name "*.hpp" -o -name "*.cxx" \) -print0 | xargs -0 clang-format --dry-run --Werror
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
new file mode 100644
index 000000000..4904ae1e5
--- /dev/null
+++ b/.github/workflows/linux.yml
@@ -0,0 +1,35 @@
+name: Linux
+
+on:
+  push:
+    branches: ["modules"]
+  pull_request:
+    branches: ["modules"]
+  workflow_dispatch:
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        preset: [ci-hardened, ci-release, ci-no-except-rtti]
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Homebrew
+        uses: Homebrew/actions/setup-homebrew@main
+
+      - name: Install Dependencies
+        run: brew install cmake ninja gcc binutils catch2 google-benchmark
+
+      - name: Configure
+        run: cmake --preset ${{ matrix.preset }}
+          -DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake
+
+      - name: Build
+        run: cmake --build --preset ${{ matrix.preset }}
+
+      - name: Test
+        run: ctest --preset ${{ matrix.preset }}
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
new file mode 100644
index 000000000..17eb53779
--- /dev/null
+++ b/.github/workflows/macos.yml
@@ -0,0 +1,35 @@
+name: MacOS
+
+on:
+  push:
+    branches: ["modules"]
+  pull_request:
+    branches: ["modules"]
+  workflow_dispatch:
+
+jobs:
+  build-and-test:
+    runs-on: macos-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        preset: [ci-hardened, ci-release, ci-no-except-rtti, ci-sanitize]
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Homebrew
+        uses: Homebrew/actions/setup-homebrew@main
+
+      - name: Install Dependencies
+        run: brew install cmake ninja llvm catch2 google-benchmark
+
+      - name: Configure
+        run: cmake --preset ${{ matrix.preset }}
+          -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+
+      - name: Build
+        run: cmake --build --preset ${{ matrix.preset }}
+
+      - name: Test
+        run: ctest --preset ${{ matrix.preset }}
diff --git a/.gitignore b/.gitignore
index 47a6b5d9d..2e43ca3a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,27 +4,7 @@
 .cache/
 
 build/
-_build/
-cmake-build-*/
-prefix/
-old/
-Testing/
-
-docs/index.rst
-
-bench/data/
-*.svg
-
-mem.log
-memory.csv
-memory.*.csv
-test.pdf
-gmon.out
-out.png
-output.png
 
 **/.DS_Store
 
 compile_commands.json
-CMakeLists.txt.user
-CMakeUserPresets.json
diff --git a/.legacy/docs/_static/android-chrome-192x192.png b/.legacy/docs/_static/android-chrome-192x192.png
deleted file mode 100644
index 20d2ba4eb..000000000
Binary files a/.legacy/docs/_static/android-chrome-192x192.png and /dev/null differ
diff --git a/.legacy/docs/_static/android-chrome-512x512.png b/.legacy/docs/_static/android-chrome-512x512.png
deleted file mode 100644
index de0ebe9ef..000000000
Binary files a/.legacy/docs/_static/android-chrome-512x512.png and /dev/null differ
diff --git a/.legacy/docs/_static/apple-touch-icon.png b/.legacy/docs/_static/apple-touch-icon.png
deleted file mode 100644
index 39e94423b..000000000
Binary files a/.legacy/docs/_static/apple-touch-icon.png and /dev/null differ
diff --git a/.legacy/docs/_static/favicon-16x16.png b/.legacy/docs/_static/favicon-16x16.png
deleted file mode 100644
index a3930b408..000000000
Binary files a/.legacy/docs/_static/favicon-16x16.png and /dev/null differ
diff --git a/.legacy/docs/_static/favicon-32x32.png b/.legacy/docs/_static/favicon-32x32.png
deleted file mode 100644
index 2d85ce719..000000000
Binary files a/.legacy/docs/_static/favicon-32x32.png and /dev/null differ
diff --git a/.legacy/docs/_static/favicon.ico b/.legacy/docs/_static/favicon.ico
deleted file mode 100644
index 6824cb2cc..000000000
Binary files a/.legacy/docs/_static/favicon.ico and /dev/null differ
diff --git a/.legacy/include/libfork/core/macro.hpp b/.legacy/include/libfork/core/macro.hpp
index 0944c1644..e42fcfc18 100644
--- a/.legacy/include/libfork/core/macro.hpp
+++ b/.legacy/include/libfork/core/macro.hpp
@@ -61,17 +61,6 @@
   #define LF_STATIC_CONST const
 #endif
 
-// clang-format off
-
-/**
- * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/requires/decltype specifiers.
- * 
- * This macro is not truly variadic but the ``...`` allows commas in the macro argument.
- */
-#define LF_HOF_RETURNS(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) requires requires { __VA_ARGS__; } { return __VA_ARGS__;}
-
-// clang-format on
-
 /**
  * @brief __[public]__ Detects if the compiler has exceptions enabled.
  *
@@ -192,28 +181,6 @@ using std::unreachable;
   #define LF_ASSERT(expr) LF_ASSUME(expr)
 #endif
 
-/**
- * @brief Macro to prevent a function to be inlined.
- */
-#if !defined(LF_NOINLINE)
-  #if defined(_MSC_VER) && !defined(__clang__)
-    #define LF_NOINLINE __declspec(noinline)
-  #elif defined(__GNUC__) && __GNUC__ > 3
-  // Clang also defines __GNUC__ (as 4)
-    #if defined(__CUDACC__)
-  // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392
-      #define LF_NOINLINE __attribute__((noinline))
-    #elif defined(__HIP__)
-  // See https://github.com/boostorg/config/issues/392
-      #define LF_NOINLINE __attribute__((noinline))
-    #else
-      #define LF_NOINLINE __attribute__((__noinline__))
-    #endif
-  #else
-    #define LF_NOINLINE
-  #endif
-#endif
-
 /**
  * @brief Force no-inline for clang, works-around https://github.com/llvm/llvm-project/issues/63022.
  *
@@ -229,28 +196,6 @@ using std::unreachable;
   #define LF_CLANG_TLS_NOINLINE
 #endif
 
-/**
- * @brief Macro to use next to 'inline' to force a function to be inlined.
- *
- * \rst
- *
- * .. note::
- *
- *    This does not imply the c++'s `inline` keyword which also has an effect on linkage.
- *
- * \endrst
- */
-#if !defined(LF_FORCEINLINE)
-  #if defined(_MSC_VER) && !defined(__clang__)
-    #define LF_FORCEINLINE __forceinline
-  #elif defined(__GNUC__) && __GNUC__ > 3
-  // Clang also defines __GNUC__ (as 4)
-    #define LF_FORCEINLINE __attribute__((__always_inline__))
-  #else
-    #define LF_FORCEINLINE
-  #endif
-#endif
-
 #if defined(__clang__) && defined(__has_attribute)
   /**
    * @brief Compiler specific attribute.
diff --git a/.python-version b/.python-version
new file mode 100644
index 000000000..24ee5b1be
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 000000000..cb1e1f47d
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,206 @@
+# Libfork Copilot Instructions
+
+## Project Overview
+
+**libfork** is a continuation-stealing coroutine-tasking library implementing
+strict fork-join parallelism using C++20 coroutines.
+
+- **Type**: C++ library with module/`import std` support
+- **Languages**: C++26
+
+## Critical Build Requirements
+
+### Compiler & Module Support
+
+This project **requires C++23's `import std`** and **MUST** use the appropriate
+toolchain file:
+
+- **MacOS**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake`
+- **Linux**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake`
+
+**Common Error**: Without the toolchain file, CMake will fail.
+
+**Always include the toolchain file** in configure commands.
+
+### Dependencies (Homebrew)
+
+Make sure Homebrew is installed and `brew` is in your `PATH`:
+
+```bash
+brew --version
+```
+
+**Required for building/testing:**
+
+- `cmake`
+- `ninja`
+- `catch2`
+- `google-benchmark`
+- `clang-format`
+- `codespell`
+
+On MacOS, also required:
+
+- `llvm`
+
+On Linux, also required:
+
+- `gcc`
+- `binutils`
+
+Install all at once (MacOS):
+
+```bash
+brew install cmake ninja catch2 google-benchmark clang-format codespell llvm
+```
+
+Install all at once (Linux):
+
+```bash
+brew install cmake ninja catch2 google-benchmark clang-format codespell gcc binutils
+```
+
+## Build & Test Workflow
+
+### 1. Configure
+
+Always use presets with the toolchain file:
+
+```bash
+cmake --preset <preset-name> -DCMAKE_TOOLCHAIN_FILE=cmake/<toolchain>.cmake
+```
+
+**Relevant available presets** (from `CMakePresets.json`):
+
+- `ci-hardened` - Debug build with warnings and hardening flags
+- `ci-release` - Optimized release build
+
+All presets enable developer mode (`libfork_DEV_MODE=ON`) and use the Ninja generator.
+
+You should use the `ci-hardened` preset for development/testing and
+`ci-release` for benchmarking.
+
+### 2. Build
+
+```bash
+cmake --build --preset <preset-name>
+```
+
+**Build warnings** (expected and safe):
+
+- "It is recommended to build benchmarks in Release mode" - only relevant for `ci-hardened`
+- CMake experimental `import std;` warning - expected for C++23's `import std`
+
+### 3. Test
+
+```bash
+ctest --preset <preset-name>
+```
+
+All tests should pass. If tests fail, check that:
+
+- Configuration used the correct toolchain file
+- Build completed without errors
+- Any changes you have made are correct
+
+## Project Structure
+
+### Source Layout
+
+```sh
+libfork/
+├── cmake/                    # CMake utilities
+├── include/libfork/**/*.hpp  # Public headers (macros, version)
+├── src/                      # C++26 module source files (.cxx) and impl (.cpp)
+│   ├── libfork.cxx           # libfork — meta-module, re-exports all public modules
+│   ├── utils/                # libfork.utils — internal utilities (not public API)
+│   │   ├── utils.cxx         #   aggregator
+│   │   └── *.cxx             #   :partitions
+│   ├── core/                 # libfork.core — core task/scheduler primitives
+│   │   ├── core.cxx          #   aggregator
+│   │   └── *.cxx             #   :partitions
+│   ├── batteries/            # libfork.batteries — stacks, contexts, adaptors
+│   │   ├── batteries.cxx     #   aggregator
+│   │   └── *.cxx             #   :partitions
+│   └── schedulers/           # libfork.schedulers — concrete schedulers
+│       ├── schedulers.cxx    #   aggregator
+│       └── *.cxx             #   :partitions
+├── test/src/**/              # Test suite (Catch2) — uses `import libfork;`
+│   └── *.cpp
+├── benchmark/                # Benchmarking suite (google-benchmark)
+│   ├── lib/                  # Shared benchmark utilities and definitions
+│   │   ├── *.hpp             #   headers
+│   │   └── *.cpp             #   common source
+│   ├── src/                  # Implementation-specific benchmarks
+│   │   ├── libfork/          #   libfork-based benchmarks
+│   │   ├── serial/           #   serial benchmarks
+│   │   └── */                #   Other library benchmarks (e.g. OpenMP, TBB, Cilk Plus)
+│   └── external/             # External benchmark code (e.g. UTS)
+├── .github/workflows/        # CI workflows
+│   ├── linux.yml             # Linux builds
+│   ├── macos.yml             # MacOS builds
+│   ├── lint.yml              # Linting
+│   └── linear.yml            # Enforces linear history (no merge commits)
+└── CMakeLists.txt            # Main build configuration
+```
+
+## Workflows
+
+### Workflow Command Pattern
+
+All workflows follow this pattern:
+
+```yaml
+- Install Dependencies: brew install ...
+- Configure: cmake --preset <preset> -DCMAKE_TOOLCHAIN_FILE=<toolchain>.cmake
+- Build: cmake --build --preset <preset>
+- Test: ctest --preset <preset>
+```
+
+## Common Development Tasks
+
+### Making Code Changes
+
+1. **Modify source files** in `src/`, `include/`, `test/`, or `benchmark/`
+2. **Rebuild**: `cmake --build --preset <your-preset>`
+3. **Test**: `ctest --preset <your-preset>`
+
+#### Adding/removing files from `src/` or `include/`
+
+- Update the root `CMakeLists.txt` with new/removed files.
+
+#### Adding/removing files from benchmarks
+
+- Update the relevant `CMakeLists.txt` in `benchmark/lib/` or `benchmark/src/<impl>/`.
+
+### Adding Tests
+
+Strive to add tests for new features/bug fixes.
+
+- Add `.cpp` files to `test/src/`
+- Tests are auto-discovered by CMake (GLOB_RECURSE)
+- Tests link against `libfork::libfork` and `Catch2::Catch2WithMain`
+
+### Modifying Build Configuration
+
+**Warning**: Module-related changes are complex. Test thoroughly with clean builds.
+
+## Troubleshooting
+
+### Build Failures
+
+**Problem**: Configuration/Build fails after adding/removing files or modifying CMakeLists.txt
+**Solution**: Try a clean build directory:
+
+```bash
+rm -rf build/
+```
+
+**Problem**: "compiler does not provide a way to discover the import graph"
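+**Solution**: Add the toolchain file for your platform to the configure command: `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` (MacOS) or `-DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake` (Linux)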
+
+**Problem**: "Could not find 'brew' executable"
+**Solution**: Install Homebrew
+
+**Problem**: "Could not automatically find libc++.modules.json"
+**Solution**: Ensure LLVM is installed via Homebrew; toolchain auto-detects the path
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000..68490f0c1
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,3 @@
+# In ./CLAUDE.md
+
+@AGENTS.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d193bc879..107cc5dd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,38 +1,131 @@
-cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR)
+cmake_minimum_required(VERSION 4.3 FATAL_ERROR)
 
 # See `Help/dev/experimental.rst`
-set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "d0edc3af-4c50-42ea-a356-e2862fe7a444")
+set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "451f2fe2-a8a2-47c3-bc32-94786d8fc91b")
 
 include(cmake/read_version.cmake)
 
-read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/core/macro.hpp)
+read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/version.hpp)
 
 project(
   libfork
   VERSION ${version_major}.${version_minor}.${version_patch}
-  DESCRIPTION "A bleeding-edge, lock-free, wait-free, continuation-stealing fork-join library built on C++20's coroutines."
   LANGUAGES CXX
 )
 
+# ---- Project options ----
+
+option(libfork_DEV_MODE "Enable developer build (tests/benchmarks/etc) for libfork" OFF)
+
+# ---- System dependencies ----
+
+find_package(Threads REQUIRED)
+
+# ===========================
+
 # Tell CMake that we explicitly want `import std`. This will initialize the
 # property on all targets declared after this to 1
 # TODO: set property per target
 set(CMAKE_CXX_MODULE_STD 1)
 
-# Make a library.
-add_library(uses_std STATIC)
+add_library(libfork_libfork)
+add_library(libfork::libfork ALIAS libfork_libfork)
+
+target_link_libraries(libfork_libfork PUBLIC Threads::Threads)
 
-# Add sources.
-target_sources(uses_std PRIVATE uses_std.cxx)
+set_property(TARGET libfork_libfork PROPERTY EXPORT_NAME libfork)
 
-# Tell CMake we're using C++23 but only C++20 is needed to consume it.
-target_compile_features(uses_std INTERFACE cxx_std_23)
+target_compile_features(libfork_libfork PUBLIC cxx_std_26)
 
-# Make an executable.
-add_executable(main)
+# Public headers, __impl must be public because consumers need
+# them to build the module BMI
+target_sources(libfork_libfork
+  PUBLIC
+    FILE_SET HEADERS FILES
+      include/libfork/version.hpp
+      include/libfork/__impl/compiler.hpp
+      include/libfork/__impl/exception.hpp
+      include/libfork/__impl/utils.hpp
+      include/libfork/__impl/assume.hpp
+    BASE_DIRS
+      include
+)
 
-target_sources(main PRIVATE main.cxx)
-target_link_libraries(main PRIVATE uses_std)
+# Add the module files to the library, must be public because
+# consumers will need to build the BMI
+target_sources(libfork_libfork
+  PUBLIC
+    FILE_SET CXX_MODULES FILES
+      # libfork (meta)
+      src/libfork.cxx
+      # libfork.utils
+      src/utils/utils.cxx
+      src/utils/utility.cxx
+      src/utils/constants.cxx
+      src/utils/tuple.cxx
+      src/utils/concepts.cxx
+      src/utils/defer.cxx
+      src/utils/uninitialized.cxx
+      # libfork.core
+      src/core/core.cxx
+      src/core/exception.cxx
+      src/core/concepts/stack.cxx
+      src/core/concepts/context.cxx
+      src/core/concepts/scheduler.cxx
+      src/core/concepts/invocable.cxx
+      src/core/concepts/awaitable.cxx
+      src/core/concepts/indirect.cxx
+      src/core/concepts/semigroup.cxx
+      src/core/frame.cxx
+      src/core/task.cxx
+      src/core/ops.cxx
+      src/core/poly_context.cxx
+      src/core/thread_locals.cxx
+      src/core/schedule.cxx
+      src/core/handles.cxx
+      src/core/root.cxx
+      src/core/execute.cxx
+      src/core/receiver.cxx
+      src/core/final_suspend.cxx
+      src/core/awaitables.cxx
+      src/core/promise.cxx
+      src/core/stop.cxx
+      src/core/projected.cxx
+      src/core/lift.cxx
+      # libfork.batteries
+      src/batteries/batteries.cxx
+      src/batteries/deque.cxx
+      src/batteries/adaptors.cxx
+      src/batteries/contexts.cxx
+      src/batteries/geometric_stack.cxx
+      src/batteries/adaptor_stack.cxx
+      src/batteries/slab_stack.cxx
+      # libfork.schedulers
+      src/schedulers/schedulers.cxx
+      src/schedulers/inline.cxx
+      src/schedulers/busy.cxx
+      # libfork.algorithm
+      src/algorithm/algorithm.cxx
+      src/algorithm/for_each.cxx
+      src/algorithm/fold.cxx
+      src/algorithm/concepts.cxx
+    PRIVATE
+      src/exception.cpp
+)
+
+# ======================
+
+if(libfork_DEV_MODE)
+
+  include(CTest) # Enables the BUILD_TESTING option
+
+  if(BUILD_TESTING)
+    add_subdirectory(test)
+  endif()
+
+  add_subdirectory(benchmark)
+
+endif()
 
 # list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 #
@@ -43,9 +136,6 @@ target_link_libraries(main PRIVATE uses_std)
 #
 # message(STATUS "CMAKE_BUILD_TYPE is set to '${CMAKE_BUILD_TYPE}'")
 #
-# # ---- System dependencies ----
-#
-# find_package(Threads REQUIRED)
 #
 # # ------ Declare library ------
 #
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 000000000..ac23b1e37
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,126 @@
+{
+  "version": 10,
+  "configurePresets": [
+    {
+      "name": "cmake-pedantic",
+      "hidden": true,
+      "warnings": {
+        "dev": true,
+        "deprecated": true,
+        "uninitialized": true,
+        "unusedCli": true,
+        "systemVars": false
+      },
+      "errors": {
+        "deprecated": true
+      }
+    },
+    {
+      "name": "ci-base",
+      "inherits": "cmake-pedantic",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build/${presetName}",
+      "cacheVariables": {
+        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+        "libfork_DEV_MODE": "ON"
+      }
+    },
+    {
+      "name": "ci-hardened",
+      "inherits": "ci-base",
+      "displayName": "Debug with warnings and hardening",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Debug",
+        "CMAKE_CXX_FLAGS": "-O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -Wcast-qual -Wformat -Wformat=2 -Wundef -Werror=float-equal -Wshadow -Wcast-align -Wunused -Wnull-dereference -Wdouble-promotion -Wimplicit-fallthrough -Wextra-semi -Woverloaded-virtual -Wnon-virtual-dtor -Wold-style-cast -Werror=format-security -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS -fstrict-flex-arrays=3 -fstack-protector-strong -Wno-missing-braces -Wno-missing-field-initializers -Wno-c2y-extensions"
+      }
+    },
+    {
+      "name": "ci-release",
+      "inherits": "ci-base",
+      "displayName": "Release",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -falign-functions=64"
+      }
+    },
+    {
+      "name": "ci-no-except-rtti",
+      "inherits": "ci-base",
+      "displayName": "Release no RTTI or exceptions",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -fno-exceptions -fno-rtti -falign-functions=64"
+      }
+    },
+    {
+      "name": "ci-sanitize",
+      "inherits": "ci-base",
+      "displayName": "Debug with sanitizers",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Sanitize",
+        "CMAKE_CXX_FLAGS": "-O2 -g -fsanitize=address,undefined -fno-omit-frame-pointer -fno-common -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "ci-hardened",
+      "configurePreset": "ci-hardened"
+    },
+    {
+      "name": "ci-release",
+      "configurePreset": "ci-release"
+    },
+    {
+      "name": "ci-no-except-rtti",
+      "configurePreset": "ci-no-except-rtti"
+    },
+    {
+      "name": "ci-sanitize",
+      "configurePreset": "ci-sanitize"
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "ci-hardened",
+      "configurePreset": "ci-hardened",
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "stopOnFailure": true
+      }
+    },
+    {
+      "name": "ci-release",
+      "configurePreset": "ci-release",
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "stopOnFailure": true
+      }
+    },
+    {
+      "name": "ci-no-except-rtti",
+      "configurePreset": "ci-no-except-rtti",
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "stopOnFailure": true
+      }
+    },
+    {
+      "name": "ci-sanitize",
+      "configurePreset": "ci-sanitize",
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "stopOnFailure": true
+      }
+    }
+  ]
+}
diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json
new file mode 100644
index 000000000..26e650168
--- /dev/null
+++ b/CMakeUserPresets.json
@@ -0,0 +1,79 @@
+{
+  "version": 10,
+  "configurePresets": [
+    {
+      "name": "dev",
+      "inherits": "ci-hardened",
+      "displayName": "Hardened development build",
+      "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake",
+      "cacheVariables": {
+        "CMAKE_COLOR_DIAGNOSTICS": "ON"
+      }
+    },
+    {
+      "name": "bench",
+      "inherits": "ci-release",
+      "displayName": "Release build for benchmarks",
+      "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake",
+      "cacheVariables": {
+        "CMAKE_COLOR_DIAGNOSTICS": "ON"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "dev",
+      "configurePreset": "dev"
+    },
+    {
+      "name": "bench",
+      "configurePreset": "bench"
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "dev",
+      "configurePreset": "dev",
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "stopOnFailure": true
+      }
+    }
+  ],
+  "workflowPresets": [
+    {
+      "name": "dev",
+      "displayName": "Development Debug Hardened Workflow",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "dev"
+        },
+        {
+          "type": "build",
+          "name": "dev"
+        },
+        {
+          "type": "test",
+          "name": "dev"
+        }
+      ]
+    },
+    {
+      "name": "bench",
+      "displayName": "Release Build (including Benchmarks)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "bench"
+        },
+        {
+          "type": "build",
+          "name": "bench"
+        }
+      ]
+    }
+  ]
+}
diff --git a/.legacy/LICENSE.md b/LICENSE.md
similarity index 100%
rename from .legacy/LICENSE.md
rename to LICENSE.md
diff --git a/actions/setup/action.yaml b/actions/setup/action.yaml
deleted file mode 100644
index cd451a14b..000000000
--- a/actions/setup/action.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-name: 'setup'
-description: 'setup vcpkg/cmake/ninja and caching'
-
-runs:
-  using: "composite"
-
-  steps:
-    # Set env vars needed for vcpkg to leverage the GitHub Action cache as a storage for Binary Caching.
-    - uses: actions/github-script@v6
-      with:
-        script: |
-          core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
-          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
-
-    - uses: actions/checkout@v3
-      with:
-        submodules: true
-    - name: "Create directory '${{ env.VCPKG_DEFAULT_BINARY_CACHE }}'"
-      run: mkdir -p $VCPKG_DEFAULT_BINARY_CACHE
-      shell: bash
-
-    # Setup the build machine with the most recent versions of CMake and Ninja. 
-    # Both are cached if not already: on subsequent runs both will be quickly restored from GitHub cache service.
-    - uses: lukka/get-cmake@latest
-
-    # Restore vcpkg from the GitHub Action cache service. 
-    # Note that packages are restored by vcpkg's binary caching when it is being run afterwards by CMake.
-    - name: Restore vcpkg
-      uses: actions/cache@v3
-      with:
-        # The first path is the location of vcpkg: it contains the vcpkg executable and data files, as long as the
-        # built package archives (aka binary cache) which are located by VCPKG_DEFAULT_BINARY_CACHE env var.
-        # The other paths starting with '!' are exclusions: they contain temporary files generated 
-        # during the build of the installed packages.
-        path: |
-          ${{ env.VCPKG_ROOT_DIR }}
-          !${{ env.VCPKG_ROOT_DIR }}/buildtrees
-          !${{ env.VCPKG_ROOT_DIR }}/packages
-          !${{ env.VCPKG_ROOT_DIR }}/downloads
-          !${{ env.VCPKG_ROOT_DIR }}/installed
-        # The key is composed in a way that it gets properly invalidated whenever a different version of vcpkg is being used.
-        key: |
-          ${{ hashFiles( '.git/modules/vcpkg/HEAD' )}}
-
-    # On Windows runners, let's ensure to have the Developer Command Prompt environment setup correctly.
-    # As used here the Developer Command Prompt created is targeting x64 and using the default the Windows SDK.
-    - uses: ilammy/msvc-dev-cmd@v1
-
-    - name: Setup xcode
-      if: matrix.os == 'macos-13'
-      shell: bash
-      run: sudo xcode-select --switch /Applications/Xcode_15.0.app/Contents/Developer 
\ No newline at end of file
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 000000000..5ce1a7480
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR)
+
+project(libfork_benchmark LANGUAGES CXX)
+
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Release")
+  message(WARNING "It is recommended to build benchmarks in Release mode for accurate results.")
+endif()
+
+# ---- Dependencies ----
+
+find_package(benchmark REQUIRED)
+
+# ---- Benchmarks ----
+
+add_subdirectory(lib)
+
+add_subdirectory(src/serial)
+add_subdirectory(src/baremetal)
+add_subdirectory(src/libfork)
+
+# WHOLE_ARCHIVE ensures benchmark registrations (global initialisers) are not
+# dropped by the linker when pulling objects from the static libraries above.
+add_executable(libfork_benchmark src/benchmarks.cpp)
+
+target_link_libraries(libfork_benchmark
+  PRIVATE
+    $<LINK_LIBRARY:WHOLE_ARCHIVE,serial_benchmarks,baremetal_benchmarks,libfork_benchmarks>
+    benchmark::benchmark_main
+)
+
+if(BUILD_TESTING)
+  add_test(NAME Benchmark
+    COMMAND libfork_benchmark --benchmark_dry_run --benchmark_filter=^test/
+  )
+endif()
+
+# ---- OpenMP Benchmarks (optional) ----
+
+# Not REQUIRED: the guard below skips these benchmarks when OpenMP is absent.
+find_package(OpenMP)
+
+if(OpenMP_CXX_FOUND)
+  add_subdirectory(src/openmp)
+
+  target_link_libraries(libfork_benchmark
+    PRIVATE
+      $<LINK_LIBRARY:WHOLE_ARCHIVE,openmp_benchmarks>
+  )
+endif()
+
+
diff --git a/benchmark/external/uts/CMakeLists.txt b/benchmark/external/uts/CMakeLists.txt
new file mode 100644
index 000000000..89c8b5e6e
--- /dev/null
+++ b/benchmark/external/uts/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR)
+
+project(uts_external LANGUAGES C)
+
+add_library(uts_c)
+
+target_sources(uts_c
+  PRIVATE
+    src/uts.c
+    src/rng/brg_sha1.c
+  PUBLIC
+    FILE_SET HEADERS
+    BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
+    FILES
+      include/uts/uts.h
+      include/uts/rng/rng.h
+      include/uts/rng/brg_sha1.h
+      include/uts/rng/brg_types.h
+)
diff --git a/benchmark/external/uts/include/uts/rng/brg_sha1.h b/benchmark/external/uts/include/uts/rng/brg_sha1.h
new file mode 100644
index 000000000..d30f12c0d
--- /dev/null
+++ b/benchmark/external/uts/include/uts/rng/brg_sha1.h
@@ -0,0 +1,100 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+*/
+
+#ifndef _SHA1_H
+#define _SHA1_H
+
+#include "uts/rng/brg_types.h"
+
+#define SHA1_BLOCK_SIZE 64
+#define SHA1_DIGEST_SIZE 20
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/** BEGIN: UTS RNG Harness **/
+
+#define POS_MASK 0x7fffffff
+#define HIGH_BITS 0x80000000
+
+#define sha1_context sha1_ctx_s
+
+/**********************************/
+/* random number generator state  */
+/**********************************/
+struct state_t {
+  uint_8t state[20];
+};
+
+typedef uint_8t RNG_state;
+
+/***************************************/
+/* random number generator operations  */
+/***************************************/
+void rng_init(RNG_state *state, int seed);
+void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnNumber);
+int rng_rand(RNG_state *mystate);
+int rng_nextrand(RNG_state *mystate);
+char *rng_showstate(RNG_state *state, char *s);
+int rng_showtype(char *strBuf, int ind);
+
+/** END: UTS RNG Harness **/
+/* type to hold the SHA1 context  */
+
+struct sha1_ctx_s {
+  uint_32t count[2];
+  uint_32t hash[5];
+  uint_32t wbuf[16];
+};
+
+typedef struct sha1_ctx_s sha1_ctx;
+
+/* Note that these prototypes are the same for both bit and */
+/* byte oriented implementations. However the length fields */
+/* are in bytes or bits as appropriate for the version used */
+/* and bit sequences are input as arrays of bytes in which  */
+/* bit sequences run from the most to the least significant */
+/* end of each byte                                         */
+
+VOID_RETURN sha1_compile(sha1_ctx ctx[1]);
+
+VOID_RETURN sha1_begin(sha1_ctx ctx[1]);
+VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]);
+VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]);
+VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/benchmark/external/uts/include/uts/rng/brg_types.h b/benchmark/external/uts/include/uts/rng/brg_types.h
new file mode 100644
index 000000000..9532acce6
--- /dev/null
+++ b/benchmark/external/uts/include/uts/rng/brg_types.h
@@ -0,0 +1,214 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* Try one of these if things don't work automatically */
+#ifdef BRG_C99_TYPES
+  #include <inttypes.h>
+  #include <stdint.h>
+  #define BRG_UI8
+typedef uint8_t uint_8t;
+  #define BRG_UI16
+typedef uint16_t uint_16t;
+  #define BRG_UI32
+  #define li_32(h) 0x##h##u
+typedef uint32_t uint_32t;
+  #define BRG_UI64
+  #define li_64(h) 0x##h##u
+typedef uint64_t uint_64t;
+
+#elif defined(BRG_STD_TYPES)
+  #include <sys/types.h>
+  #define BRG_UI8
+typedef u_int8_t uint_8t;
+  #define BRG_UI16
+typedef u_int16_t uint_16t;
+  #define BRG_UI32
+  #define li_32(h) 0x##h##u
+typedef u_int32_t uint_32t;
+  #define BRG_UI64
+  #define li_64(h) 0x##h##u
+typedef u_int64_t uint_64t;
+
+#endif
+
+#ifndef BRG_UI8
+  #define BRG_UI8
+  #if UCHAR_MAX == 255u
+typedef unsigned char uint_8t;
+  #else
+    #error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+  #endif
+#endif
+
+#ifndef BRG_UI16
+  #define BRG_UI16
+  #if USHRT_MAX == 65535u
+typedef unsigned short uint_16t;
+  #else
+    #error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+  #endif
+#endif
+
+#ifndef BRG_UI32
+  #define BRG_UI32
+  #if UINT_MAX == 4294967295u
+    #define li_32(h) 0x##h##u
+typedef unsigned int uint_32t;
+  #elif ULONG_MAX == 4294967295u
+    #define li_32(h) 0x##h##ul
+typedef unsigned long uint_32t;
+  #elif defined(_CRAY)
+    #error This code needs 32-bit data types, which Cray machines do not provide
+  #else
+    #error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+  #endif
+#endif
+
+#ifndef BRG_UI64
+  #if defined(__BORLANDC__) && !defined(__MSDOS__)
+    #define BRG_UI64
+    #define li_64(h) 0x##h##ull
+typedef unsigned __int64 uint_64t;
+  #elif defined(_MSC_VER) && (_MSC_VER < 1300) /* 1300 == VC++ 7.0 */
+    #define BRG_UI64
+    #define li_64(h) 0x##h##ui64
+typedef unsigned __int64 uint_64t;
+  #elif defined(__sun) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+    #define BRG_UI64
+    #define li_64(h) 0x##h##ull
+typedef unsigned long long uint_64t;
+  #elif defined(UINT_MAX) && UINT_MAX > 4294967295u
+    #if UINT_MAX == 18446744073709551615u
+      #define BRG_UI64
+      #define li_64(h) 0x##h##u
+typedef unsigned int uint_64t;
+    #endif
+  #elif defined(ULONG_MAX) && ULONG_MAX > 4294967295u
+    #if ULONG_MAX == 18446744073709551615ul
+      #define BRG_UI64
+      #define li_64(h) 0x##h##ul
+typedef unsigned long uint_64t;
+    #endif
+  #elif defined(ULLONG_MAX) && ULLONG_MAX > 4294967295u
+    #if ULLONG_MAX == 18446744073709551615ull
+      #define BRG_UI64
+      #define li_64(h) 0x##h##ull
+typedef unsigned long long uint_64t;
+    #endif
+  #elif defined(ULONG_LONG_MAX) && ULONG_LONG_MAX > 4294967295u
+    #if ULONG_LONG_MAX == 18446744073709551615ull
+      #define BRG_UI64
+      #define li_64(h) 0x##h##ull
+typedef unsigned long long uint_64t;
+    #endif
+  #endif
+#endif
+
+#if defined(NEED_UINT_64T) && !defined(BRG_UI64)
+  #error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+#ifndef RETURN_VALUES
+  #define RETURN_VALUES
+  #if defined(DLL_EXPORT)
+    #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+      #define VOID_RETURN __declspec(dllexport) void __stdcall
+      #define INT_RETURN __declspec(dllexport) int __stdcall
+    #elif defined(__GNUC__)
+      #define VOID_RETURN __declspec(__dllexport__) void
+      #define INT_RETURN __declspec(__dllexport__) int
+    #else
+      #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+    #endif
+  #elif defined(DLL_IMPORT)
+    #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+      #define VOID_RETURN __declspec(dllimport) void __stdcall
+      #define INT_RETURN __declspec(dllimport) int __stdcall
+    #elif defined(__GNUC__)
+      #define VOID_RETURN __declspec(__dllimport__) void
+      #define INT_RETURN __declspec(__dllimport__) int
+    #else
+      #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+    #endif
+  #elif defined(__WATCOMC__)
+    #define VOID_RETURN void __cdecl
+    #define INT_RETURN int __cdecl
+  #else
+    #define VOID_RETURN void
+    #define INT_RETURN int
+  #endif
+#endif
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a power of 2 and >= 8
+
+    dec_unit_type(size,x)       declares a variable 'x' of length
+                                'size' bits
+
+    dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize'
+                                bytes defined as an array of variables
+                                each of 'size' bits (bsize must be a
+                                multiple of size / 8)
+
+    ptr_cast(x,size)            casts a pointer to a pointer to a
+                                variable of length 'size' bits
+*/
+
+#define ui_type(size) uint_##size##t
+#define dec_unit_type(size, x) typedef ui_type(size) x
+#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)]
+#define ptr_cast(x, size) ((ui_type(size) *)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/benchmark/external/uts/include/uts/rng/rng.h b/benchmark/external/uts/include/uts/rng/rng.h
new file mode 100644
index 000000000..105c40466
--- /dev/null
+++ b/benchmark/external/uts/include/uts/rng/rng.h
@@ -0,0 +1,6 @@
+#ifndef _RNG_H
+#define _RNG_H
+
+#include "uts/rng/brg_sha1.h"
+
+#endif /* _RNG_H */
\ No newline at end of file
diff --git a/benchmark/external/uts/include/uts/uts.h b/benchmark/external/uts/include/uts/uts.h
new file mode 100644
index 000000000..e86e68f3e
--- /dev/null
+++ b/benchmark/external/uts/include/uts/uts.h
@@ -0,0 +1,120 @@
+#ifndef A0179FFF_4078_4EEB_BB6E_1E8C75CC694C
+#define A0179FFF_4078_4EEB_BB6E_1E8C75CC694C
+/*
+ *         ---- The Unbalanced Tree Search (UTS) Benchmark ----
+ *
+ *  Copyright (c) 2010 See AUTHORS file for copyright holders
+ *
+ *  This file is part of the unbalanced tree search benchmark.  This
+ *  project is licensed under the MIT Open Source license.  See the LICENSE
+ *  file for copyright and licensing information.
+ *
+ *  UTS is a collaborative project between researchers at the University of
+ *  Maryland, the University of North Carolina at Chapel Hill, and the Ohio
+ *  State University.  See AUTHORS file for more information.
+ *
+ */
+
+#ifndef _UTS_H
+  #define _UTS_H
+
+  #ifdef __cplusplus
+extern "C" {
+  #endif
+
+  #include "uts/rng/rng.h"
+
+  #define UTS_VERSION "2.1"
+
+  /***********************************************************
+   *  Tree node descriptor and statistics                    *
+   ***********************************************************/
+
+  #define MAXNUMCHILDREN 100 // cap on children (BIN root is exempt)
+
+struct node_t {
+  int type;        // distribution governing number of children
+  int height;      // depth of this node in the tree
+  int numChildren; // number of children, -1 => not yet determined
+
+  /* for RNG state associated with this node */
+  struct state_t state;
+};
+
+typedef struct node_t Node;
+
+/* Tree type
+ *   Trees are generated using a Galton-Watson process, in
+ *   which the branching factor of each node is a random
+ *   variable.
+ *
+ *   The random variable can follow a binomial distribution
+ *   or a geometric distribution.  Hybrid trees are
+ *   generated with geometric distributions near the
+ *   root and binomial distributions towards the leaves.
+ */
+enum uts_trees_e { BIN = 0, GEO, HYBRID, BALANCED };
+enum uts_geoshape_e { LINEAR = 0, EXPDEC, CYCLIC, FIXED };
+
+typedef enum uts_trees_e tree_t;
+typedef enum uts_geoshape_e geoshape_t;
+
+/* Strings for the above enums */
+extern char *uts_trees_str[];
+extern char *uts_geoshapes_str[];
+
+/* Tree  parameters */
+extern tree_t type;
+extern double b_0;
+extern int rootId;
+extern int nonLeafBF;
+extern double nonLeafProb;
+extern int gen_mx;
+extern geoshape_t shape_fn;
+extern double shiftDepth;
+
+/* Benchmark parameters */
+extern int computeGranularity;
+extern int debug;
+extern int verbose;
+
+/* For stats generation: */
+typedef unsigned long long counter_t;
+
+  /* Utility Functions */
+  #define max(a, b) (((a) > (b)) ? (a) : (b))
+  #define min(a, b) (((a) < (b)) ? (a) : (b))
+
+void uts_error(char *str);
+void uts_parseParams(int argc, char **argv);
+int uts_paramsToStr(char *strBuf, int ind);
+void uts_printParams();
+void uts_helpMessage();
+
+void uts_showStats(
+    int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth);
+double uts_wctime();
+
+double rng_toProb(int n);
+
+/* Common tree routines */
+void uts_initRoot(Node *root, int type);
+int uts_numChildren(Node *parent);
+int uts_numChildren_bin(Node *parent);
+int uts_numChildren_geo(Node *parent);
+int uts_childType(Node *parent);
+
+/* Implementation Specific Functions */
+char *impl_getName();
+int impl_paramsToStr(char *strBuf, int ind);
+int impl_parseParam(char *param, char *value);
+void impl_helpMessage();
+void impl_abort(int err);
+
+  #ifdef __cplusplus
+}
+  #endif
+
+#endif /* _UTS_H */
+
+#endif /* A0179FFF_4078_4EEB_BB6E_1E8C75CC694C */
diff --git a/benchmark/external/uts/src/rng/brg_endian.h b/benchmark/external/uts/src/rng/brg_endian.h
new file mode 100644
index 000000000..96082e57b
--- /dev/null
+++ b/benchmark/external/uts/src/rng/brg_endian.h
@@ -0,0 +1,132 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 20/10/2006
+*/
+
+#ifndef BRG_ENDIAN_H
+#define BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN 4321    /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
+
+/* Include files where endian defines and byteswap functions may reside */
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
+  #include <sys/endian.h>
+#elif defined(BSD) && (BSD >= 199103) || defined(__APPLE__) || defined(__CYGWIN32__) ||                      \
+    defined(__DJGPP__) || defined(__osf__)
+  #include <machine/endian.h>
+#elif defined(__linux__) || defined(__GNUC__) || defined(__GNU_LIBRARY__)
+  #if !defined(__MINGW32__) && !defined(__sun__)
+    #include <endian.h>
+    #if !defined(__BEOS__)
+      #include <byteswap.h>
+    #endif
+  #endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+
+#if defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN)
+  #if defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN
+    #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+  #elif defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN
+    #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+  #endif
+#elif defined(BIG_ENDIAN)
+  #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined(LITTLE_ENDIAN)
+  #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN)
+  #if defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN
+    #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+  #elif defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN
+    #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+  #endif
+#elif defined(_BIG_ENDIAN)
+  #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined(_LITTLE_ENDIAN)
+  #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN)
+  #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN
+    #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+  #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN
+    #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+  #endif
+#elif defined(__BIG_ENDIAN)
+  #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined(__LITTLE_ENDIAN)
+  #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined(__BIG_ENDIAN__) && defined(__LITTLE_ENDIAN__)
+  #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __BIG_ENDIAN__
+    #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+  #elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+    #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+  #endif
+#elif defined(__BIG_ENDIAN__)
+  #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined(__LITTLE_ENDIAN__)
+  #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER) /* last resort: guess from compiler and machine macros */
+
+  #if defined(__alpha__) || defined(__alpha) || defined(i386) || defined(__i386__) || defined(_M_I86) ||     \
+      defined(_M_IX86) || defined(__OS2__) || defined(sun386) || defined(__TURBOC__) || defined(vax) ||      \
+      defined(vms) || defined(VMS) || defined(__VMS) || defined(_M_X64) || defined(__x86_64__)
+    #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+  #elif defined(AMIGA) || defined(applec) || defined(__AS400__) || defined(_CRAY) || defined(__hppa) ||      \
+      defined(__hp9000) || defined(ibm370) || defined(mc68000) || defined(m68k) || defined(__MRC__) ||       \
+      defined(__MVS__) || defined(__MWERKS__) || defined(sparc) || defined(__sparc) ||                       \
+      defined(SYMANTEC_C) || defined(__VOS__) || defined(__TIGCC__) || defined(__TANDEM) ||                  \
+      defined(THINK_C) || defined(__VMCMS__)
+    #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+  #elif 0 /* **** EDIT HERE IF NECESSARY **** */
+    #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+  #elif 0 /* **** EDIT HERE IF NECESSARY **** */
+    #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+  #else /* the message names the branches, not line numbers, so it stays right when the file is reformatted */
+    #error Please edit one of the "#elif 0" branches in brg_endian.h to set the platform byte order
+  #endif
+
+#endif /* !defined(PLATFORM_BYTE_ORDER) */
+
+#endif
\ No newline at end of file
diff --git a/benchmark/external/uts/src/rng/brg_sha1.c b/benchmark/external/uts/src/rng/brg_sha1.c
new file mode 100644
index 000000000..f6757bafc
--- /dev/null
+++ b/benchmark/external/uts/src/rng/brg_sha1.c
@@ -0,0 +1,340 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+
+ This is a byte oriented version of SHA1 that operates on arrays of bytes
+ stored in memory.
+*/
+
+#include <stdio.h>
+#include <string.h> /* for memcpy() etc.        */
+
+#include "brg_endian.h"
+#include "uts/rng/brg_sha1.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/** BEGIN: UTS RNG Harness **/
+
+/* Root state = SHA-1 of a 20-byte block: 16 zero bytes, then the seed as 4 bytes, MSB first. */
+void rng_init(RNG_state *newstate, int seed) {
+  struct sha1_context hasher;
+  struct state_t block;
+  int k;
+
+  /* the block is handed to sha1_hash() as bytes, so this clears elements 0..15 */
+  memset(block.state, 0, 16);
+  /* elements 16..19 carry the seed, most significant byte first */
+  for (k = 0; k < 4; k++)
+    block.state[16 + k] = 0xFF & (seed >> (24 - 8 * k));
+
+  sha1_begin(&hasher);
+  sha1_hash(block.state, 20, &hasher);
+  sha1_end(newstate, &hasher);
+}
+
+/* Child state = SHA-1(parent state (20 bytes) followed by spawnnumber as 4 bytes, MSB first). */
+void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber) {
+  struct sha1_context hasher;
+  uint_8t index_be[4];
+  int k;
+
+  for (k = 0; k < 4; k++)
+    index_be[k] = 0xFF & (spawnnumber >> (24 - 8 * k));
+
+  sha1_begin(&hasher);
+  sha1_hash(mystate, 20, &hasher);
+  sha1_hash(index_be, 4, &hasher);
+  sha1_end(newstate, &hasher);
+}
+
+/* Return the value held in the state without advancing it: elements 16..19 are */
+/* combined most significant first and the result is masked with POS_MASK.      */
+int rng_rand(RNG_state *mystate) {
+  /* widen before shifting: an element promoted to int and shifted into bit 31 is undefined */
+  uint_32t b = ((uint_32t)mystate[16] << 24) | ((uint_32t)mystate[17] << 16) |
+               ((uint_32t)mystate[18] << 8) | (uint_32t)mystate[19];
+
+  return (int)(b & POS_MASK);
+}
+
+/* Advance the state in place (state = SHA-1 of its 20 bytes) and return the value */
+/* formed from elements 16..19, most significant first, masked with POS_MASK.      */
+int rng_nextrand(RNG_state *mystate) {
+  struct sha1_context ctx;
+  uint_32t b;
+
+  sha1_begin(&ctx);
+  sha1_hash(mystate, 20, &ctx);
+  sha1_end(mystate, &ctx);
+  /* widen before shifting: an element promoted to int and shifted into bit 31 is undefined */
+  b = ((uint_32t)mystate[16] << 24) | ((uint_32t)mystate[17] << 16) |
+      ((uint_32t)mystate[18] << 8) | (uint_32t)mystate[19];
+  return (int)(b & POS_MASK);
+}
+
+/* condense state into string to display during debugging: state[0], state[1] in hex, then "..." */
+char *rng_showstate(RNG_state *state, char *s) {
+  sprintf(s, "%.2X%.2X...", state[0], state[1]); /* unbounded write: the caller sizes s */
+  return s; /* the caller's buffer, for use inside a printf argument list */
+}
+
+/* describe random number generator type into string: the text is written at */
+/* strBuf + ind and the index just past the written text is returned         */
+int rng_showtype(char *strBuf, int ind) {
+  return ind + sprintf(strBuf + ind, "SHA-1 (state size = %uB)", (unsigned)sizeof(struct state_t));
+}
+
+/** END: UTS RNG Harness **/
+
+#if defined(_MSC_VER) && (_MSC_VER > 800)
+  #pragma intrinsic(memcpy)
+#endif
+
+#if 0 && defined(_MSC_VER) /* disabled: the compiler intrinsics are never selected */
+  #define rotl32 _lrotl
+  #define rotr32 _lrotr
+#else /* portable rotates: x must be an unsigned 32-bit value and 0 < n < 32 */
+  #define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+  #define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#endif
+
+#if !defined(bswap_32)
+  #define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00))
+#endif
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+  #define SWAP_BYTES
+#else
+  #undef SWAP_BYTES
+#endif
+
+#if defined(SWAP_BYTES)
+  #define bsw_32(p, n)                                                                                       \
+    {                                                                                                        \
+      int _i = (n);                                                                                          \
+      while (_i--)                                                                                           \
+        ((uint_32t *)p)[_i] = bswap_32(((uint_32t *)p)[_i]);                                                 \
+    }
+#else
+  #define bsw_32(p, n)
+#endif
+
+#define SHA1_MASK (SHA1_BLOCK_SIZE - 1)
+
+#if 0
+
+  #define ch(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
+  #define parity(x, y, z) ((x) ^ (y) ^ (z))
+  #define maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+#else /* Discovered by Rich Schroeppel and Colin Plumb   */
+
+  #define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+  #define parity(x, y, z) ((x) ^ (y) ^ (z))
+  #define maj(x, y, z) (((x) & (y)) | ((z) & ((x) ^ (y))))
+
+#endif
+
+/* Compile 64 bytes of hash data into SHA1 context. Note    */
+/* that this routine assumes that the byte order in the     */
+/* ctx->wbuf[] at this point is in such an order that low   */
+/* address bytes in the ORIGINAL byte stream will go in     */
+/* this buffer to the high end of 32-bit words on BOTH big  */
+/* and little endian systems                                */
+
+#ifdef ARRAY
+  #define q(v, n) v[n]
+#else
+  #define q(v, n) v##n
+#endif
+
+#define one_cycle(v, a, b, c, d, e, f, k, h)                                                                 \
+  q(v, e) += rotr32(q(v, a), 27) + f(q(v, b), q(v, c), q(v, d)) + k + h;                                     \
+  q(v, b) = rotr32(q(v, b), 2)
+
+#define five_cycle(v, f, k, i)                                                                               \
+  one_cycle(v, 0, 1, 2, 3, 4, f, k, hf(i));                                                                  \
+  one_cycle(v, 4, 0, 1, 2, 3, f, k, hf(i + 1));                                                              \
+  one_cycle(v, 3, 4, 0, 1, 2, f, k, hf(i + 2));                                                              \
+  one_cycle(v, 2, 3, 4, 0, 1, f, k, hf(i + 3));                                                              \
+  one_cycle(v, 1, 2, 3, 4, 0, f, k, hf(i + 4))
+
+VOID_RETURN sha1_compile(sha1_ctx ctx[1]) { /* fold the 16 words held in ctx->wbuf into ctx->hash */
+  uint_32t *w = ctx->wbuf;
+
+#ifdef ARRAY
+  uint_32t v[5]; /* working copy of the five hash words, reached as v[n] through q() */
+  memcpy(v, ctx->hash, 5 * sizeof(uint_32t));
+#else
+  uint_32t v0, v1, v2, v3, v4; /* working copy of the five hash words, reached as v##n through q() */
+  v0 = ctx->hash[0];
+  v1 = ctx->hash[1];
+  v2 = ctx->hash[2];
+  v3 = ctx->hash[3];
+  v4 = ctx->hash[4];
+#endif
+
+#define hf(i) w[i] /* steps 0..15 read the buffered words as they are */
+
+  five_cycle(v, ch, 0x5a827999, 0); /* steps 0..15: ch */
+  five_cycle(v, ch, 0x5a827999, 5);
+  five_cycle(v, ch, 0x5a827999, 10);
+  one_cycle(v, 0, 1, 2, 3, 4, ch, 0x5a827999, hf(15));
+
+#undef hf /* from step 16 on, hf(i) computes the next word in place in the 16-word circular buffer */
+#define hf(i) (w[(i)&15] = rotl32(w[((i) + 13) & 15] ^ w[((i) + 8) & 15] ^ w[((i) + 2) & 15] ^ w[(i)&15], 1))
+
+  one_cycle(v, 4, 0, 1, 2, 3, ch, 0x5a827999, hf(16)); /* steps 16..19: last four ch steps */
+  one_cycle(v, 3, 4, 0, 1, 2, ch, 0x5a827999, hf(17));
+  one_cycle(v, 2, 3, 4, 0, 1, ch, 0x5a827999, hf(18));
+  one_cycle(v, 1, 2, 3, 4, 0, ch, 0x5a827999, hf(19));
+
+  five_cycle(v, parity, 0x6ed9eba1, 20); /* steps 20..39: parity */
+  five_cycle(v, parity, 0x6ed9eba1, 25);
+  five_cycle(v, parity, 0x6ed9eba1, 30);
+  five_cycle(v, parity, 0x6ed9eba1, 35);
+
+  five_cycle(v, maj, 0x8f1bbcdc, 40); /* steps 40..59: maj */
+  five_cycle(v, maj, 0x8f1bbcdc, 45);
+  five_cycle(v, maj, 0x8f1bbcdc, 50);
+  five_cycle(v, maj, 0x8f1bbcdc, 55);
+
+  five_cycle(v, parity, 0xca62c1d6, 60); /* steps 60..79: parity with the last constant */
+  five_cycle(v, parity, 0xca62c1d6, 65);
+  five_cycle(v, parity, 0xca62c1d6, 70);
+  five_cycle(v, parity, 0xca62c1d6, 75);
+
+#ifdef ARRAY
+  ctx->hash[0] += v[0]; /* add the working values back into the running hash */
+  ctx->hash[1] += v[1];
+  ctx->hash[2] += v[2];
+  ctx->hash[3] += v[3];
+  ctx->hash[4] += v[4];
+#else
+  ctx->hash[0] += v0; /* add the working values back into the running hash */
+  ctx->hash[1] += v1;
+  ctx->hash[2] += v2;
+  ctx->hash[3] += v3;
+  ctx->hash[4] += v4;
+#endif
+}
+
+VOID_RETURN sha1_begin(sha1_ctx ctx[1]) { /* reset ctx so a new message can be hashed */
+  ctx->count[0] = ctx->count[1] = 0; /* no bytes absorbed yet */
+  ctx->hash[0] = 0x67452301; /* the five initial hash words of SHA-1 */
+  ctx->hash[1] = 0xefcdab89;
+  ctx->hash[2] = 0x98badcfe;
+  ctx->hash[3] = 0x10325476;
+  ctx->hash[4] = 0xc3d2e1f0;
+}
+
+/* Absorb len bytes of data into ctx: bytes are buffered in ctx->wbuf and */
+/* sha1_compile() runs each time a whole SHA1_BLOCK_SIZE block is filled. */
+
+VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]) {
+  uint_32t pos = (uint_32t)(ctx->count[0] & SHA1_MASK), space = SHA1_BLOCK_SIZE - pos;
+  const unsigned char *sp = data;
+
+  if ((ctx->count[0] += len) < len) /* the low byte count wrapped: carry into the high word */
+    ++(ctx->count[1]);
+
+  while (len >= space) /* transfer whole blocks if possible  */
+  {
+    memcpy(((unsigned char *)ctx->wbuf) + pos, sp, space);
+    sp += space;
+    len -= space;
+    space = SHA1_BLOCK_SIZE;
+    pos = 0;
+    bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2); /* expands to nothing unless SWAP_BYTES is defined */
+    sha1_compile(ctx);
+  }
+
+  memcpy(((unsigned char *)ctx->wbuf) + pos, sp, len); /* keep the partial block for the next call */
+}
+
+/* SHA1 final padding and digest calculation: writes SHA1_DIGEST_SIZE bytes to hval */
+
+VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]) {
+  uint_32t i = (uint_32t)(ctx->count[0] & SHA1_MASK); /* bytes currently held in the buffer */
+
+  /* put bytes in the buffer in an order in which references to   */
+  /* 32-bit words will put bytes with lower addresses into the    */
+  /* top of 32 bit words on BOTH big and little endian machines   */
+  bsw_32(ctx->wbuf, (i + 3) >> 2);
+
+  /* we now need to mask valid bytes and add the padding which is */
+  /* a single 1 bit and as many zero bits as necessary. Note that */
+  /* we can always add the first padding byte here because the    */
+  /* buffer always has at least one empty slot                    */
+  ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
+  ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
+
+  /* we need 9 or more empty positions, one for the padding byte  */
+  /* (above) and eight for the length count. If there is not      */
+  /* enough space, pad and empty the buffer                       */
+  if (i > SHA1_BLOCK_SIZE - 9) {
+    if (i < 60) /* the padding byte went into word 14 or earlier, so word 15 is cleared here */
+      ctx->wbuf[15] = 0;
+    sha1_compile(ctx);
+    i = 0;
+  } else /* compute a word index for the empty buffer positions  */
+    i = (i >> 2) + 1;
+
+  while (i < 14) /* and zero pad all but last two positions        */
+    ctx->wbuf[i++] = 0;
+
+  /* the following 32-bit length fields are assembled in the      */
+  /* wrong byte order on little endian machines but this is       */
+  /* corrected later since they are only ever used as 32-bit      */
+  /* word values.                                                 */
+  ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29); /* byte count times 8, high word */
+  ctx->wbuf[15] = ctx->count[0] << 3;                           /* byte count times 8, low word */
+  sha1_compile(ctx);
+
+  /* extract the hash value as bytes in case the hash buffer is   */
+  /* misaligned for 32-bit words                                  */
+  for (i = 0; i < SHA1_DIGEST_SIZE; ++i) /* most significant byte of each word first */
+    hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
+}
+
+VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len) { /* one-shot digest */
+  sha1_ctx cx[1]; /* one-element array so it can be passed where sha1_ctx ctx[1] is expected */
+
+  sha1_begin(cx);
+  sha1_hash(data, len, cx);
+  sha1_end(hval, cx); /* writes the digest of data[0..len) to hval */
+}
+
+#if defined(__cplusplus)
+}
+#endif
\ No newline at end of file
diff --git a/benchmark/external/uts/src/uts.c b/benchmark/external/uts/src/uts.c
new file mode 100644
index 000000000..507915bea
--- /dev/null
+++ b/benchmark/external/uts/src/uts.c
@@ -0,0 +1,474 @@
+/*
+ *         ---- The Unbalanced Tree Search (UTS) Benchmark ----
+ *
+ *  Copyright (c) 2010 See AUTHORS file for copyright holders
+ *
+ *  This file is part of the unbalanced tree search benchmark.  This
+ *  project is licensed under the MIT Open Source license.  See the LICENSE
+ *  file for copyright and licensing information.
+ *
+ *  UTS is a collaborative project between researchers at the University of
+ *  Maryland, the University of North Carolina at Chapel Hill, and the Ohio
+ *  State University.  See AUTHORS file for more information.
+ *
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "uts/uts.h"
+
+/***********************************************************
+ *  tree generation and search parameters                  *
+ *                                                         *
+ *  Tree generation strategy is controlled via various     *
+ *  parameters set from the command line.  The parameters  *
+ *  and their default values are given below.              *
+ ***********************************************************/
+
+char *uts_trees_str[] = {"Binomial", "Geometric", "Hybrid", "Balanced"};
+char *uts_geoshapes_str[] = {"Linear decrease", "Exponential decrease", "Cyclic", "Fixed branching factor"};
+
+/* Tree type
+ *   Trees are generated using a Galton-Watson process, in
+ *   which the branching factor of each node is a random
+ *   variable.
+ *
+ *   The random variable can follow a binomial distribution
+ *   or a geometric distribution.  Hybrid trees are
+ *   generated with geometric distributions near the
+ *   root and binomial distributions towards the leaves.
+ */
+tree_t type = GEO; // Default tree type
+double b_0 = 4.0;  // default branching factor at the root
+int rootId = 0;    // default seed for RNG state at root
+
+/*  Tree type BIN (BINOMIAL)
+ *  The branching factor at the root is specified by b_0.
+ *  The branching factor below the root follows an
+ *     identical binomial distribution at all nodes.
+ *  A node has m children with prob q, or no children with
+ *     prob (1-q).  The expected branching factor is q * m.
+ *
+ *  Default parameter values
+ */
+int nonLeafBF = 4;                // m
+double nonLeafProb = 15.0 / 64.0; // q
+
+/*  Tree type GEO (GEOMETRIC)
+ *  The branching factor follows a geometric distribution with
+ *     expected value b.
+ *  The probability that a node has 0 <= n children is p(1-p)^n for
+ *     0 < p <= 1. The distribution is truncated at MAXNUMCHILDREN.
+ *  The expected number of children b = (1-p)/p.  Given b (the
+ *     target branching factor) we can solve for p.
+ *
+ *  A shape function computes a target branching factor b_i
+ *     for nodes at depth i as a function of the root branching
+ *     factor b_0 and a maximum depth gen_mx.
+ *
+ *  Default parameter values
+ */
+int gen_mx = 6;               // default depth of tree
+geoshape_t shape_fn = LINEAR; // default shape function (b_i decr linearly)
+
+/*  In type HYBRID trees, each node is either type BIN or type
+ *  GEO, with the generation strategy changing from GEO to BIN
+ *  at a fixed depth, expressed as a fraction of gen_mx
+ */
+double shiftDepth = 0.5;
+
+/* compute granularity - number of rng evaluations per tree node */
+int computeGranularity = 1;
+
+/* display parameters */
+int debug = 0;
+int verbose = 1;
+
+/***********************************************************
+ *                                                         *
+ *  FUNCTIONS                                              *
+ *                                                         *
+ ***********************************************************/
+
+/* fatal error: print str on stdout with an "*** Error:" prefix, then call impl_abort(1) */
+void uts_error(char *str) {
+  printf("*** Error: %s\n", str);
+  impl_abort(1);
+}
+
+/*
+ * wall clock time in seconds, read from the monotonic clock
+ *   for detailed accounting of work, this needs
+ *   high resolution; returns 0.0 if the clock cannot be read
+ */
+double uts_wctime() {
+  struct timespec tv;
+  if (clock_gettime(CLOCK_MONOTONIC, &tv) != 0) return 0.0; // tv is not filled in on failure
+  return ((double)tv.tv_sec + 1E-9 * (double)tv.tv_nsec);
+}
+
+// Interpret a non-negative int as a value on [0,1); a negative n is reported and yields 0.0
+double rng_toProb(int n) {
+  if (n >= 0)
+    return ((double)n) / 2147483648.0;
+  printf("*** toProb: rand n = %d out of range\n", n);
+  return 0.0;
+}
+
+// Set up root as the height-0 node of the given type and seed its RNG state from rootId
+void uts_initRoot(Node *root, int type) {
+  root->type = type;
+  root->height = 0;
+  root->numChildren = -1; // means not yet determined
+  rng_init(root->state.state, rootId);
+  if (debug & 1) // %p is only defined for void pointers, hence the cast
+    printf("root node of type %d at %p\n", type, (void *)root);
+}
+
+// BIN rule, identical everywhere below root: nonLeafBF children with probability nonLeafProb
+int uts_numChildren_bin(Node *parent) {
+  double draw = rng_toProb(rng_rand(parent->state.state));
+  if (draw < nonLeafProb)
+    return nonLeafBF;
+  return 0;
+}
+
+int uts_numChildren_geo(Node *parent) { // child count drawn from a geometric distribution, target mean b_i
+  double b_i = b_0;
+  int depth = parent->height;
+  int numChildren, h;
+  double p, u;
+
+  // use shape function to compute target b_i (a node at depth 0 keeps b_0)
+  if (depth > 0) {
+    switch (shape_fn) {
+
+        // expected size polynomial in depth: b_0 * depth^(-log b_0 / log gen_mx)
+      case EXPDEC:
+        b_i = b_0 * pow((double)depth, -log(b_0) / log((double)gen_mx));
+        break;
+
+        // cyclic tree size: b_0 raised to a sine of period gen_mx, no branching past 5 periods
+      case CYCLIC:
+        if (depth > 5 * gen_mx) {
+          b_i = 0.0;
+          break;
+        }
+        b_i = pow(b_0, sin(2.0 * 3.141592653589793 * (double)depth / (double)gen_mx));
+        break;
+
+        // identical distribution at all nodes up to max depth, b_i = 0 from gen_mx on
+      case FIXED:
+        b_i = (depth < gen_mx) ? b_0 : 0;
+        break;
+
+        // linear decrease in b_i, reaching 0 at depth gen_mx; also used for unknown shapes
+      case LINEAR:
+      default:
+        b_i = b_0 * (1.0 - (double)depth / (double)gen_mx);
+        break;
+    }
+  }
+
+  // given target b_i, find prob p so expected value of
+  // geometric distribution is b_i, i.e. (1-p)/p = b_i.
+  p = 1.0 / (1.0 + b_i);
+
+  // get uniform random number on [0,1) from the parent's current RNG state
+  h = rng_rand(parent->state.state);
+  u = rng_toProb(h);
+
+  // max number of children at this cumulative probability
+  // (from inverse geometric cumulative density function)
+  numChildren = (int)floor(log(1 - u) / log(1 - p));
+
+  return numChildren;
+}
+
+// Number of children of parent under the global tree type; the count is capped at
+// ceil(b_0) for the root of a BIN tree and at MAXNUMCHILDREN elsewhere (BALANCED is never capped).
+int uts_numChildren(Node *parent) {
+  int numChildren = 0;
+  switch (type) {
+    case BIN: // the root gets floor(b_0) children, every other node follows the BIN rule
+      if (parent->height == 0)
+        numChildren = (int)floor(b_0);
+      else
+        numChildren = uts_numChildren_bin(parent);
+      break;
+
+    case GEO:
+      numChildren = uts_numChildren_geo(parent);
+      break;
+
+    case HYBRID: // GEO rule above depth shiftDepth * gen_mx, BIN rule from there down
+      if (parent->height < shiftDepth * gen_mx)
+        numChildren = uts_numChildren_geo(parent);
+      else
+        numChildren = uts_numChildren_bin(parent);
+      break;
+    case BALANCED: // (int)b_0 children above depth gen_mx, none from there down
+      if (parent->height < gen_mx)
+        numChildren = (int)b_0;
+      break;
+    default: // the diagnostic names this function so the failure can be located
+      uts_error("uts_numChildren(): Unknown tree type");
+  }
+
+  // limit number of children
+  // only a BIN root can have more than MAXNUMCHILDREN
+  if (parent->height == 0 && parent->type == BIN) {
+    int rootBF = (int)ceil(b_0);
+    if (numChildren > rootBF) {
+      printf("*** Number of children of root truncated from %d to %d\n", numChildren, rootBF);
+      numChildren = rootBF;
+    }
+  } else if (type != BALANCED) {
+    if (numChildren > MAXNUMCHILDREN) {
+      printf("*** Number of children truncated from %d to %d\n", numChildren, MAXNUMCHILDREN);
+      numChildren = MAXNUMCHILDREN;
+    }
+  }
+
+  return numChildren;
+}
+
+// Type given to the children of parent: BIN, GEO and BALANCED trees are uniform, while a
+// HYBRID tree is GEO above depth shiftDepth * gen_mx and BIN from there down.
+int uts_childType(Node *parent) {
+  switch (type) {
+    case BIN:
+      return BIN;
+    case GEO:
+      return GEO;
+    case HYBRID:
+      return (parent->height < shiftDepth * gen_mx) ? GEO : BIN;
+    case BALANCED:
+      return BALANCED;
+    default:
+      uts_error("uts_childType(): Unknown tree type");
+      // only reached if uts_error() returns
+      return -1;
+  }
+}
+
+// construct string with all parameter settings: appends at strBuf + ind, returns the new end index
+int uts_paramsToStr(char *strBuf, int ind) {
+  // version + execution model
+  ind += sprintf(strBuf + ind, "UTS - Unbalanced Tree Search %s (%s)\n", UTS_VERSION, impl_getName());
+
+  // tree type, as its number and its name from uts_trees_str
+  ind += sprintf(strBuf + ind, "Tree type:  %d (%s)\n", type, uts_trees_str[type]);
+
+  // tree shape parameters
+  ind += sprintf(strBuf + ind, "Tree shape parameters:\n");
+  ind += sprintf(strBuf + ind, "  root branching factor b_0 = %.1f, root seed = %d\n", b_0, rootId);
+
+  if (type == GEO || type == HYBRID) {
+    ind += sprintf(strBuf + ind,
+                   "  GEO parameters: gen_mx = %d, shape function = %d (%s)\n",
+                   gen_mx,
+                   shape_fn,
+                   uts_geoshapes_str[shape_fn]);
+  }
+
+  if (type == BIN || type == HYBRID) {
+    double q = nonLeafProb;
+    int m = nonLeafBF;
+    double es = (1.0 / (1.0 - q * m)); // NOTE(review): divides by zero when q * m == 1 -- confirm inputs
+    ind +=
+        sprintf(strBuf + ind, "  BIN parameters:  q = %f, m = %d, E(n) = %f, E(s) = %.2f\n", q, m, q * m, es);
+  }
+
+  if (type == HYBRID) {
+    ind += sprintf(
+        strBuf + ind, "  HYBRID:  GEO from root to depth %d, then BIN\n", (int)ceil(shiftDepth * gen_mx));
+  }
+
+  if (type == BALANCED) {
+    ind += sprintf(strBuf + ind, "  BALANCED parameters: gen_mx = %d\n", gen_mx);
+    ind += sprintf(strBuf + ind,
+                   "        Expected size: %llu nodes, %llu leaves\n",
+                   (counter_t)((pow(b_0, gen_mx + 1) - 1.0) / (b_0 - 1.0)) /* geometric series */,
+                   (counter_t)pow(b_0, gen_mx));
+  }
+
+  // random number generator: rng_showtype() appends its text and returns the updated index
+  ind += sprintf(strBuf + ind, "Random number generator: ");
+  ind = rng_showtype(strBuf, ind);
+  ind += sprintf(strBuf + ind, "\nCompute granularity: %d\n", computeGranularity);
+
+  return ind;
+}
+
+// show parameter settings (nothing is printed unless verbose > 0)
+void uts_printParams() {
+  char strBuf[5000] = "";
+  int used;
+
+  if (verbose <= 0)
+    return;
+  used = uts_paramsToStr(strBuf, 0);
+  impl_paramsToStr(strBuf, used);
+  printf("%s\n", strBuf);
+}
+
+// Parse "-<letter> <value>" pairs from argv into the global parameters. The first unrecognized
+// option or invalid/missing value is reported and ends the run through impl_abort(4).
+void uts_parseParams(int argc, char *argv[]) {
+  int i = 1;
+  int err = -1;
+  while (i < argc && err == -1) {
+    if (argv[i][0] == '-' && argv[i][1] == 'h') {
+      uts_helpMessage();
+      impl_abort(0);
+    } else if (argv[i][0] != '-' || strlen(argv[i]) != 2 || argc <= i + 1) {
+      err = i;
+      break;
+    }
+
+    // Matched by implementation -- return 0 on success
+    // This is fragile, don't override parameters in impl_parseParam()!
+    if (!impl_parseParam(argv[i], argv[i + 1])) {
+      i += 2;
+      continue;
+    }
+
+    switch (argv[i][1]) {
+      case 'q':
+        nonLeafProb = atof(argv[i + 1]);
+        break;
+      case 'm':
+        nonLeafBF = atoi(argv[i + 1]);
+        break;
+      case 'r':
+        rootId = atoi(argv[i + 1]);
+        break;
+      case 'x':
+        debug = atoi(argv[i + 1]);
+        break;
+      case 'v':
+        verbose = atoi(argv[i + 1]);
+        break;
+      case 't':
+        type = (tree_t)atoi(argv[i + 1]);
+        if (type != BIN && type != GEO && type != HYBRID && type != BALANCED)
+          err = i;
+        break;
+      case 'a': // checked against every enumerator: shape_fn later indexes uts_geoshapes_str
+        shape_fn = (geoshape_t)atoi(argv[i + 1]);
+        if (shape_fn != LINEAR && shape_fn != EXPDEC && shape_fn != CYCLIC && shape_fn != FIXED)
+          err = i;
+        break;
+      case 'b':
+        b_0 = atof(argv[i + 1]);
+        break;
+      case 'd':
+        gen_mx = atoi(argv[i + 1]);
+        break;
+      case 'f':
+        shiftDepth = atof(argv[i + 1]);
+        break;
+      case 'g':
+        computeGranularity = max(1, atoi(argv[i + 1]));
+        break;
+      default:
+        err = i;
+    }
+
+    if (err != -1)
+      break;
+    i += 2;
+  }
+
+  if (err != -1) {
+    printf("Unrecognized parameter or incorrect/missing value: '%s %s'\n",
+           argv[i],
+           (i + 1 < argc) ? argv[i + 1] : "[none]");
+    printf("Try -h for help.\n");
+    impl_abort(4);
+  }
+}
+
+void uts_helpMessage() { // print the usage text for the options handled by uts_parseParams()
+  printf("  UTS - Unbalanced Tree Search %s (%s)\n\n", UTS_VERSION, impl_getName());
+  printf("    usage:  uts-bin [parameter value] ...\n\n");
+  printf("  parameter type  description\n");
+  printf("  ==== ====  =========================================\n");
+  printf("\n  Benchmark Parameters:\n");
+  printf("   -t  int   tree type (0: BIN, 1: GEO, 2: HYBRID, 3: BALANCED)\n");
+  printf("   -b  dble  root branching factor\n");
+  printf("   -r  int   root seed 0 <= r < 2^31 \n");
+  printf("   -a  int   GEO: tree shape function \n");
+  printf("   -d  int   GEO, BALANCED: tree depth\n");
+  printf("   -q  dble  BIN: probability of non-leaf node\n");
+  printf("   -m  int   BIN: number of children for non-leaf node\n");
+  printf("   -f  dble  HYBRID: fraction of depth for GEO -> BIN transition\n");
+  printf("   -g  int   compute granularity: number of rng_spawns per node\n");
+  printf("   -v  int   nonzero to set verbose output\n");
+  printf("   -x  int   debug level\n");
+
+  // Let the implementation describe any options of its own under a separate heading
+  printf("\n  Additional Implementation Parameters:\n");
+  impl_helpMessage();
+  printf("\n");
+}
+
+// Print the run summary: one machine-readable line when verbose == 0, two readable lines otherwise
+void uts_showStats(
+    int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth) {
+  // summarize execution info for machine consumption
+  if (verbose == 0) {
+    printf("%4d %7.3f %9llu %7.0llu %7.0llu %d %d %.2f %d %d %1d %f %3d\n",
+           nPes,
+           walltime,
+           nNodes,
+           (unsigned long long)(nNodes / walltime), // %llu takes an unsigned argument
+           (unsigned long long)((nNodes / walltime) / nPes),
+           chunkSize,
+           (int)type, // enum values are passed as int to match %d
+           b_0,
+           rootId,
+           gen_mx,
+           (int)shape_fn,
+           nonLeafProb,
+           nonLeafBF);
+  }
+  // summarize execution info for human consumption
+  else {
+    printf("Tree size = %llu, tree depth = %llu, num leaves = %llu (%.2f%%)\n",
+           nNodes,
+           maxDepth,
+           nLeaves,
+           nLeaves / (float)nNodes * 100.0);
+    printf("Wallclock time = %.3f sec, performance = %.0f nodes/sec (%.0f nodes/sec per PE)\n\n",
+           walltime,
+           (nNodes / walltime),
+           (nNodes / walltime / nPes));
+  }
+}
+
+// --------------------------------------------------------------------- //
+
+// The name of this implementation; the returned string is a literal and must not be modified
+char *impl_getName() { return "Sequential Recursive Search"; }
+
+// Append the execution-strategy line at strBuf + ind and return the index just past it
+int impl_paramsToStr(char *strBuf, int ind) {
+  return ind + sprintf(strBuf + ind, "Execution strategy:  %s\n", impl_getName());
+}
+
+// Not using UTS command line params, return non-success
+int impl_parseParam(char *param, char *value) {
+  (void)param; // both arguments are deliberately unused
+  (void)value;
+  return 1;
+}
+
+void impl_helpMessage() { printf("   none.\n"); } // this implementation adds no options of its own
+
+void impl_abort(int err) { exit(err); } // ends the process with err as its exit status
\ No newline at end of file
diff --git a/benchmark/lib/CMakeLists.txt b/benchmark/lib/CMakeLists.txt
new file mode 100644
index 000000000..c745bd439
--- /dev/null
+++ b/benchmark/lib/CMakeLists.txt
@@ -0,0 +1,35 @@
+add_library(benchmark_common) # no type given: STATIC/SHARED follows BUILD_SHARED_LIBS; sources added below
+
+target_compile_features(benchmark_common PUBLIC cxx_std_26) # PUBLIC: dependents are compiled as C++26 too
+
+target_sources(benchmark_common
+  PRIVATE
+    uts.cpp # the only translation unit compiled into the library
+  PUBLIC
+    FILE_SET HEADERS # header file set (CMake 3.23+); BASE_DIRS becomes an include directory for dependents
+    BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}
+    FILES
+      bench.hpp
+      fib.hpp
+      fold.hpp
+      heat.hpp
+      integrate.hpp
+      knapsack.hpp
+      macros.hpp
+      mandelbrot.hpp
+      matmul.hpp
+      nqueens.hpp
+      primes.hpp
+      quicksort.hpp
+      scan.hpp
+      skynet.hpp
+      uts.hpp
+)
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../external/uts external/uts) # out-of-tree: needs a binary dir
+
+target_link_libraries(benchmark_common
+  PUBLIC
+    benchmark::benchmark
+    uts_c # presumably the target created by external/uts - TODO confirm
+)
diff --git a/benchmark/lib/bench.hpp b/benchmark/lib/bench.hpp
new file mode 100644
index 000000000..360c79be1
--- /dev/null
+++ b/benchmark/lib/bench.hpp
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstdint>
+  #include <format>
+  #include <functional>
+#else
+import std;
+#endif
+
+namespace lf_bench {
+
+inline constexpr std::int64_t no_threads = 0; // sentinel: the benchmark has no thread count to report
+// Complexity model 1 / n; report_threads sets n to the thread count via SetComplexityN.
+inline auto inverse_complexity(benchmark::IterationCount n) -> double { return 1.0 / static_cast<double>(n); }
+
+inline void report_threads(benchmark::State &state, std::int64_t threads) {
+  // The no_threads sentinel means there is nothing to record.
+  if (threads != no_threads) {
+    // Publish the thread count as counter "p" and as the complexity variable N.
+    state.counters["p"] = static_cast<double>(threads);
+    state.SetComplexityN(static_cast<benchmark::IterationCount>(threads));
+  }
+}
+
+// Runs `fn` once per benchmark iteration and fails the run when `check(result, expected)` is false.
+// Mismatches are reported through `std::format`, which formats both `result` and `expected`, so
+// `Expected` and `std::invoke_result_t<Fn>` must be formattable.
+template <typename Expected, typename Check, typename Fn>
+void bench(benchmark::State &state, std::int64_t threads, const Expected &expected, Check check, Fn fn) {
+  report_threads(state, threads);
+
+  for (auto _ : state) {
+    auto result = std::invoke(fn);
+    // Every iteration is validated, not just the first one.
+    if (!std::invoke(check, result, expected)) {
+      state.SkipWithError(std::format("incorrect result: {} != {}", result, expected));
+      break; // leave the loop once the run has been marked as failed
+    }
+
+    benchmark::DoNotOptimize(result);
+  }
+}
+
+template <typename Expected, typename Fn> // overload: explicit thread count, result compared with ==
+void bench(benchmark::State &state, std::int64_t threads, const Expected &expected, Fn fn) {
+  bench(state, threads, expected, std::equal_to<>{}, fn);
+}
+
+template <typename Expected, typename Check, typename Fn> // overload: custom check, no thread count
+void bench(benchmark::State &state, const Expected &expected, Check check, Fn fn) {
+  bench(state, no_threads, expected, check, fn);
+}
+
+template <typename Expected, typename Fn> // overload: no thread count, result compared with ==
+void bench(benchmark::State &state, const Expected &expected, Fn fn) {
+  bench(state, no_threads, expected, fn);
+}
+
+} // namespace lf_bench
diff --git a/benchmark/lib/fib.hpp b/benchmark/lib/fib.hpp
new file mode 100644
index 000000000..f21b9ce4c
--- /dev/null
+++ b/benchmark/lib/fib.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstdint>
+  #include <functional>
+#else
+import std;
+#endif
+
+inline constexpr int fib_test = 8;
+inline constexpr int fib_base = 37;
+
+/**
+ * @brief Iterative Fibonacci reference: fib_ref(0) == 0, fib_ref(1) == 1; any n < 2 is returned as is.
+ */
+constexpr auto fib_ref(std::int64_t n) -> std::int64_t {
+  // Arguments below 2 (including negative ones) are their own result.
+  if (n < 2) {
+    return n;
+  }
+
+  // lo and hi hold two consecutive Fibonacci numbers, starting from fib(0) and fib(1).
+  std::int64_t lo = 0;
+  std::int64_t hi = 1;
+
+  // n - 1 steps move the pair forward until hi == fib(n).
+  for (std::int64_t left = n - 1; left > 0; --left) {
+    hi += lo;     // the next Fibonacci number
+    lo = hi - lo; // the previous value of hi
+  }
+  return hi;
+}
+
+template <typename Fn>
+void run_fib(benchmark::State &state, std::int64_t threads, Fn fn) {
+  // The benchmark's first range argument selects which Fibonacci number to compute.
+  std::int64_t index = state.range(0);
+  state.counters["n"] = static_cast<double>(index);
+  // fib_ref supplies the value that every call of fn is checked against.
+  std::int64_t want = fib_ref(index);
+  lf_bench::bench(state, threads, want, [index, fn]() -> std::int64_t {
+    return std::invoke(fn, index);
+  });
+}
+
+template <typename Fn> // overload for benchmarks without a thread-count argument
+void run_fib(benchmark::State &state, Fn fn) {
+  run_fib(state, lf_bench::no_threads, fn);
+}
diff --git a/benchmark/lib/fold.hpp b/benchmark/lib/fold.hpp
new file mode 100644
index 000000000..64bb8b61a
--- /dev/null
+++ b/benchmark/lib/fold.hpp
@@ -0,0 +1,111 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <concepts>
+  #include <cstddef>
+  #include <cstdint>
+  #include <functional>
+  #include <new>
+  #include <ranges>
+  #include <span>
+  #include <type_traits>
+  #include <vector>
+#else
+import std;
+#endif
+
+inline constexpr std::int64_t fold_test = 10;
+
+inline constexpr std::int64_t fold_1024 = 1'024;
+inline constexpr std::int64_t fold_1024_base = fold_1024;
+inline constexpr std::int64_t fold_1024_sq_base = fold_1024 * fold_1024;
+inline constexpr std::int64_t fold_1024_cu_base = fold_1024 * fold_1024 * fold_1024;
+
+enum class fold_data_mode : char { memory, lazy };
+enum class fold_chunk_mode : char { explicit_one, deduced, fixed };
+enum class fold_projection_mode : char { sync, async };
+
+template <typename T>
+constexpr auto fold_value(std::size_t index) -> T {
+  return static_cast<T>(index % 4UZ); // values cycle 0, 1, 2, 3
+}
+
+template <typename T>
+constexpr auto make_fold_range(std::size_t count) { // view of fold_value<T>(i) for i in [0, count)
+  return std::views::iota(std::size_t{}, count) | std::views::transform([](std::size_t index) -> T {
+           return fold_value<T>(index);
+         });
+}
+
+template <typename T> // accumulator type: double when T is float, std::int64_t for every other T
+using fold_accum_t = std::conditional_t<std::same_as<T, float>, double, std::int64_t>;
+
+template <typename T>
+constexpr auto expected_fold_result(std::size_t count) -> fold_accum_t<T> {
+  // Values cycle 0, 1, 2, 3: each full cycle adds 6 and a partial cycle of r values adds r * (r - 1) / 2.
+  std::size_t const tail = count % 4UZ;
+  return static_cast<fold_accum_t<T>>(((count / 4UZ) * 6UZ) + ((tail * (tail - 1UZ)) / 2UZ));
+}
+
+template <typename T>
+auto fold_result_is_correct(fold_accum_t<T> result, fold_accum_t<T> expect) -> bool {
+  if constexpr (std::floating_point<fold_accum_t<T>>) {
+    return std::abs(result - expect) <= 1e-6; // absolute tolerance for floating-point accumulators
+  } else {
+    return result == expect; // integer accumulators must match exactly
+  }
+}
+
+template <fold_data_mode Data, typename T, typename Fn>
+void run_fold_input(benchmark::State &state, std::int64_t threads, Fn fn) {
+  auto n = static_cast<std::size_t>(state.range(0));
+  auto expect = expected_fold_result<T>(n);
+  // `run` benchmarks fn on the given range; every result is checked against `expect`.
+  auto run = [&](auto const &range) -> void {
+    lf_bench::bench(state, threads, expect, fold_result_is_correct<T>, [&]() -> fold_accum_t<T> {
+      return std::invoke(fn, range);
+    });
+  };
+  // memory: materialise the values into a std::vector<T> first; lazy: hand fn the view itself.
+  if constexpr (Data == fold_data_mode::memory) {
+    run(make_fold_range<T>(n) | std::ranges::to<std::vector<T>>());
+  } else {
+    run(make_fold_range<T>(n));
+  }
+
+  state.SetItemsProcessed(state.iterations() * static_cast<std::int64_t>(n));
+}
+
+template <fold_data_mode Data, typename T, typename Fn> // overload without a thread-count argument
+void run_fold_input(benchmark::State &state, Fn fn) {
+  run_fold_input<Data, T>(state, lf_bench::no_threads, fn);
+}
+
+// Aliases that give the enumerators and element types shorter names.
+inline constexpr auto memory = fold_data_mode::memory;
+inline constexpr auto lazy = fold_data_mode::lazy;
+inline constexpr auto chunk_1 = fold_chunk_mode::explicit_one;
+inline constexpr auto chunk_deduced = fold_chunk_mode::deduced;
+inline constexpr auto chunk_fixed = fold_chunk_mode::fixed;
+inline constexpr auto sync_proj = fold_projection_mode::sync;
+inline constexpr auto async_proj = fold_projection_mode::async;
+
+using int32 = std::int32_t;
+using float32 = float;
+// Registers bench_fn with BENCH_ONE at the sizes fold_test, fold_1024_base and fold_1024_sq_base.
+#define LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name, ...)                                             \
+  BENCH_ONE(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__)                                 \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024 __VA_OPT__(, ) __VA_ARGS__)                            \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024_sq __VA_OPT__(, ) __VA_ARGS__)
+// Everything LF_FOLD_BENCH_SIZES_SMALL registers, plus the fold_1024_cu_base size.
+#define LF_FOLD_BENCH_SIZES(bench_fn, category, name, ...)                                                   \
+  LF_FOLD_BENCH_SIZES_SMALL(bench_fn, category, name __VA_OPT__(, ) __VA_ARGS__)                             \
+  BENCH_ONE(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)
+// Multi-threaded variant (BENCH_ONE_MT): only the fold_test and fold_1024_cu_base sizes.
+#define LF_FOLD_BENCH_SIZES_MT(bench_fn, category, name, ...)                                                \
+  BENCH_ONE_MT(bench_fn, category, name, test, fold __VA_OPT__(, ) __VA_ARGS__)                              \
+  BENCH_ONE_MT(bench_fn, category, name, base, fold_1024_cu __VA_OPT__(, ) __VA_ARGS__)
diff --git a/benchmark/lib/heat.hpp b/benchmark/lib/heat.hpp
new file mode 100644
index 000000000..c50c4ec13
--- /dev/null
+++ b/benchmark/lib/heat.hpp
@@ -0,0 +1,93 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cmath>
+  #include <cstddef>
+  #include <functional>
+  #include <utility>
+  #include <vector>
+#else
+import std;
+#endif
+
+inline constexpr std::size_t heat_test = 64;
+inline constexpr std::size_t heat_base = 1024;
+
+inline constexpr std::size_t heat_iters = 16;
+
+// Builds an n x n row-major grid of exp(-8 * (dx^2 + dy^2)), where dx and dy run from -0.5 to 0.5.
+inline auto heat_make_grid(std::size_t n) -> std::vector<double> {
+  std::vector<double> grid(n * n);
+  for (std::size_t row = 0; row < n; ++row) {
+    double const dy = static_cast<double>(row) / static_cast<double>(n - 1) - 0.5;
+    for (std::size_t col = 0; col < n; ++col) {
+      double const dx = static_cast<double>(col) / static_cast<double>(n - 1) - 0.5;
+      grid[row * n + col] = std::exp(-8.0 * (dx * dx + dy * dy));
+    }
+  }
+  return grid;
+}
+
+inline auto heat_matches(std::vector<double> const &actual, std::vector<double> const &expected) -> bool {
+  // Grids of different sizes never match; rejecting them first also keeps expected[i] in bounds.
+  bool same = actual.size() == expected.size();
+  for (std::size_t i = 0; same && i < actual.size(); ++i) {
+    same = std::abs(actual[i] - expected[i]) <= 1e-12; // written with <= so a NaN cell is a mismatch
+  }
+  return same;
+}
+
+inline void heat_jacobi_step(double const *src, double *dst, std::size_t n) { // one sweep, n x n grid
+  for (std::size_t y = 1; y + 1 < n; ++y) { // "y + 1 < n" cannot wrap around for n == 0 ("n - 1" can)
+    for (std::size_t x = 1; x + 1 < n; ++x) {
+      std::size_t i = y * n + x;
+      dst[i] = 0.25 * (src[i - 1] + src[i + 1] + src[i - n] + src[i + n]); // mean of the 4 neighbours
+    }
+  }
+  for (std::size_t x = 0; x < n; ++x) { // first and last row are copied through unchanged
+    dst[x] = src[x];
+    dst[(n - 1) * n + x] = src[(n - 1) * n + x];
+  }
+  for (std::size_t y = 0; y < n; ++y) { // first and last column are copied through unchanged
+    dst[y * n] = src[y * n];
+    dst[y * n + (n - 1)] = src[y * n + (n - 1)];
+  }
+}
+
+inline auto
+heat_reference(std::vector<double> initial, std::size_t n, std::size_t iters) -> std::vector<double> {
+  std::vector<double> scratch(initial.size());
+  double *src = initial.data();
+  double *dst = scratch.data();
+  // The two buffers swap roles after every step, so src always points at the newest grid.
+  for (std::size_t t = 0; t < iters; ++t) {
+    heat_jacobi_step(src, dst, n);
+    std::swap(src, dst);
+  }
+  // Return whichever vector src ended on (that is `initial` after an even number of steps).
+  if (src == initial.data()) {
+    return initial;
+  }
+  return scratch;
+}
+
+template <typename Fn>
+void run_heat(benchmark::State &state, Fn fn) {
+  auto n = static_cast<std::size_t>(state.range(0));
+  state.counters["n"] = static_cast<double>(n);
+  state.counters["iters"] = static_cast<double>(heat_iters);
+  // The reference solution is computed once, before the benchmark loop starts.
+  std::vector<double> initial = heat_make_grid(n);
+  std::vector<double> a(initial.size());
+  std::vector<double> b(initial.size());
+  std::vector<double> reference = heat_reference(initial, n, heat_iters);
+  // The benchmarked callable returns whether the result matches, so the expected value is `true`.
+  lf_bench::bench(state, true, [&]() -> bool {
+    a = initial; // restore the starting grid before every run
+    std::invoke(fn, a.data(), b.data(), n, heat_iters);
+    benchmark::DoNotOptimize(a.data());
+    return heat_matches((heat_iters % 2 == 0) ? a : b, reference); // result is read from a if iters is even
+  });
+}
diff --git a/benchmark/lib/integrate.hpp b/benchmark/lib/integrate.hpp
new file mode 100644
index 000000000..836b4004f
--- /dev/null
+++ b/benchmark/lib/integrate.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cmath>
+  #include <cstdint>
+  #include <functional>
+#else
+import std;
+#endif
+
+inline constexpr std::int64_t integrate_test = 100;
+inline constexpr std::int64_t integrate_base = 10'000;
+
+inline constexpr double integrate_epsilon = 1.0e-9;
+
+inline constexpr auto integrate_fn(double x) -> double { return (x * x + 1.0) * x; } // x^3 + x
+
+inline constexpr auto integrate_exact(double a, double b) -> double {
+  // Antiderivative of x^3 + x: x^4 / 4 + x^2 / 2 == 0.25 * x^2 * (x^2 + 2).
+  auto const antiderivative = [](double x) { return 0.25 * x * x * (x * x + 2); };
+  double const at_upper = antiderivative(b);
+  return at_upper - antiderivative(a);
+}
+
+inline auto integrate_is_close(double result, double expect) -> bool {
+  return std::abs(result - expect) <= 1e-3 * std::abs(expect); // relative tolerance of 1e-3
+}
+
+template <typename Fn>
+void run_integrate(benchmark::State &state, Fn fn) {
+  std::int64_t n = state.range(0);
+  double upper = static_cast<double>(n); // fn receives the upper bound; expect integrates from 0
+  double expect = integrate_exact(0, upper);
+
+  state.counters["n"] = static_cast<double>(n);
+
+  lf_bench::bench(state, expect, integrate_is_close, [upper, fn]() -> double {
+    return std::invoke(fn, upper);
+  });
+}
diff --git a/benchmark/lib/knapsack.hpp b/benchmark/lib/knapsack.hpp
new file mode 100644
index 000000000..dc5077b1f
--- /dev/null
+++ b/benchmark/lib/knapsack.hpp
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <algorithm>
+  #include <cstddef>
+  #include <cstdint>
+  #include <functional>
+  #include <random>
+  #include <vector>
+#else
+import std;
+#endif
+
+inline constexpr std::size_t knapsack_test = 16;
+inline constexpr std::size_t knapsack_base = 28;
+
+struct knapsack_item {
+  int weight; // counted against knapsack_problem::capacity when the item is taken
+  int value;  // added to the objective when the item is taken
+};
+
+struct knapsack_problem {
+  std::vector<knapsack_item> items; // sorted by value/weight desc
+  int capacity;                     // upper limit on the total weight of the chosen items
+};
+
+inline auto knapsack_make(std::size_t n, std::uint64_t seed = 0xCAFEBABE) -> knapsack_problem {
+  std::mt19937_64 rng{seed};
+  std::uniform_int_distribution<int> dw(1, 100);
+  std::uniform_int_distribution<int> dv(1, 100);
+  // Weights and values are drawn alternately from the one seeded engine, each in [1, 100].
+  std::vector<knapsack_item> items(n);
+  int total = 0;
+  for (auto &it : items) {
+    it.weight = dw(rng);
+    it.value = dv(rng);
+    total += it.weight;
+  }
+
+  // Sort by value-density, descending, for a tight relaxation bound.
+  std::sort(items.begin(), items.end(), [](knapsack_item a, knapsack_item b) {
+    return static_cast<long long>(a.value) * b.weight > static_cast<long long>(b.value) * a.weight;
+  });
+  // The capacity is half of the total weight (integer division).
+  return knapsack_problem{std::move(items), total / 2};
+}
+
+// Exact optimum via O(n * capacity) DP, used as oracle.
+inline auto knapsack_dp_optimum(knapsack_problem const &p) -> int {
+  auto const slots = static_cast<std::size_t>(p.capacity) + 1;
+  // best[c] is the largest value reachable with total weight <= c using the items seen so far.
+  std::vector<int> best(slots, 0);
+  for (auto const &item : p.items) {
+    // Capacities are visited from high to low so that each item is used at most once.
+    for (int c = p.capacity; c >= item.weight; --c) {
+      auto const with = static_cast<std::size_t>(c);
+      auto const without = static_cast<std::size_t>(c - item.weight);
+      best[with] = std::max(best[with], best[without] + item.value);
+    }
+  }
+  return best[slots - 1];
+}
+
+template <typename Fn>
+void run_knapsack(benchmark::State &state, Fn fn) {
+  auto n = static_cast<std::size_t>(state.range(0));
+  auto problem = knapsack_make(n);
+  int expect = knapsack_dp_optimum(problem); // oracle value, computed before the benchmark loop
+
+  state.counters["n"] = static_cast<double>(n);
+  state.counters["capacity"] = problem.capacity;
+  // `problem` is moved into the callable only after its last direct use above.
+  lf_bench::bench(state, expect, [problem = std::move(problem), fn]() -> int {
+    return std::invoke(fn, problem);
+  });
+}
diff --git a/benchmark/lib/macros.hpp b/benchmark/lib/macros.hpp
new file mode 100644
index 000000000..3e561c70e
--- /dev/null
+++ b/benchmark/lib/macros.hpp
@@ -0,0 +1,172 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include "bench.hpp"
+
+// Use `import std;` by default. Textually `#include <thread>` drags in
+// `<stop_token>`, which triggers a libc++ 22 link-time bug (undefined
+// `__atomic_unique_lock::__set_locked_bit`) in TUs that later instantiate
+// anything touching std::stop_*. Targets that can't use modules (e.g. the
+// openmp benchmarks, see benchmark/src/openmp/CMakeLists.txt) define
+// LF_BENCH_NO_IMPORT_STD and get textual includes instead.
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <algorithm>
+  #include <cstdint>
+  #include <string>
+  #include <thread>
+#else
+import std;
+#endif
+
+#define BENCH_GET_FN(bench_fn, ...) bench_fn __VA_OPT__(<__VA_ARGS__>) // bench_fn or bench_fn<args...>
+
+namespace lf_bench {
+
+inline void bench_thread_args(benchmark::Benchmark *bench, auto make_args) {
+  unsigned const limit = std::max(1U, std::thread::hardware_concurrency()); // a reported 0 counts as 1
+  for (unsigned count : {1U, 2U, 4U, 6U, 8U, 12U, 16U, 24U, 32U, 48U, 64U, 96U}) {
+    if (count > limit) {
+      break; // the list is ascending, so every later entry exceeds the limit as well
+    }
+    make_args(bench, count);
+  }
+}
+
+inline auto sanitize(std::string s) -> std::string {
+  std::erase(s, ' '); // C++20 uniform erasure: removes every ' ' in place (replaces erase + std::remove)
+  return s;
+}
+
+inline auto
+format_name(std::string mode, std::string category, std::string name, std::string args) -> std::string {
+  // Layout: mode/category/name[/args], each piece with its spaces removed; empty args add no segment.
+  std::string path = sanitize(mode) + "/" + sanitize(category) + "/" + sanitize(name);
+  if (std::string tail = sanitize(args); !tail.empty()) {
+    path += "/" + tail;
+  }
+  return path;
+}
+
+inline void setup_single(benchmark::Benchmark *b, std::int64_t size) { b->Arg(size)->UseRealTime(); } // 1 arg
+
+inline void setup_mt(benchmark::Benchmark *b, std::int64_t size) {
+  b->Apply([size](benchmark::Benchmark *bm) {
+     bench_thread_args(bm, [size](benchmark::Benchmark *inner_b, unsigned t) {
+       inner_b->Args({size, static_cast<std::int64_t>(t)}); // argument 0 = size, argument 1 = thread count
+     });
+   })
+      ->Complexity(inverse_complexity) // fit timings against 1 / N
+      ->UseRealTime();
+}
+
+inline void setup_uts_mt(benchmark::Benchmark *b) {
+  b->Apply([](benchmark::Benchmark *bm) {
+     bench_thread_args(bm, [](benchmark::Benchmark *inner_b, unsigned t) {
+       inner_b->Arg(static_cast<std::int64_t>(t)); // the thread count is the only argument
+     });
+   })
+      ->Complexity(inverse_complexity) // fit timings against 1 / N
+      ->UseRealTime();
+}
+
+} // namespace lf_bench
+
+// --- Standard Benchmarks ---
+// Each use defines a uniquely named object whose constructor registers one single-argument benchmark.
+#define BENCH_ONE_WITH_ID(id, bench_fn, category, name, mode, prefix, ...)                                   \
+  namespace {                                                                                                \
+  struct benchmark_reg_##id {                                                                                \
+    benchmark_reg_##id() {                                                                                   \
+      auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__),   \
+                                             BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__));             \
+      lf_bench::setup_single(b, prefix##_##mode);                                                            \
+    }                                                                                                        \
+  } benchmark_reg_inst_##id;                                                                                 \
+  }
+
+#define BENCH_ONE_HIDDEN(id, ...) BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__)
+#define BENCH_ONE(bench_fn, category, name, mode, prefix, ...)                                               \
+  BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__)
+
+#define BENCH_ALL(bench_fn, category, name, prefix, ...)                                                     \
+  BENCH_ONE(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__)                               \
+  BENCH_ONE(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__)
+
+// --- Multi-Threaded Benchmarks ---
+// Same registration scheme, but setup_mt adds one {size, threads} argument pair per swept thread count.
+#define BENCH_ONE_MT_WITH_ID(id, bench_fn, category, name, mode, prefix, ...)                                \
+  namespace {                                                                                                \
+  struct benchmark_reg_##id {                                                                                \
+    benchmark_reg_##id() {                                                                                   \
+      auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__),   \
+                                             BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__));             \
+      lf_bench::setup_mt(b, prefix##_##mode);                                                                \
+    }                                                                                                        \
+  } benchmark_reg_inst_##id;                                                                                 \
+  }
+
+#define BENCH_ONE_MT_HIDDEN(id, ...) BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__)
+#define BENCH_ONE_MT(bench_fn, category, name, mode, prefix, ...)                                            \
+  BENCH_ONE_MT_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__)
+
+#define BENCH_ALL_MT(bench_fn, category, name, prefix, ...)                                                  \
+  BENCH_ONE_MT(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__)                            \
+  BENCH_ONE_MT(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__)
+
+// --- UTS Benchmarks ---
+// Registers bench_fn(state, tree_id) under "mode/category/uts/<tree_name>[/args]"; no Arg is added.
+#define UTS_BENCH_ONE_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...)                         \
+  namespace {                                                                                                \
+  struct benchmark_reg_##id {                                                                                \
+    benchmark_reg_##id() {                                                                                   \
+      auto *b = benchmark::RegisterBenchmark(                                                                \
+          lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__),                           \
+          [=](benchmark::State &state) {                                                                     \
+            BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id);                               \
+          });                                                                                                \
+      b->UseRealTime();                                                                                      \
+    }                                                                                                        \
+  } benchmark_reg_inst_##id;                                                                                 \
+  }
+
+#define UTS_BENCH_ONE_HIDDEN(id, ...) UTS_BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__)
+#define UTS_BENCH_ONE(bench_fn, category, mode, tree_name, tree_id, ...)                                     \
+  UTS_BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__)
+
+#define UTS_BENCH_ALL(bench_fn, category, ...)                                                               \
+  UTS_BENCH_ONE(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__)                 \
+  UTS_BENCH_ONE(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__)                 \
+  UTS_BENCH_ONE(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__)                           \
+  UTS_BENCH_ONE(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__)                           \
+  UTS_BENCH_ONE(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__)                        \
+  UTS_BENCH_ONE(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__)
+
+// --- UTS Multi-Threaded Benchmarks ---
+// Like the single-threaded UTS macros, but setup_uts_mt adds the swept thread count as the only Arg.
+#define UTS_BENCH_ONE_MT_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...)                      \
+  namespace {                                                                                                \
+  struct benchmark_reg_##id {                                                                                \
+    benchmark_reg_##id() {                                                                                   \
+      auto *b = benchmark::RegisterBenchmark(                                                                \
+          lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__),                           \
+          [=](benchmark::State &state) {                                                                     \
+            BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id);                               \
+          });                                                                                                \
+      lf_bench::setup_uts_mt(b);                                                                             \
+    }                                                                                                        \
+  } benchmark_reg_inst_##id;                                                                                 \
+  }
+
+#define UTS_BENCH_ONE_MT_HIDDEN(id, ...) UTS_BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__)
+#define UTS_BENCH_ONE_MT(bench_fn, category, mode, tree_name, tree_id, ...)                                  \
+  UTS_BENCH_ONE_MT_HIDDEN(                                                                                   \
+      __COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__)
+
+#define UTS_BENCH_ALL_MT(bench_fn, category, ...)                                                            \
+  UTS_BENCH_ONE_MT(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__)              \
+  UTS_BENCH_ONE_MT(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__)              \
+  UTS_BENCH_ONE_MT(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__)                        \
+  UTS_BENCH_ONE_MT(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__)                        \
+  UTS_BENCH_ONE_MT(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__)                     \
+  UTS_BENCH_ONE_MT(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__)
diff --git a/benchmark/lib/mandelbrot.hpp b/benchmark/lib/mandelbrot.hpp
new file mode 100644
index 000000000..bf312f172
--- /dev/null
+++ b/benchmark/lib/mandelbrot.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <complex>
+  #include <cstdint>
+  #include <functional>
+#else
+import std;
+#endif
+
+inline constexpr int mandelbrot_test = 128;
+inline constexpr int mandelbrot_base = 1024;
+
+inline constexpr int mandelbrot_max_iter = 256;
+
+inline constexpr double mandelbrot_x_min = -2.0;
+inline constexpr double mandelbrot_x_max = 1.0;
+inline constexpr double mandelbrot_y_min = -1.5;
+inline constexpr double mandelbrot_y_max = 1.5;
+
+inline constexpr auto mandelbrot_pixel(int px, int py, int n) -> int {
+  // Map pixel (px, py) of an n x n image onto the point cr + ci * i of the configured window.
+  double cr = mandelbrot_x_min + (mandelbrot_x_max - mandelbrot_x_min) * px / n;
+  double ci = mandelbrot_y_min + (mandelbrot_y_max - mandelbrot_y_min) * py / n;
+
+  double re = 0;
+  double im = 0;
+  int steps = 0;
+  // Iterate z = z^2 + c until |z|^2 exceeds 4 or the iteration cap is reached.
+  for (; steps < mandelbrot_max_iter && re * re + im * im <= 4.0; ++steps) {
+    double const next_re = re * re - im * im + cr;
+    im = 2 * re * im + ci;
+    re = next_re;
+  }
+
+  return steps;
+}
+
+inline auto mandelbrot_checksum(int n) -> std::uint64_t {
+  std::uint64_t total = 0; // running sum of the iteration counts of all n * n pixels
+  for (int row = 0; row < n; ++row) {
+    for (int col = 0; col < n; ++col) {
+      total += static_cast<std::uint64_t>(mandelbrot_pixel(col, row, n));
+    }
+  }
+  return total;
+}
+
+template <typename Fn>
+void run_mandelbrot(benchmark::State &state, Fn fn) {
+  int n = static_cast<int>(state.range(0));
+  std::uint64_t expect = mandelbrot_checksum(n); // serial reference checksum, computed once up front
+
+  state.counters["n"] = n;
+
+  lf_bench::bench(state, expect, [n, fn]() -> std::uint64_t {
+    return std::invoke(fn, n);
+  });
+}
diff --git a/benchmark/lib/matmul.hpp b/benchmark/lib/matmul.hpp
new file mode 100644
index 000000000..e51e07e06
--- /dev/null
+++ b/benchmark/lib/matmul.hpp
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <algorithm>
+  #include <bit>
+  #include <cmath>
+  #include <cstddef>
+  #include <cstdint>
+  #include <functional>
+  #include <memory>
+  #include <random>
+#else
+import std;
+#endif
+
+inline constexpr unsigned matmul_test = 64;
+inline constexpr unsigned matmul_base = 1024;
+
+inline constexpr unsigned strassen_test = 64;
+inline constexpr unsigned strassen_base = 1024;
+
+inline constexpr unsigned matmul_basecase = 32;
+
+static_assert(std::has_single_bit(matmul_test));
+static_assert(std::has_single_bit(matmul_base));
+
+struct matmul_args {
+  std::unique_ptr<float[]> A;   // n * n input, filled with random values by matmul_init
+  std::unique_ptr<float[]> B;   // n * n input, filled with random values by matmul_init
+  std::unique_ptr<float[]> C;   // n * n output buffer, zeroed by matmul_init
+  std::unique_ptr<float[]> ref; // n * n buffer for a reference product, zeroed by matmul_init
+  unsigned n;
+};
+
+inline auto matmul_init(unsigned n, std::uint64_t seed = 0xC0FFEE) -> matmul_args {
+  // Allocate the four n * n arrays; the size is computed in std::size_t.
+  matmul_args args{
+      std::make_unique<float[]>(static_cast<std::size_t>(n) * n),
+      std::make_unique<float[]>(static_cast<std::size_t>(n) * n),
+      std::make_unique<float[]>(static_cast<std::size_t>(n) * n),
+      std::make_unique<float[]>(static_cast<std::size_t>(n) * n),
+      n,
+  };
+
+  std::mt19937_64 rng{seed};
+  std::uniform_real_distribution<float> dist{0, 1};
+  // A and B draw alternately from the one seeded engine; C and ref start at zero.
+  for (std::size_t i = 0; i < static_cast<std::size_t>(n) * n; ++i) {
+    args.A[i] = dist(rng);
+    args.B[i] = dist(rng);
+    args.C[i] = 0;
+    args.ref[i] = 0;
+  }
+
+  return args;
+}
+
+inline void matmul_zero(float *C, unsigned n) {
+  // The element count is formed in std::size_t before multiplying.
+  std::size_t const cells = static_cast<std::size_t>(n) * n;
+  std::fill_n(C, cells, 0.0F);
+}
+
+inline auto matmul_max_relative_error(float const *A, float const *B, unsigned n) -> float {
+  constexpr float epsilon = 1e-8F; // floor for the denominator, so zero entries of A do not divide by 0
+  float error = 0;                 // largest |A[i] - B[i]| / max(|A[i]|, epsilon) seen so far
+  for (std::size_t i = 0; i < static_cast<std::size_t>(n) * n; ++i) {
+    float diff = std::abs(A[i] - B[i]) / std::max(std::abs(A[i]), epsilon);
+    if (std::isnan(diff) || diff > error) { // NaN never compares greater; it must not be skipped
+      error = diff;                         // once error is NaN it stays NaN and is returned
+    }
+  }
+  return error;
+}
+
+inline void matmul_iter(float const *A, float const *B, float *C, unsigned n) { // C = A * B, row-major
+  for (unsigned row = 0; row < n; ++row) {
+    for (unsigned col = 0; col < n; ++col) {
+      float dot = 0; // dot product of row `row` of A with column `col` of B
+      for (unsigned inner = 0; inner < n; ++inner) {
+        dot += A[row * n + inner] * B[inner * n + col];
+      }
+      C[row * n + col] = dot;
+    }
+  }
+}
+
+template <bool Add>
+inline void matmul_basecase_multiply(float const *A, float const *B, float *R, unsigned n, unsigned s) {
+  for (unsigned i = 0; i < n; ++i) {
+    for (unsigned j = 0; j < n; ++j) {
+      float sum = 0;
+      for (unsigned k = 0; k < n; ++k) {
+        sum += A[i * s + k] * B[k * s + j]; // n x n block; s is the row stride of A, B and R
+      }
+      if constexpr (Add) {
+        R[i * s + j] += sum; // Add == true: accumulate into R
+      } else {
+        R[i * s + j] = sum; // Add == false: overwrite R
+      }
+    }
+  }
+}
+
+inline auto matmul_error_is_acceptable(float err, float max_err) -> bool { return err <= max_err; } // NaN: no
+
+template <typename Fn>
+void run_matmul(benchmark::State &state, float max_relative_error, Fn fn) {
+  auto n = static_cast<unsigned>(state.range(0));
+  state.counters["n"] = n;
+
+  auto args = matmul_init(n);
+  matmul_iter(args.A.get(), args.B.get(), args.ref.get(), n);
+  // Each iteration clears C, runs fn, and yields the worst relative error against the reference product.
+  lf_bench::bench(state, max_relative_error, matmul_error_is_acceptable, [&]() -> float {
+    matmul_zero(args.C.get(), n);
+    std::invoke(fn, args.A.get(), args.B.get(), args.C.get(), n);
+    benchmark::DoNotOptimize(args.C.get());
+    return matmul_max_relative_error(args.ref.get(), args.C.get(), n);
+  });
+}
diff --git a/benchmark/lib/nqueens.hpp b/benchmark/lib/nqueens.hpp
new file mode 100644
index 000000000..088d1b973
--- /dev/null
+++ b/benchmark/lib/nqueens.hpp
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <array>
+  #include <cstdint>
+  #include <functional>
+  #include <vector>
+#else
+import std;
+#endif
+
+// Board sizes for the benchmark; presumably consumed by the registration
+// macros as the "test" and "base" problem sizes (macros not visible here).
+inline constexpr int nqueens_test = 8;
+inline constexpr int nqueens_base = 14;
+
+// Expected number of solutions indexed by board size n; run_nqueens looks up
+// nqueens_answers.at(n). Entries for n >= 1 are the n-queens solution counts
+// (OEIS A000170); index 0 is stored as 0 here.
+inline constexpr std::array<std::int64_t, 21> nqueens_answers = {
+    0,
+    1,
+    0,
+    0,
+    2,
+    10,
+    4,
+    40,
+    92,
+    352,
+    724,
+    2'680,
+    14'200,
+    73'712,
+    365'596,
+    2'279'184,
+    14'772'512,
+    95'815'104,
+    666'090'624,
+    4'968'057'848,
+    39'029'188'884,
+};
+
+// Returns true when no two of the first n entries of `a` attack each other,
+// where a[i] is the position of the queen placed at index i. Two queens clash
+// when their positions are equal or differ by exactly their index distance.
+inline auto queens_ok(int n, char const *a) -> bool {
+  for (int first = 0; first < n; ++first) {
+    for (int second = first + 1; second < n; ++second) {
+      int const gap = second - first;
+      int const delta = static_cast<int>(a[second]) - static_cast<int>(a[first]);
+      bool const same_line = delta == 0;
+      bool const same_diagonal = delta == gap || delta == -gap;
+      if (same_line || same_diagonal) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Benchmark driver for n-queens.
+//
+// Reads the board size n from state.range(0) and looks up the expected count
+// in nqueens_answers (std::array::at throws for sizes outside the table). The
+// benchmarked callable invokes fn(n, board.data()) on a scratch board of n
+// chars and its result is handed to lf_bench::bench together with `expect`.
+template <typename Fn>
+void run_nqueens(benchmark::State &state, Fn fn) {
+  int n = static_cast<int>(state.range(0));
+  std::int64_t expect = nqueens_answers.at(static_cast<std::size_t>(n));
+
+  state.counters["n"] = n;
+
+  // Scratch board; captured by reference so every invocation reuses it.
+  std::vector<char> board(static_cast<std::size_t>(n));
+
+  lf_bench::bench(state, expect, [n, &board, fn]() -> std::int64_t {
+    return std::invoke(fn, n, board.data());
+  });
+}
diff --git a/benchmark/lib/primes.hpp b/benchmark/lib/primes.hpp
new file mode 100644
index 000000000..b0154f8ba
--- /dev/null
+++ b/benchmark/lib/primes.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstdint>
+  #include <functional>
+#else
+import std;
+#endif
+
+// Problem sizes that primes_expected has reference values for (1e5 and 1e7).
+inline constexpr std::int64_t primes_test = 100'000;
+inline constexpr std::int64_t primes_base = 10'000'000;
+
+// Primality by 6k +/- 1 trial division, see
+// https://en.wikipedia.org/wiki/Primality_test
+//
+// Note: candidate * candidate is evaluated in std::int64_t, so n must stay
+// well below the int64 maximum (true for the configured sizes).
+inline constexpr auto is_prime(std::int64_t n) -> bool {
+  if (n < 2) {
+    return false;
+  }
+  if (n < 4) {
+    return true; // 2 and 3
+  }
+  if (n % 2 == 0 || n % 3 == 0) {
+    return false;
+  }
+  // Remaining candidates are tested in pairs (c, c + 2) with c = 5, 11, 17, ...
+  std::int64_t candidate = 5;
+  while (candidate * candidate <= n) {
+    if (n % candidate == 0 || n % (candidate + 2) == 0) {
+      return false;
+    }
+    candidate += 6;
+  }
+  return true;
+}
+
+// Reference values of the prime-counting function pi(n) for the two
+// configured sizes; any other n yields -1, meaning "no reference available".
+inline constexpr auto primes_expected(std::int64_t n) -> std::int64_t {
+  return n == primes_test   ? 9592      // pi(1e5)
+         : n == primes_base ? 664'579   // pi(1e7)
+                            : -1;
+}
+
+// Validation predicate for the primes benchmark: a negative `expect` means no
+// reference value is known and any result is accepted; otherwise the result
+// must match exactly.
+inline auto primes_count_is_correct(std::int64_t result, std::int64_t expect) -> bool {
+  if (expect < 0) {
+    return true;
+  }
+  return result == expect;
+}
+
+// Benchmark driver for prime counting.
+//
+// Reads n from state.range(0), looks up the reference count (or -1 when none
+// is known) and hands lf_bench::bench a callable that returns fn(n); the
+// result is validated with primes_count_is_correct. lf_bench::no_threads is
+// passed in the thread-count position of lf_bench::bench.
+template <typename Fn>
+void run_primes(benchmark::State &state, Fn fn) {
+  std::int64_t n = state.range(0);
+  std::int64_t expect = primes_expected(n);
+
+  state.counters["n"] = static_cast<double>(n);
+
+  lf_bench::bench(state, lf_bench::no_threads, expect, primes_count_is_correct, [n, fn]() -> std::int64_t {
+    return std::invoke(fn, n);
+  });
+}
diff --git a/benchmark/lib/quicksort.hpp b/benchmark/lib/quicksort.hpp
new file mode 100644
index 000000000..afaeb04af
--- /dev/null
+++ b/benchmark/lib/quicksort.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <algorithm>
+  #include <cstddef>
+  #include <cstdint>
+  #include <functional>
+  #include <random>
+  #include <vector>
+#else
+import std;
+#endif
+
+// Input lengths for the benchmark; presumably the "test" and "base" sizes
+// picked up by the registration macros (not visible here).
+inline constexpr std::size_t quicksort_test = 10'000;
+inline constexpr std::size_t quicksort_base = 10'000'000;
+
+// Not referenced in this header; presumably the range length below which
+// implementations stop recursing -- confirm against the callers.
+inline constexpr std::size_t quicksort_basecase = 32;
+
+// Builds n pseudo-random 32-bit values from a 64-bit Mersenne Twister seeded
+// with `seed`, so equal (n, seed) pairs always produce the same input.
+inline auto
+quicksort_make_input(std::size_t n, std::uint64_t seed = 0xDEADBEEF) -> std::vector<std::uint32_t> {
+  std::mt19937_64 engine{seed};
+  std::uniform_int_distribution<std::uint32_t> uniform;
+  std::vector<std::uint32_t> values(n);
+  // std::generate assigns front to back, i.e. the same draw order as a
+  // range-for over the vector.
+  std::generate(values.begin(), values.end(), [&] { return uniform(engine); });
+  return values;
+}
+
+// Benchmark driver for sorting.
+//
+// Generates a fixed input of n values and a std::sort-ed reference copy. Each
+// invocation of the benchmarked callable restores `work` from `source`, calls
+// fn(first, last) on it and returns whether it now equals the reference;
+// lf_bench::bench is told to expect `true`. Note that the copy and the
+// comparison happen inside the callable.
+template <typename Fn>
+void run_quicksort(benchmark::State &state, Fn fn) {
+  auto n = static_cast<std::size_t>(state.range(0));
+  state.counters["n"] = static_cast<double>(n);
+
+  std::vector<std::uint32_t> source = quicksort_make_input(n);
+  std::vector<std::uint32_t> reference = source;
+  std::sort(reference.begin(), reference.end());
+
+  // Pre-sized so the assignment inside the callable can reuse the allocation.
+  std::vector<std::uint32_t> work(n);
+
+  lf_bench::bench(state, true, [&]() -> bool {
+    work = source;
+    std::invoke(fn, work.data(), work.data() + work.size());
+    benchmark::DoNotOptimize(work.data());
+    return work == reference;
+  });
+}
diff --git a/benchmark/lib/scan.hpp b/benchmark/lib/scan.hpp
new file mode 100644
index 000000000..5d6d471dc
--- /dev/null
+++ b/benchmark/lib/scan.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstddef>
+  #include <cstdint>
+  #include <functional>
+  #include <vector>
+#else
+import std;
+#endif
+
+// Vector lengths for the benchmark; presumably the "test" and "base" sizes
+// picked up by the registration macros (not visible here).
+inline constexpr std::size_t scan_test = 1'000;
+inline constexpr std::size_t scan_base = 8'000;
+
+// Passed to the benchmarked callable as its third argument and reported as the
+// "reps" counter by run_scan.
+inline constexpr std::size_t scan_reps = 1'000;
+
+// Returns the vector {1, 2, ..., n} (values wrap modulo the range of unsigned).
+inline auto scan_make_vec(std::size_t n) -> std::vector<unsigned> {
+  std::vector<unsigned> values(n);
+  for (std::size_t idx = 0; idx < n; ++idx) {
+    values[idx] = static_cast<unsigned>(idx + 1);
+  }
+  return values;
+}
+
+// Benchmark driver for an inclusive scan over 1..n.
+//
+// The benchmarked callable invokes fn(in, out, scan_reps) and returns the last
+// element of `out`, which lf_bench::bench compares with the closed-form sum.
+// NOTE(review): out.back() requires n > 0; the configured sizes satisfy this.
+template <typename Fn>
+void run_scan(benchmark::State &state, Fn fn) {
+  auto n = static_cast<std::size_t>(state.range(0));
+  state.counters["n"] = static_cast<double>(n);
+  state.counters["reps"] = static_cast<double>(scan_reps);
+
+  std::vector<unsigned> in = scan_make_vec(n);
+  std::vector<unsigned> out(n);
+
+  // For 1..n the inclusive scan's last element equals n*(n+1)/2 (mod 2^32).
+  // The product is formed in 64 bits and then truncated to unsigned.
+  unsigned expect = static_cast<unsigned>(static_cast<std::uint64_t>(n) * (n + 1) / 2);
+
+  lf_bench::bench(state, expect, [&]() -> unsigned {
+    std::invoke(fn, in, out, scan_reps);
+    benchmark::DoNotOptimize(out.data());
+    return out.back();
+  });
+}
diff --git a/benchmark/lib/skynet.hpp b/benchmark/lib/skynet.hpp
new file mode 100644
index 000000000..faa7a1b86
--- /dev/null
+++ b/benchmark/lib/skynet.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "bench.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <cstdint>
+  #include <functional>
+#else
+import std;
+#endif
+
+// Number of children per tree node; skynet_leaves multiplies by this once per
+// level of depth.
+inline constexpr int skynet_branching = 10;
+
+// Tree depth: total leaves = branching ** depth.
+inline constexpr int skynet_test = 4; // 10^4 = 10'000 leaves
+inline constexpr int skynet_base = 6; // 10^6 = 1'000'000 leaves
+
+// Number of leaves of a tree with the given depth: skynet_branching raised to
+// `depth` (1 for depth <= 0).
+inline constexpr auto skynet_leaves(int depth) -> std::int64_t {
+  std::int64_t leaves = 1;
+  int remaining = depth;
+  while (remaining > 0) {
+    leaves *= skynet_branching;
+    --remaining;
+  }
+  return leaves;
+}
+
+// Expected benchmark result for a tree of the given depth: the sum
+// 0 + 1 + ... + (leaves - 1), i.e. leaves * (leaves - 1) / 2.
+inline constexpr auto skynet_expected(int depth) -> std::int64_t {
+  std::int64_t const count = skynet_leaves(depth);
+  std::int64_t const doubled_sum = count * (count - 1);
+  return doubled_sum / 2;
+}
+
+// Benchmark driver for skynet.
+//
+// Reads the tree depth from state.range(0), records depth and leaf count as
+// counters, and hands lf_bench::bench a callable that returns fn(0, depth);
+// the result is compared with skynet_expected(depth).
+template <typename Fn>
+void run_skynet(benchmark::State &state, Fn fn) {
+  int depth = static_cast<int>(state.range(0));
+  std::int64_t expect = skynet_expected(depth);
+
+  state.counters["depth"] = depth;
+  state.counters["leaves"] = static_cast<double>(skynet_leaves(depth));
+
+  lf_bench::bench(state, expect, [depth, fn]() -> std::int64_t {
+    return std::invoke(fn, 0, depth);
+  });
+}
diff --git a/benchmark/lib/uts.cpp b/benchmark/lib/uts.cpp
new file mode 100644
index 000000000..90964d48c
--- /dev/null
+++ b/benchmark/lib/uts.cpp
@@ -0,0 +1,159 @@
+#include "uts.hpp"
+
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <exception>
+#else
+import std;
+#endif
+
+namespace {
+
+// Resets every parameter touched by this file to one fixed baseline, so each
+// setup_* function below starts from the same state regardless of which tree
+// was configured before. The variables are declared outside this file
+// (presumably by uts/uts.h).
+void reset_uts() {
+  type = GEO;
+  b_0 = 4.0;
+  rootId = 0;
+  nonLeafBF = 4;
+  nonLeafProb = 15.0 / 64.0;
+  gen_mx = 6;
+  shape_fn = LINEAR;
+  shiftDepth = 0.5;
+  computeGranularity = 1;
+  debug = 0;
+  verbose = 1;
+}
+
+// NOTE(review): the setup_* functions select the tree type and shape through
+// numeric casts. Going by the comments on uts_tree, tree_t 1 is presumably the
+// geometric tree (GEO), tree_t 0 the binomial tree, and geoshape_t 3 the
+// "fixed" shape -- confirm against the enums in uts/uts.h.
+
+// (T1 mini) Geometric
+void setup_t1_mini() {
+  reset_uts();
+  type = static_cast<tree_t>(1);
+  shape_fn = static_cast<geoshape_t>(3);
+  gen_mx = 7;
+  b_0 = 4;
+  rootId = 19;
+}
+
+// (T1) Geometric
+void setup_t1() {
+  reset_uts();
+  type = static_cast<tree_t>(1);
+  shape_fn = static_cast<geoshape_t>(3);
+  gen_mx = 10;
+  b_0 = 4;
+  rootId = 19;
+}
+
+// (T1L) Geometric
+void setup_t1l() {
+  reset_uts();
+  type = static_cast<tree_t>(1);
+  shape_fn = static_cast<geoshape_t>(3);
+  gen_mx = 13;
+  b_0 = 4;
+  rootId = 29;
+}
+
+// (T1XXL) Geometric
+void setup_t1xxl() {
+  reset_uts();
+  type = static_cast<tree_t>(1);
+  shape_fn = static_cast<geoshape_t>(3);
+  gen_mx = 15;
+  b_0 = 4;
+  rootId = 19;
+}
+
+// (T3 mini) Binomial
+void setup_t3_mini() {
+  reset_uts();
+  type = static_cast<tree_t>(0);
+  b_0 = 20;
+  nonLeafBF = 8;
+  nonLeafProb = 0.124875;
+  rootId = 42;
+}
+
+// (T3) Binomial
+void setup_t3() {
+  reset_uts();
+  type = static_cast<tree_t>(0);
+  b_0 = 2000;
+  nonLeafBF = 8;
+  nonLeafProb = 0.124875;
+  rootId = 42;
+}
+
+// (T3L) Binomial
+void setup_t3l() {
+  reset_uts();
+  type = static_cast<tree_t>(0);
+  b_0 = 2000;
+  nonLeafBF = 5;
+  nonLeafProb = 0.200014;
+  rootId = 7;
+}
+
+// (T3XXL) Binomial
+void setup_t3xxl() {
+  reset_uts();
+  type = static_cast<tree_t>(0);
+  b_0 = 2000;
+  nonLeafBF = 2;
+  nonLeafProb = 0.499995;
+  rootId = 316;
+}
+
+} // namespace
+
+// Configures the global UTS parameters for the requested tree by dispatching
+// to the matching setup_* helper; an unknown enumerator terminates the program.
+void setup_tree(uts_tree tree) {
+  switch (tree) {
+    case uts_t1_mini:
+      return setup_t1_mini();
+    case uts_t1:
+      return setup_t1();
+    case uts_t1l:
+      return setup_t1l();
+    case uts_t1xxl:
+      return setup_t1xxl();
+    case uts_t3_mini:
+      return setup_t3_mini();
+    case uts_t3:
+      return setup_t3();
+    case uts_t3l:
+      return setup_t3l();
+    case uts_t3xxl:
+      return setup_t3xxl();
+    default:
+      std::terminate();
+  }
+}
+
+// Reference traversal statistics (maximum depth, node count, leaf count) for
+// each tree configured by setup_tree; an unknown enumerator terminates.
+// NOTE(review): some sizes exceed 2^32 (e.g. 4230646601), so counter_t must be
+// at least 64 bits wide -- confirm against uts/uts.h.
+auto expected_result(uts_tree tree) -> result {
+  switch (tree) {
+    case uts_t1_mini:
+      return {.maxdepth = 7, .size = 63914, .leaves = 51124};
+    case uts_t1:
+      return {.maxdepth = 10, .size = 4130071, .leaves = 3305118};
+    case uts_t1l:
+      return {.maxdepth = 13, .size = 102181082, .leaves = 81746377};
+    case uts_t1xxl:
+      return {.maxdepth = 15, .size = 4230646601, .leaves = 3384495738};
+    case uts_t3_mini:
+      return {.maxdepth = 67, .size = 6213, .leaves = 5438};
+    case uts_t3:
+      return {.maxdepth = 1572, .size = 4112897, .leaves = 3599034};
+    case uts_t3l:
+      return {.maxdepth = 17844, .size = 111345631, .leaves = 89076904};
+    case uts_t3xxl:
+      return {.maxdepth = 99049, .size = 2793220501, .leaves = 1396611250};
+    default:
+      std::terminate();
+  }
+}
diff --git a/benchmark/lib/uts.hpp b/benchmark/lib/uts.hpp
new file mode 100644
index 000000000..61e4361dd
--- /dev/null
+++ b/benchmark/lib/uts.hpp
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include "bench.hpp"
+
+// Include the C UTS library header first (it defines max/min macros that would
+// clash with std::max/std::min after import std).
+#include "uts/uts.h"
+
+#undef max
+#undef min
+
+// Standard library access: textual includes when modules are disabled,
+// otherwise `import std`. <compare> is needed by the defaulted operator<=> of
+// `result` and <functional> by std::invoke in run_uts; both were previously
+// only available through transitive includes.
+#ifdef LF_BENCH_NO_IMPORT_STD
+  #include <compare>
+  #include <cstdint>
+  #include <format>
+  #include <functional>
+  #include <string>
+#else
+import std;
+#endif
+
+// Statistics returned by a UTS traversal and compared against
+// expected_result. counter_t is declared outside this file (presumably by
+// uts/uts.h). The defaulted operator<=> also provides operator==.
+struct result {
+  counter_t maxdepth;
+  counter_t size;
+  counter_t leaves;
+  auto operator<=>(const result &) const = default;
+};
+
+// std::format support for `result`: renders "{maxdepth=.., size=.., leaves=..}"
+// and delegates to the std::string formatter, so string format specs apply.
+template <>
+struct std::formatter<result> : std::formatter<std::string> {
+  auto format(const result &r, auto &ctx) const {
+    using base = std::formatter<std::string>;
+    std::string const text =
+        std::format("{{maxdepth={}, size={}, leaves={}}}", r.maxdepth, r.size, r.leaves);
+    return base::format(text, ctx);
+  }
+};
+
+// Bundles a traversal result with a Node; not used in this header, presumably
+// by the benchmark implementations -- confirm against the callers.
+struct pair {
+  result res;
+  Node child;
+};
+
+// Tree configurations understood by setup_tree and expected_result.
+enum uts_tree : char {
+  uts_t1_mini, // Geometric [fixed],  ~64K nodes  (test only)
+  uts_t1,      // Geometric [fixed],  ~4M nodes
+  uts_t1l,     // Geometric [fixed],  ~102M nodes
+  uts_t1xxl,   // Geometric [fixed],  ~4.2B nodes
+  uts_t3_mini, // Binomial,           ~6K nodes   (test only)
+  uts_t3,      // Binomial,           ~4M nodes
+  uts_t3l,     // Binomial,           ~111M nodes
+  uts_t3xxl,   // Binomial,           ~2.8B nodes
+};
+
+void setup_tree(uts_tree tree);
+
+auto expected_result(uts_tree tree) -> result;
+
+// Benchmark driver for UTS with an explicit thread count.
+//
+// Configures the global tree parameters via setup_tree, then hands
+// lf_bench::bench a callable that initialises a fresh root node with
+// uts_initRoot (using the global `type`) and returns fn(&root); the result is
+// compared with expected_result(tree).
+template <typename Fn>
+void run_uts(benchmark::State &state, uts_tree tree, std::int64_t threads, Fn fn) {
+  setup_tree(tree);
+  auto expect = expected_result(tree);
+
+  lf_bench::bench(state, threads, expect, [fn]() -> result {
+    Node root;
+    uts_initRoot(&root, type);
+    return std::invoke(fn, &root);
+  });
+}
+
+// Convenience overload: forwards to the four-argument run_uts with
+// lf_bench::no_threads as the thread count.
+template <typename Fn>
+void run_uts(benchmark::State &state, uts_tree tree, Fn fn) {
+  run_uts(state, tree, lf_bench::no_threads, fn);
+}
diff --git a/benchmark/src/baremetal/CMakeLists.txt b/benchmark/src/baremetal/CMakeLists.txt
new file mode 100644
index 000000000..ea6f1e69f
--- /dev/null
+++ b/benchmark/src/baremetal/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Library holding the "baremetal" benchmarks. No library type is given, so
+# CMake picks STATIC or SHARED from BUILD_SHARED_LIBS.
+# NOTE(review): the benchmarks register themselves from this library; with a
+# static library the final link must keep these objects -- confirm how the
+# executable links it.
+add_library(baremetal_benchmarks)
+
+target_sources(baremetal_benchmarks PRIVATE fib.cpp)
+
+target_link_libraries(baremetal_benchmarks PUBLIC benchmark_common libfork::libfork)
diff --git a/benchmark/src/baremetal/fib.cpp b/benchmark/src/baremetal/fib.cpp
new file mode 100644
index 000000000..b2bd9798a
--- /dev/null
+++ b/benchmark/src/baremetal/fib.cpp
@@ -0,0 +1,148 @@
+#include <benchmark/benchmark.h>
+
+#include "fib.hpp"
+#include "macros.hpp"
+
+import std;
+
+import libfork;
+
+// === Coroutine
+
+namespace {
+
+// ==== Allocators ==== //
+
+// Rounds n up to the next multiple of the default operator-new alignment
+// (assumed to be a power of two, as the bit mask requires).
+[[nodiscard]]
+inline auto fib_align_size(std::size_t n) -> std::size_t {
+  constexpr std::size_t mask = std::size_t{__STDCPP_DEFAULT_NEW_ALIGNMENT__} - 1;
+  return (n + mask) & ~mask;
+}
+
+// Per-thread top of the bump-allocated coroutine stack: promise_type's
+// operator new advances it and operator delete rewinds it. It is null until
+// fib_coro_no_queue points it at its buffer.
+constinit inline thread_local std::byte *tls_bump_ptr = nullptr;
+
+// Minimal awaitable coroutine type for the baremetal fib benchmark.
+//
+// The coroutine starts suspended, writes its co_return value through a pointer
+// installed with set(), and on completion destroys its own frame and resumes
+// the awaiting coroutine (if any) by symmetric transfer.
+struct task {
+  struct promise_type {
+
+    // Frames are carved out of the thread-local bump buffer: hand out the
+    // current top and advance it by the aligned frame size. There is no bounds
+    // check against the end of the buffer.
+    static auto operator new(std::size_t sz) -> void * {
+      auto *prev = tls_bump_ptr;
+      tls_bump_ptr += fib_align_size(sz);
+      return prev;
+    }
+
+    // Releasing a frame rewinds the bump pointer to the start of that frame,
+    // which is only correct when frames are destroyed in reverse order of
+    // allocation.
+    static auto operator delete(void *p, [[maybe_unused]] std::size_t sz) noexcept -> void {
+      tls_bump_ptr = std::bit_cast<std::byte *>(p);
+    }
+
+    auto get_return_object() -> task { return {std::coroutine_handle<promise_type>::from_promise(*this)}; }
+
+    auto initial_suspend() -> std::suspend_always { return {}; }
+
+    auto final_suspend() noexcept {
+      struct final_awaitable : std::suspend_always {
+        auto await_suspend(std::coroutine_handle<promise_type> h) noexcept -> std::coroutine_handle<> {
+
+          // Copy the continuation out first: the promise is destroyed together
+          // with the frame by h.destroy() below.
+          std::coroutine_handle<> cont = h.promise().continuation;
+
+          h.destroy();
+
+          if (cont) {
+            return cont;
+          }
+
+          // No awaiter (the coroutine was resumed directly): return control to
+          // whoever called resume().
+          return std::noop_coroutine();
+        }
+      };
+
+      return final_awaitable{};
+    }
+
+    // `value` must have been pointed at valid storage via task::set() before
+    // the coroutine returns; it is null by default.
+    void return_value(std::int64_t val) { *value = val; }
+    void unhandled_exception() { std::terminate(); }
+
+    std::int64_t *value = nullptr;
+    std::coroutine_handle<> continuation = nullptr;
+  };
+
+  std::coroutine_handle<promise_type> coro;
+
+  // Installs the destination for the coroutine's result and returns *this so
+  // the call can be chained into co_await or .coro.resume().
+  auto set(std::int64_t &out) -> task & {
+    coro.promise().value = &out;
+    return *this;
+  }
+
+  auto await_ready() noexcept -> bool { return false; }
+
+  // Records the awaiting coroutine as the continuation and transfers control
+  // to this task's coroutine.
+  auto await_suspend(std::coroutine_handle<> h) -> std::coroutine_handle<promise_type> {
+    coro.promise().continuation = h;
+    return coro;
+  }
+
+  void await_resume() noexcept {}
+};
+
+// Recursive Fibonacci as a coroutine: each child writes its result into a
+// local of the parent frame through task::set, and the children are awaited
+// one after the other (n - 2 first, then n - 1).
+auto fib(std::int64_t n) -> task {
+  if (n <= 1) {
+    co_return n;
+  }
+  std::int64_t two_back = 0;
+  std::int64_t one_back = 0;
+  co_await fib(n - 2).set(two_back);
+  co_await fib(n - 1).set(one_back);
+  co_return two_back + one_back;
+}
+
+// Benchmarks the minimal coroutine fib on a thread-local bump-allocated stack.
+//
+// An 8 MiB buffer backs every coroutine frame created during the run. Frames
+// release their storage by rewinding tls_bump_ptr, so once run_fib returns the
+// pointer must be back at the start of the buffer; anything else means a frame
+// leaked and the program terminates.
+template <typename = void>
+void fib_coro_no_queue(benchmark::State &state) {
+  // 8MB stack
+  std::unique_ptr buffer = std::make_unique<std::byte[]>(1024 * 1024 * 8);
+  tls_bump_ptr = buffer.get();
+
+  run_fib(state, [](std::int64_t n) {
+    std::int64_t result = 0;
+    // The top-level task has no continuation, so resume() returns once the
+    // whole recursion has finished.
+    fib(n).set(result).coro.resume();
+    return result;
+  });
+
+  if (tls_bump_ptr != buffer.get()) {
+    std::terminate(); // Stack leak
+  }
+
+  // Do not leave the thread-local pointing into the buffer that is about to be
+  // freed (mirrors fib_recursive_deque, which clears tls_deque on exit).
+  tls_bump_ptr = nullptr;
+}
+
+// === Recursive with Deque overhead
+
+// Per-thread deque used by fib_recursive_deque_impl; set by
+// fib_recursive_deque for the duration of a benchmark and null otherwise.
+constinit inline thread_local lf::deque<std::int64_t> *tls_deque = nullptr;
+
+// Accessor for the current thread's deque; dereferences tls_deque without a
+// null check, so it must only be called while a benchmark has installed one.
+auto deque() -> lf::deque<std::int64_t> & { return *tls_deque; }
+
+// Plain recursive Fibonacci that additionally pushes and pops one deque entry
+// around the first recursive call, to emulate the cost of creating and
+// scheduling a work item.
+auto fib_recursive_deque_impl(std::int64_t n) -> std::int64_t {
+  if (n <= 1) {
+    return n;
+  }
+
+  // Emulated "fork": publish an item, run the first branch, take it back.
+  deque().push(n);
+  std::int64_t const two_back = fib_recursive_deque_impl(n - 2);
+  deque().pop();
+
+  return two_back + fib_recursive_deque_impl(n - 1);
+}
+
+// Benchmarks fib_recursive_deque_impl with a freshly constructed deque
+// installed as this thread's tls_deque for the duration of the run.
+template <typename = void>
+void fib_recursive_deque(benchmark::State &state) {
+  lf::deque<std::int64_t> work_queue{64};
+  tls_deque = &work_queue;
+
+  run_fib(state, fib_recursive_deque_impl);
+
+  // The deque dies with this function, so clear the thread-local first.
+  tls_deque = nullptr;
+}
+
+} // namespace
+
+// Minimal coroutine, bump allocated (thread-local) stack
+BENCH_ALL(fib_coro_no_queue, baremetal, coro, fib)
+
+// Plain recursion with one deque push/pop per non-leaf call
+BENCH_ALL(fib_recursive_deque, baremetal, deque, fib)
diff --git a/benchmark/src/benchmarks.cpp b/benchmark/src/benchmarks.cpp
new file mode 100644
index 000000000..d6cf26f54
--- /dev/null
+++ b/benchmark/src/benchmarks.cpp
@@ -0,0 +1 @@
+// Benchmarks are registered in the linked sub-libraries.
diff --git a/benchmark/src/libfork/CMakeLists.txt b/benchmark/src/libfork/CMakeLists.txt
new file mode 100644
index 000000000..1c444c60e
--- /dev/null
+++ b/benchmark/src/libfork/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Library holding the libfork benchmarks. No library type is given, so CMake
+# picks STATIC or SHARED from BUILD_SHARED_LIBS.
+add_library(libfork_benchmarks)
+
+# Sources plus the private helper header, which is listed in a HEADERS file set
+# rooted at this directory.
+target_sources(libfork_benchmarks
+  PRIVATE
+    fib.cpp
+    fold.cpp
+    uts.cpp
+    switch_io_pool.cpp
+    switch_random.cpp
+  PRIVATE
+    FILE_SET HEADERS
+    BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}
+    FILES
+      helpers.hpp
+)
+
+target_link_libraries(libfork_benchmarks
+  PUBLIC
+    benchmark_common
+    libfork::libfork
+)
diff --git a/benchmark/src/libfork/fib.cpp b/benchmark/src/libfork/fib.cpp
new file mode 100644
index 000000000..c3bf502be
--- /dev/null
+++ b/benchmark/src/libfork/fib.cpp
@@ -0,0 +1,79 @@
+#include <benchmark/benchmark.h>
+
+#include "fib.hpp"
+
+#include "helpers.hpp"
+
+import std;
+
+import libfork;
+
+// === Coroutine
+
+namespace {
+
+// Fork-join Fibonacci task for libfork, generic over the worker context.
+struct fib {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>, std::int64_t n) -> lf::task<std::int64_t, Context> {
+    if (n < 2) {
+      co_return n;
+    }
+
+    std::int64_t lhs = 0;
+    std::int64_t rhs = 0;
+
+    auto sc = co_await lf::scope();
+
+    // fib(n - 2) is forked and fib(n - 1) is called; both write into locals of
+    // this frame, which are only read after the join below.
+    co_await sc.fork(&rhs, fib{}, n - 2);
+    co_await sc.call(&lhs, fib{}, n - 1);
+
+    co_await sc.join();
+
+    co_return lhs + rhs;
+  }
+};
+
+// Benchmark entry point: builds a scheduler of type Sch from the benchmark
+// state and runs the fib task on it through run_fib, passing the thread count
+// reported by thread_count<Sch>.
+template <lf::scheduler Sch>
+void run(benchmark::State &state) {
+
+  auto threads = static_cast<std::int64_t>(thread_count<Sch>(state));
+  Sch scheduler = make_scheduler<Sch>(state);
+
+  run_fib(state, threads, [&](std::int64_t n) -> std::int64_t {
+    return lf::schedule(scheduler, fib{}, n).get();
+  });
+}
+
+} // namespace
+
+using lf::adapt_deque;
+using lf::adapt_vector;
+
+using lf::adaptor_stack;
+using lf::geometric_stack;
+using lf::slab_stack;
+
+// Inline schedulers: every stack type is paired with each queue adaptor, in
+// both the mono and the poly flavour.
+
+// -- Vector
+
+LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler<adaptor_stack<>, adapt_vector<>>)
+LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler<adaptor_stack<>, adapt_vector<>>)
+
+LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler<slab_stack<>, adapt_vector<>>)
+LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler<slab_stack<>, adapt_vector<>>)
+
+LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler<geometric_stack<>, adapt_vector<>>)
+LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler<geometric_stack<>, adapt_vector<>>)
+
+// -- Deque
+
+LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler<adaptor_stack<>, adapt_deque<>>)
+LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler<adaptor_stack<>, adapt_deque<>>)
+
+LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler<slab_stack<>, adapt_deque<>>)
+LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler<slab_stack<>, adapt_deque<>>)
+
+LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler<geometric_stack<>, adapt_deque<>>)
+LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler<geometric_stack<>, adapt_deque<>>)
+
+// -- Busy pools (multi-threaded registration macro)
+
+LIBFORK_BENCH_ALL_MT(run, fib, fib, mono_busy_pool)
+LIBFORK_BENCH_ALL_MT(run, fib, fib, poly_busy_pool)
diff --git a/benchmark/src/libfork/fold.cpp b/benchmark/src/libfork/fold.cpp
new file mode 100644
index 000000000..194968b32
--- /dev/null
+++ b/benchmark/src/libfork/fold.cpp
@@ -0,0 +1,102 @@
+#include <benchmark/benchmark.h>
+
+#include "fold.hpp"
+
+#include "helpers.hpp"
+
+import std;
+
+import libfork;
+
+namespace {
+
+// Synchronous projection for lf::fold: converts an element to the accumulator
+// type with a plain static_cast.
+template <typename T>
+struct sync_projection {
+  static constexpr auto operator()(T value) -> fold_accum_t<T> { return static_cast<fold_accum_t<T>>(value); }
+};
+
+// Asynchronous projection for lf::fold: performs the same conversion as
+// sync_projection, but as a libfork task.
+template <typename T>
+struct async_projection {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, T value) -> lf::task<fold_accum_t<T>, Context> {
+    co_return static_cast<fold_accum_t<T>>(value);
+  }
+};
+
+// Selects the projection object for the requested mode at compile time:
+// sync_projection<T> for fold_projection_mode::sync, async_projection<T> for
+// every other mode.
+template <fold_projection_mode Projection, typename T>
+constexpr auto make_projection() {
+  if constexpr (Projection != fold_projection_mode::sync) {
+    return async_projection<T>{};
+  } else {
+    return sync_projection<T>{};
+  }
+}
+
+// Runs lf::fold with std::plus over `range` on the given scheduler and returns
+// the folded value.
+//
+// Chunk selects between the overload without a chunk argument (deduced) and an
+// explicit chunk size of 1 (explicit_one) or 4096 (any other mode). The value
+// obtained from schedule(...).get() is dereferenced before returning.
+template <fold_chunk_mode Chunk,
+          fold_projection_mode Projection,
+          typename T,
+          lf::scheduler Sch,
+          typename Range>
+auto run_fold(Sch &pool, Range &&range) -> fold_accum_t<T> {
+
+  auto projection = make_projection<Projection, T>();
+
+  using std::ranges::begin;
+  using std::ranges::end;
+
+  using diff_t = std::ranges::range_difference_t<Range>;
+
+  if constexpr (Chunk == fold_chunk_mode::deduced) {
+    return *lf::schedule(pool, lf::fold, begin(range), end(range), std::plus<>{}, projection).get();
+  } else {
+    constexpr diff_t chunk = Chunk == fold_chunk_mode::explicit_one ? diff_t{1} : diff_t{4096};
+    return *lf::schedule(pool, lf::fold, begin(range), end(range), chunk, std::plus<>{}, projection).get();
+  }
+}
+
+// Benchmark entry point for the single-worker fold variants: folds the input
+// supplied by run_fold_input on a mono_busy_pool constructed with 1.
+template <fold_data_mode Data, fold_chunk_mode Chunk, fold_projection_mode Projection, typename T>
+void run(benchmark::State &state) {
+
+  mono_busy_pool pool{1};
+
+  run_fold_input<Data, T>(state, [&](auto const &values) -> fold_accum_t<T> {
+    return run_fold<Chunk, Projection, T>(pool, values);
+  });
+}
+
+// Benchmark entry point for the multi-threaded fold variants: the scheduler is
+// built from the benchmark state and its thread count is forwarded to
+// run_fold_input.
+template <fold_data_mode Data,
+          fold_chunk_mode Chunk,
+          fold_projection_mode Projection,
+          typename T,
+          lf::scheduler Sch>
+void run_mt(benchmark::State &state) {
+
+  auto threads = static_cast<std::int64_t>(thread_count<Sch>(state));
+  Sch pool = make_scheduler<Sch>(state);
+
+  run_fold_input<Data, T>(state, threads, [&](auto const &values) -> fold_accum_t<T> {
+    return run_fold<Chunk, Projection, T>(pool, values);
+  });
+}
+
+} // namespace
+
+// Chunked/sync/sync versions to mirror serial benchmarks.
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, int32)
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32)
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, int32)
+LF_FOLD_BENCH_SIZES(run, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32)
+
+// Compare specialised for sync/async (no largest size)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, sync_proj, float32)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, sync_proj, float32)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_1, async_proj, float32)
+LF_FOLD_BENCH_SIZES_SMALL(run, libfork, fold / std_plus, memory, chunk_deduced, async_proj, float32)
+
+// Local shorthand for LF_FOLD_BENCH_SIZES_MT, presumably to keep the
+// registrations below within the column limit.
+#define MT(...) LF_FOLD_BENCH_SIZES_MT(__VA_ARGS__)
+
+// Multi-threaded float32/sync projection.
+MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, mono_busy_pool)
+MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, mono_busy_pool)
+MT(run_mt, libfork, fold / std_plus, memory, chunk_fixed, sync_proj, float32, poly_busy_pool)
+MT(run_mt, libfork, fold / std_plus, lazy, chunk_fixed, sync_proj, float32, poly_busy_pool)
diff --git a/benchmark/src/libfork/helpers.hpp b/benchmark/src/libfork/helpers.hpp
new file mode 100644
index 000000000..8fbef45dc
--- /dev/null
+++ b/benchmark/src/libfork/helpers.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+
+import std;
+
+import libfork;
+
+// Number of worker threads the benchmark will use for scheduler type Sch:
+// state.range(1) when Sch can be constructed from a std::size_t, otherwise 1.
+template <lf::scheduler Sch>
+auto thread_count(benchmark::State &state) -> std::size_t {
+  if constexpr (!std::constructible_from<Sch, std::size_t>) {
+    return 1;
+  } else {
+    auto const requested = state.range(1);
+    return static_cast<std::size_t>(requested);
+  }
+}
+
+// Constructs the scheduler for a benchmark run: from state.range(1) when Sch
+// accepts a std::size_t, otherwise default-constructed.
+template <lf::scheduler Sch>
+auto make_scheduler(benchmark::State &state) -> Sch {
+  if constexpr (!std::constructible_from<Sch, std::size_t>) {
+    return Sch{};
+  } else {
+    return Sch{static_cast<std::size_t>(state.range(1))};
+  }
+}
+
+// Busy-pool schedulers used by the multi-threaded benchmarks, both fixed to
+// lf::geometric_stack<>.
+using mono_busy_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_busy_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+// Thin wrappers around the generic registration macros that insert `libfork`
+// as the second argument. __VA_OPT__(, ) adds the separating comma only when
+// extra arguments are supplied.
+#define LIBFORK_BENCH_ALL(bench_fn, name, prefix, ...)                                                       \
+  BENCH_ALL(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__)
+
+#define LIBFORK_BENCH_ALL_MT(bench_fn, name, prefix, ...)                                                    \
+  BENCH_ALL_MT(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__)
+
+#define LIBFORK_UTS_BENCH_ONE_MT(bench_fn, mode, tree_name, tree_id, ...)                                    \
+  UTS_BENCH_ONE_MT(bench_fn, libfork, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__)
+
+#define LIBFORK_UTS_BENCH_ALL_MT(bench_fn, ...) UTS_BENCH_ALL_MT(bench_fn, libfork __VA_OPT__(, ) __VA_ARGS__)
diff --git a/benchmark/src/libfork/switch_io_pool.cpp b/benchmark/src/libfork/switch_io_pool.cpp
new file mode 100644
index 000000000..d14199aa0
--- /dev/null
+++ b/benchmark/src/libfork/switch_io_pool.cpp
@@ -0,0 +1,185 @@
+#include <benchmark/benchmark.h>
+
+#include "helpers.hpp"
+
+import std;
+
+import libfork;
+
+// Constants must be at file scope (outside any namespace) so the macro
+// expansion can paste `requests_test` / `requests_base` from any position
+// in the translation unit.
+// They are the number of requests forked per benchmark run (read back from
+// state.range(0) in run_with_io / run_baseline).
+inline constexpr std::int64_t requests_test = 64;
+inline constexpr std::int64_t requests_base = (1 << 16) - 2; // 65'534
+
+namespace {
+
+// Iteration counts passed to do_work: each request spends k_compute_units / 2
+// before and after its middle section, which costs k_io_units.
+inline constexpr std::int64_t k_compute_units = 256;
+inline constexpr std::int64_t k_io_units = 32;
+
+// Worker count for the IO pool: one eighth of the hardware concurrency, but
+// never fewer than two.
+inline auto k_io_workers() -> unsigned {
+  unsigned const share = std::thread::hardware_concurrency() / 8u;
+  return std::max(2u, share);
+}
+
+// Deterministic busy-loop: folds the loop index into an accumulator n times
+// and returns it, so callers can check that the work was actually performed.
+// Returns 0 for n <= 0.
+//
+// The accumulator grows by roughly 1.5x per iteration and leaves the int64
+// range after about a hundred iterations (k_compute_units / 2 is 128). The
+// addition is therefore carried out on std::uint64_t, where wrap-around is
+// well defined, instead of overflowing a signed integer (undefined behaviour).
+// The shift stays on the signed value so it remains an arithmetic shift.
+auto do_work(std::int64_t n) -> std::int64_t {
+  std::int64_t acc = 0;
+  for (std::int64_t i = 0; i < n; ++i) {
+    auto const term = static_cast<std::uint64_t>(i ^ (acc >> 1));
+    acc = static_cast<std::int64_t>(static_cast<std::uint64_t>(acc) + term);
+  }
+  return acc;
+}
+
+// Generic awaitable that posts the task's continuation to an arbitrary pool
+// whose context_type matches the current task's Context.
+// It always suspends, and the only action on suspension is target->post(h).
+template <typename Sch>
+struct switch_to {
+
+  using context_type = Sch::context_type;
+
+  // Pool that receives the suspended task; not owned by the awaitable.
+  Sch *target;
+
+  auto await_ready() noexcept -> bool { return false; }
+
+  auto await_suspend(lf::sched_handle<context_type> h, context_type & /*context*/) -> void {
+    target->post(h);
+  }
+
+  auto await_resume() noexcept -> void {}
+};
+
+// One "request": CPU work, hop to IO pool, IO work, hop back, more CPU work.
+// The returned value is the sum of the three do_work results.
+template <lf::scheduler Sch>
+struct request_with_io {
+
+  using context_type = Sch::context_type;
+
+  static auto operator()(Sch *compute_pool, Sch *io_pool) -> lf::task<std::int64_t, context_type> {
+
+    std::int64_t acc = do_work(k_compute_units / 2);
+
+    // Continue on the IO pool for the middle section.
+    co_await switch_to<Sch>{io_pool};
+
+    acc += do_work(k_io_units);
+
+    // Hop back to the compute pool for the remaining CPU work.
+    co_await switch_to<Sch>{compute_pool};
+
+    acc += do_work(k_compute_units / 2);
+
+    co_return acc;
+  }
+};
+
+// Baseline: same total work but no pool hops.
+// Performs the same three do_work calls as request_with_io, in the same order.
+template <lf::scheduler Sch>
+struct request_baseline {
+
+  using context_type = Sch::context_type;
+
+  static auto operator()() -> lf::task<std::int64_t, context_type> {
+    std::int64_t acc = do_work(k_compute_units / 2);
+    acc += do_work(k_io_units);
+    acc += do_work(k_compute_units / 2);
+    co_return acc;
+  }
+};
+
+// Fan-out: fork M request_with_io tasks and sum the results.
+// Each task writes into its own slot of `results`; the slots are only read
+// after the join.
+// NOTE(review): the per-request values come from do_work and can be very
+// large, so the signed sum below may overflow -- confirm that wrap-around is
+// acceptable here.
+template <lf::scheduler Sch>
+struct fan_out_with_io {
+
+  using context_type = Sch::context_type;
+
+  static auto
+  operator()(std::int64_t m, Sch *compute_pool, Sch *io_pool) -> lf::task<std::int64_t, context_type> {
+
+    std::vector<std::int64_t> results(static_cast<std::size_t>(m), 0);
+
+    auto sc = co_await lf::scope();
+
+    // TODO: use for_each algorithm
+
+    for (std::int64_t i = 0; i < m; ++i) {
+      co_await sc.fork(&results[static_cast<std::size_t>(i)], request_with_io<Sch>{}, compute_pool, io_pool);
+    }
+
+    co_await sc.join();
+
+    std::int64_t total = 0;
+    for (auto v : results) {
+      total += v;
+    }
+    co_return total;
+  }
+};
+
+// Fan-out: fork M request_baseline tasks and sum.
+// Mirrors fan_out_with_io without the pool arguments: each task writes into
+// its own slot of `results`, which is summed after the join.
+template <lf::scheduler Sch>
+struct fan_out_baseline {
+
+  using context_type = Sch::context_type;
+
+  static auto operator()(std::int64_t m) -> lf::task<std::int64_t, context_type> {
+    std::vector<std::int64_t> results(static_cast<std::size_t>(m), 0);
+
+    auto sc = co_await lf::scope();
+
+    for (std::int64_t i = 0; i < m; ++i) {
+      co_await sc.fork(&results[static_cast<std::size_t>(i)], request_baseline<Sch>{});
+    }
+
+    co_await sc.join();
+
+    std::int64_t total = 0;
+    for (auto v : results) {
+      total += v;
+    }
+    co_return total;
+  }
+};
+
+// Value a single request must produce: the same three do_work contributions
+// that request_with_io / request_baseline add up. Evaluated by the run_*
+// functions before the benchmark loop starts.
+auto expected_per_request() -> std::int64_t {
+  std::int64_t const compute_half = do_work(k_compute_units / 2);
+  std::int64_t const io_part = do_work(k_io_units);
+  return compute_half + io_part + compute_half;
+}
+
+// Benchmark entry point for the IO-hopping variant.
+//
+// Reads the request count m from state.range(0), builds a compute pool from
+// the benchmark state and a separate IO pool with k_io_workers() workers, and
+// benchmarks fan_out_with_io scheduled on the compute pool. The expected
+// result is m times the per-request value.
+template <lf::scheduler Sch>
+void run_with_io(benchmark::State &state) {
+  std::int64_t m = state.range(0);
+
+  state.counters["requests"] = static_cast<double>(m);
+  state.counters["compute_threads"] = static_cast<double>(thread_count<Sch>(state));
+  state.counters["io_threads"] = static_cast<double>(k_io_workers());
+
+  std::int64_t expect = m * expected_per_request();
+
+  Sch compute_pool = make_scheduler<Sch>(state);
+  Sch io_pool{static_cast<std::size_t>(k_io_workers())};
+
+  lf_bench::bench(state, static_cast<std::int64_t>(thread_count<Sch>(state)), expect, [&]() -> std::int64_t {
+    return lf::schedule(compute_pool, fan_out_with_io<Sch>{}, m, &compute_pool, &io_pool).get();
+  });
+}
+
+// Benchmark entry point for the no-hop baseline: same request count and
+// expected value as run_with_io, but only a compute pool is created and
+// fan_out_baseline is scheduled on it.
+template <lf::scheduler Sch>
+void run_baseline(benchmark::State &state) {
+  std::int64_t m = state.range(0);
+
+  state.counters["requests"] = static_cast<double>(m);
+  state.counters["compute_threads"] = static_cast<double>(thread_count<Sch>(state));
+
+  std::int64_t expect = m * expected_per_request();
+
+  Sch compute_pool = make_scheduler<Sch>(state);
+
+  lf_bench::bench(state, static_cast<std::int64_t>(thread_count<Sch>(state)), expect, [&]() -> std::int64_t {
+    return lf::schedule(compute_pool, fan_out_baseline<Sch>{}, m).get();
+  });
+}
+
+} // namespace
+
+// prefix = requests  →  macro uses requests_test / requests_base
+// Each pool flavour is registered with both the IO-hopping variant and its
+// no-hop baseline.
+LIBFORK_BENCH_ALL_MT(run_with_io, request_io, requests, mono_busy_pool)
+LIBFORK_BENCH_ALL_MT(run_baseline, request_baseline, requests, mono_busy_pool)
+
+LIBFORK_BENCH_ALL_MT(run_with_io, request_io, requests, poly_busy_pool)
+LIBFORK_BENCH_ALL_MT(run_baseline, request_baseline, requests, poly_busy_pool)
diff --git a/benchmark/src/libfork/switch_random.cpp b/benchmark/src/libfork/switch_random.cpp
new file mode 100644
index 000000000..ecd5d42b9
--- /dev/null
+++ b/benchmark/src/libfork/switch_random.cpp
@@ -0,0 +1,116 @@
+#include <benchmark/benchmark.h>
+
+#include "fib.hpp"
+
+#include "helpers.hpp"
+
+import std;
+
+import libfork;
+
+// === Awaitable and helpers
+
+namespace {
+
+template <lf::scheduler Sch>
+struct switch_to_other {
+  // Awaitable that reports not-ready and posts the awaiting task's handle to `target`.
+  Sch *target;
+
+  using context_type = Sch::context_type;
+  // Always false, so awaiting this goes through await_suspend.
+  auto await_ready() noexcept -> bool { return false; }
+  // Hands the handle to the target scheduler; the context argument is unused.
+  void await_suspend(lf::sched_handle<context_type> h, context_type &) { target->post(h); }
+  // Produces no value on resumption.
+  auto await_resume() noexcept -> void {}
+};
+
+template <lf::scheduler Sch>
+struct pool_pair {
+  Sch *pools[2]; // non-owning; indexed with 0 / 1 by random_switch_fib
+};
+
+// SplitMix64: next() advances `state` in place and returns a new rng seeded with the mixed output.
+struct rng {
+  // Running counter, bumped by a fixed odd constant on every next().
+  std::uint64_t state;
+
+  auto next() -> rng {
+    state += 0x9e3779b97f4a7c15ULL;
+    std::uint64_t mixed = state ^ (state >> 30ULL);
+    mixed *= 0xbf58476d1ce4e5b9ULL;
+    mixed = (mixed ^ (mixed >> 27ULL)) * 0x94d049bb133111ebULL;
+    return rng{.state = mixed ^ (mixed >> 31ULL)};
+  }
+};
+
+// ~10% switch probability: a task switches pools when the low byte of its seed is below this (25 / 256 ≈ 0.098)
+inline constexpr std::uint64_t k_switch_threshold = 25;
+
+template <lf::scheduler Sch>
+struct random_switch_fib {
+  // Recursive Fibonacci whose tasks occasionally hop between the two pools of a pool_pair.
+  using context_type = Sch::context_type;
+
+  using task = lf::task<std::int64_t, context_type>;
+  // n: Fibonacci index; pp: the two pools; seed: per-task RNG state; current: index into pp->pools.
+  static auto operator()(std::int64_t n, pool_pair<Sch> *pp, rng seed, unsigned current) -> task {
+
+    if (n < 2) {
+      co_return n;
+    }
+    // When the seed's low byte is below k_switch_threshold, flip `current` and move to that pool.
+    if ((seed.state & 0xffULL) < k_switch_threshold) {
+      current = 1U - current;
+      co_await switch_to_other<Sch>{pp->pools[current]};
+    }
+
+    std::int64_t lhs = 0;
+    std::int64_t rhs = 0;
+
+    auto sc = co_await lf::scope();
+    // seed.next() advances `seed` on each call, so the two children receive different seeds.
+    co_await sc.fork(&rhs, random_switch_fib{}, n - 2, pp, seed.next(), current);
+    co_await sc.call(&lhs, random_switch_fib{}, n - 1, pp, seed.next(), current);
+
+    co_await sc.join();
+
+    co_return lhs + rhs;
+  }
+};
+
+template <lf::scheduler Sch>
+void run(benchmark::State &state) {
+  // range(0) is the Fibonacci index; range(1) is the total worker count split across two pools.
+  std::int64_t n = state.range(0);
+  std::int64_t expect = fib_ref(n);
+
+  auto threads_total = static_cast<std::size_t>(state.range(1));
+  // Both pools need at least one worker, so fewer than two workers in total is skipped.
+  if (threads_total < 2) {
+    state.SkipWithMessage("switch_random requires at least 2 total workers");
+    return;
+  }
+  // pool_a receives the extra worker when the total is odd.
+  auto threads_a = (threads_total + 1) / 2;
+  auto threads_b = threads_total - threads_a;
+
+  state.counters["n"] = static_cast<double>(n);
+  state.counters["p_a"] = static_cast<double>(threads_a);
+  state.counters["p_b"] = static_cast<double>(threads_b);
+
+  Sch pool_a{threads_a};
+  Sch pool_b{threads_b};
+
+  pool_pair<Sch> pp{&pool_a, &pool_b};
+  // The root task starts on pool_a (index 0) with a fixed seed of 1.
+  lf_bench::bench(state, static_cast<std::int64_t>(threads_total), expect, [&]() -> std::int64_t {
+    return lf::schedule(pool_a, random_switch_fib<Sch>{}, n, &pp, rng{1}, 0U).get();
+  });
+}
+
+} // namespace
+
+LIBFORK_BENCH_ALL_MT(run, switch_random, fib, mono_busy_pool)
+LIBFORK_BENCH_ALL_MT(run, switch_random, fib, poly_busy_pool)
diff --git a/benchmark/src/libfork/uts.cpp b/benchmark/src/libfork/uts.cpp
new file mode 100644
index 000000000..2a6e1fdfe
--- /dev/null
+++ b/benchmark/src/libfork/uts.cpp
@@ -0,0 +1,78 @@
+#include <benchmark/benchmark.h>
+
+#include "uts.hpp"
+
+#include "helpers.hpp"
+
+import std;
+
+import libfork;
+
+// === Coroutine
+
+namespace {
+
+// TODO: try a version that uses try_fork
+
+struct uts_fn {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>, int depth, Node *parent) -> lf::task<result, Context> {
+    // Counts the subtree rooted at `parent`: every child but the last is forked, the last is called.
+    result r{.maxdepth = static_cast<counter_t>(depth), .size = counter_t{1}, .leaves = counter_t{0}};
+
+    int num_children = uts_numChildren(parent);
+    int child_type = uts_childType(parent);
+
+    parent->numChildren = num_children;
+
+    if (num_children > 0) {
+      std::vector<pair> cs(static_cast<std::size_t>(num_children));
+      // Each slot of cs holds one child node together with that child's result.
+      auto sc = co_await lf::scope();
+
+      for (std::size_t i = 0; i < static_cast<std::size_t>(num_children); ++i) {
+        cs[i].child.type = child_type;
+        cs[i].child.height = parent->height + 1;
+        cs[i].child.numChildren = -1;
+        // rng_spawn is invoked computeGranularity times with the same arguments for child i.
+        for (int j = 0; j < computeGranularity; ++j) {
+          rng_spawn(parent->state.state, cs[i].child.state.state, static_cast<int>(i));
+        }
+        // The last child is called rather than forked.
+        if (i + 1 == static_cast<std::size_t>(num_children)) {
+          co_await sc.call(&cs[i].res, uts_fn{}, depth + 1, &cs[i].child);
+        } else {
+          co_await sc.fork(&cs[i].res, uts_fn{}, depth + 1, &cs[i].child);
+        }
+      }
+
+      co_await sc.join();
+      // Fold the children's results into this node's result.
+      for (auto &&elem : cs) {
+        r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth);
+        r.size += elem.res.size;
+        r.leaves += elem.res.leaves;
+      }
+    } else {
+      r.leaves = 1;
+    }
+
+    co_return r;
+  }
+};
+
+template <lf::scheduler Sch>
+void run(benchmark::State &state, uts_tree tree) {
+  // range(0) is the worker count handed to the scheduler and to run_uts.
+  auto threads = static_cast<std::size_t>(state.range(0));
+  Sch scheduler = Sch{threads};
+
+  run_uts(state, tree, static_cast<std::int64_t>(threads), [&](Node *root) -> result {
+    return lf::schedule(scheduler, uts_fn{}, 0, root).get();
+  });
+}
+
+} // namespace
+
+LIBFORK_UTS_BENCH_ALL_MT(run, mono_busy_pool)
+LIBFORK_UTS_BENCH_ALL_MT(run, poly_busy_pool)
diff --git a/benchmark/src/openmp/CMakeLists.txt b/benchmark/src/openmp/CMakeLists.txt
new file mode 100644
index 000000000..035d5326f
--- /dev/null
+++ b/benchmark/src/openmp/CMakeLists.txt
@@ -0,0 +1,23 @@
+add_library(openmp_benchmarks)
+
+# OpenMP compiles with -fopenmp which conflicts with the shared std.pcm (built
+# without OpenMP). Disable module scanning so CMake doesn't inject the
+# incompatible modmap for this target.
+set_target_properties(openmp_benchmarks PROPERTIES CXX_SCAN_FOR_MODULES OFF)
+
+# TODO: remove this hack when we have LLVM 23 (presumably both the scanning opt-out above and the define below)
+
+# Signal to shared benchmark headers that this target cannot `import std;`
+# and must use textual standard headers instead.
+target_compile_definitions(openmp_benchmarks PRIVATE LF_BENCH_NO_IMPORT_STD)
+
+target_sources(openmp_benchmarks
+  PRIVATE
+    fib.cpp uts.cpp
+)
+# PUBLIC: targets that link openmp_benchmarks also link these two.
+target_link_libraries(openmp_benchmarks
+  PUBLIC
+    benchmark_common
+    OpenMP::OpenMP_CXX
+)
diff --git a/benchmark/src/openmp/fib.cpp b/benchmark/src/openmp/fib.cpp
new file mode 100644
index 000000000..54afd0f3c
--- /dev/null
+++ b/benchmark/src/openmp/fib.cpp
@@ -0,0 +1,47 @@
+#include <cstdint>
+#include <format>
+
+#include <benchmark/benchmark.h>
+
+#include "fib.hpp"
+#include "macros.hpp"
+
+namespace {
+
+auto fib(std::int64_t n) -> std::int64_t {
+  if (n < 2) {
+    return n;
+  }
+  // fib(n - 2) is spawned as an OpenMP task; fib(n - 1) runs inline in the current task.
+  std::int64_t lhs = 0;
+  std::int64_t rhs = 0;
+  // lhs is shared so the child task can write its result; n is copied into the task.
+#pragma omp task untied shared(lhs) firstprivate(n) default(none)
+  lhs = fib(n - 2);
+
+  rhs = fib(n - 1);
+  // Wait for the child task before lhs is read.
+#pragma omp taskwait
+  return lhs + rhs;
+}
+
+template <typename = void>
+void fib_run(benchmark::State &state) {
+  int threads = static_cast<int>(state.range(1));
+  // range(1) is the thread count; it sizes the parallel region and is also passed to run_fib.
+  run_fib(state, threads, [threads](std::int64_t n) -> std::int64_t {
+    std::int64_t return_value = 0;
+    // One thread of the team starts the recursion inside `single`; the tasks it spawns can run on the team.
+#pragma omp parallel num_threads(threads) default(shared)
+#pragma omp single nowait
+    {
+      return_value = fib(n);
+    }
+
+    return return_value;
+  });
+}
+
+} // namespace
+
+BENCH_ALL_MT(fib_run, openmp, fib, fib)
diff --git a/benchmark/src/openmp/uts.cpp b/benchmark/src/openmp/uts.cpp
new file mode 100644
index 000000000..e574b72f4
--- /dev/null
+++ b/benchmark/src/openmp/uts.cpp
@@ -0,0 +1,73 @@
+#include <algorithm>
+#include <cstddef>
+#include <format>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "uts.hpp"
+
+namespace {
+
+auto uts(int depth, Node *parent) -> result {
+  result r{.maxdepth = static_cast<counter_t>(depth), .size = counter_t{1}, .leaves = counter_t{0}};
+  // Counts the subtree under `parent`; every child but the last becomes an OpenMP task.
+  int num_children = uts_numChildren(parent);
+  int child_type = uts_childType(parent);
+
+  parent->numChildren = num_children;
+
+  if (num_children > 0) {
+    std::vector<pair> cs(static_cast<std::size_t>(num_children));
+    // Each slot of cs holds one child node together with that child's result.
+    for (std::size_t i = 0; i < static_cast<std::size_t>(num_children); ++i) {
+      cs[i].child.type = child_type;
+      cs[i].child.height = parent->height + 1;
+      cs[i].child.numChildren = -1;
+      // rng_spawn is invoked computeGranularity times with the same arguments for child i.
+      for (int j = 0; j < computeGranularity; ++j) {
+        rng_spawn(parent->state.state, cs[i].child.state.state, static_cast<int>(i));
+      }
+      // The last child runs inline; earlier children are spawned as tasks writing into their own slot.
+      if (i + 1 == static_cast<std::size_t>(num_children)) {
+        cs[i].res = uts(depth + 1, &cs[i].child);
+      } else {
+#pragma omp task untied shared(cs) firstprivate(depth, i) default(none)
+        cs[i].res = uts(depth + 1, &cs[i].child);
+      }
+    }
+    // All child tasks must finish before their results are read.
+#pragma omp taskwait
+
+    for (auto &&elem : cs) {
+      r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth);
+      r.size += elem.res.size;
+      r.leaves += elem.res.leaves;
+    }
+  } else {
+    r.leaves = 1;
+  }
+
+  return r;
+}
+
+void uts_run(benchmark::State &state, uts_tree tree) {
+  int threads = static_cast<int>(state.range(0));
+  // range(0) is the thread count; it sizes the parallel region and is also passed to run_uts.
+  run_uts(state, tree, threads, [threads](Node *root) -> result {
+    result r;
+    // One thread of the team starts the traversal inside `single`; the tasks it spawns can run on the team.
+#pragma omp parallel num_threads(threads) default(shared)
+#pragma omp single nowait
+    {
+      r = uts(0, root);
+    }
+
+    return r;
+  });
+}
+
+} // namespace
+
+UTS_BENCH_ALL_MT(uts_run, openmp)
diff --git a/benchmark/src/serial/CMakeLists.txt b/benchmark/src/serial/CMakeLists.txt
new file mode 100644
index 000000000..1d215bd92
--- /dev/null
+++ b/benchmark/src/serial/CMakeLists.txt
@@ -0,0 +1,20 @@
+add_library(serial_benchmarks)
+# Sources of the serial benchmarks, listed alphabetically.
+target_sources(serial_benchmarks PRIVATE
+  fib.cpp
+  fold.cpp
+  heat.cpp
+  integrate.cpp
+  knapsack.cpp
+  mandelbrot.cpp
+  matmul.cpp
+  nqueens.cpp
+  primes.cpp
+  quicksort.cpp
+  scan.cpp
+  skynet.cpp
+  strassen.cpp
+  uts.cpp
+)
+# PUBLIC: targets that link serial_benchmarks also link benchmark_common.
+target_link_libraries(serial_benchmarks PUBLIC benchmark_common)
diff --git a/benchmark/src/serial/fib.cpp b/benchmark/src/serial/fib.cpp
new file mode 100644
index 000000000..a13cf5f47
--- /dev/null
+++ b/benchmark/src/serial/fib.cpp
@@ -0,0 +1,53 @@
+#include <benchmark/benchmark.h>
+
+#include "fib.hpp"
+#include "macros.hpp"
+
+import std;
+
+namespace {
+
+auto fib_impl(std::int64_t &ret, std::int64_t n) -> void {
+  // Recursive Fibonacci that delivers its result through `ret` instead of a return value.
+  if (n >= 2) {
+    std::int64_t two_back = 0;
+    std::int64_t one_back = 0;
+    // n - 2 is evaluated first, then n - 1.
+    fib_impl(two_back, n - 2);
+    fib_impl(one_back, n - 1);
+
+    ret = two_back + one_back;
+    return;
+  }
+  ret = n; // n < 2 is passed through unchanged
+}
+
+template <typename = void>
+void fib_serial(benchmark::State &state) {
+  run_fib(state, [](std::int64_t n) -> std::int64_t { // adapts the out-parameter fib_impl to a value-returning callable
+    std::int64_t result = 0;
+    fib_impl(result, n);
+    return result;
+  });
+}
+
+auto fib_ret_impl(std::int64_t n) -> std::int64_t {
+  // Plain recursive Fibonacci returning by value; n - 1 is evaluated before n - 2.
+  if (n >= 2) {
+    std::int64_t const prev = fib_ret_impl(n - 1);
+    std::int64_t const prev_prev = fib_ret_impl(n - 2);
+    return prev + prev_prev;
+  }
+
+  return n;
+}
+
+template <typename = void>
+void fib_serial_return(benchmark::State &state) {
+  run_fib(state, fib_ret_impl); // the value-returning variant is passed to run_fib directly
+}
+
+} // namespace
+
+BENCH_ALL(fib_serial, serial, fib, fib)
+BENCH_ALL(fib_serial_return, serial, fib / return, fib)
diff --git a/benchmark/src/serial/fold.cpp b/benchmark/src/serial/fold.cpp
new file mode 100644
index 000000000..f7c7f58ec
--- /dev/null
+++ b/benchmark/src/serial/fold.cpp
@@ -0,0 +1,27 @@
+#include <benchmark/benchmark.h>
+
+#include "fold.hpp"
+
+import std;
+
+namespace {
+
+template <fold_data_mode Data, typename T>
+void fold_reduce(benchmark::State &state) {
+  run_fold_input<Data, T>(state, [](auto &&values) -> fold_accum_t<T> { // sums `values` with std::reduce
+    return std::reduce(
+        std::ranges::begin(values), std::ranges::end(values), fold_accum_t<T>{}, [](auto a, auto b) static {
+          return fold_accum_t<T>(a) + fold_accum_t<T>(b); // both operands are converted to the accumulator type first
+        });
+  });
+}
+
+} // namespace
+
+#define LF_REGISTER_FOLD_REDUCE(data, dtype)                                                                 \
+  LF_FOLD_BENCH_SIZES(fold_reduce, serial, fold / std_reduce, data, dtype) // forwards fold_reduce plus (data, dtype)
+
+LF_REGISTER_FOLD_REDUCE(memory, int32)
+LF_REGISTER_FOLD_REDUCE(memory, float32)
+LF_REGISTER_FOLD_REDUCE(lazy, int32)
+LF_REGISTER_FOLD_REDUCE(lazy, float32)
diff --git a/benchmark/src/serial/heat.cpp b/benchmark/src/serial/heat.cpp
new file mode 100644
index 000000000..1a6f2e4af
--- /dev/null
+++ b/benchmark/src/serial/heat.cpp
@@ -0,0 +1,24 @@
+#include <benchmark/benchmark.h>
+
+#include "heat.hpp"
+#include "macros.hpp"
+
+import std;
+
+namespace {
+
+void heat_run(double *a, double *b, std::size_t n, std::size_t iters) {
+  for (std::size_t left = iters; left != 0; --left) { // one heat_jacobi_step per pass
+    heat_jacobi_step(a, b, n);
+    std::swap(b, a); // the two buffers trade roles for the next step
+  }
+}
+
+template <typename = void>
+void heat_serial(benchmark::State &state) {
+  run_heat(state, heat_run); // heat_run is the serial stepping loop handed to run_heat
+}
+
+} // namespace
+
+BENCH_ALL(heat_serial, serial, heat, heat)
diff --git a/benchmark/src/serial/integrate.cpp b/benchmark/src/serial/integrate.cpp
new file mode 100644
index 000000000..31e4bf4dc
--- /dev/null
+++ b/benchmark/src/serial/integrate.cpp
@@ -0,0 +1,39 @@
+#include <benchmark/benchmark.h>
+
+#include "integrate.hpp"
+#include "macros.hpp"
+
+import std;
+
+namespace {
+
+auto integrate_recurse(double x1, double y1, double x2, double y2, double area) -> double {
+  // Split [x1, x2] at its midpoint and sample the integrand there.
+  double half = (x2 - x1) / 2;
+  double x0 = x1 + half;
+  double y0 = integrate_fn(x0);
+  // Trapezoid areas of the two halves, and their sum as the refined estimate.
+  double left = (y1 + y0) / 2 * half;
+  double right = (y0 + y2) / 2 * half;
+  double refined = left + right;
+  // Converged once the refined estimate is within integrate_epsilon of the caller's estimate `area`.
+  bool const converged = refined - area < integrate_epsilon && area - refined < integrate_epsilon;
+  if (!converged) {
+    // Refine each half on its own, passing down that half's current estimate.
+    left = integrate_recurse(x1, y1, x0, y0, left);
+    right = integrate_recurse(x0, y0, x2, y2, right);
+    return left + right;
+  }
+  return refined;
+}
+
+template <typename = void>
+void integrate_serial(benchmark::State &state) {
+  run_integrate(state, [](double upper) { // integrates over [0, upper], starting from an area estimate of 0
+    return integrate_recurse(0, integrate_fn(0), upper, integrate_fn(upper), 0);
+  });
+}
+
+} // namespace
+
+BENCH_ALL(integrate_serial, serial, integrate, integrate)
diff --git a/benchmark/src/serial/knapsack.cpp b/benchmark/src/serial/knapsack.cpp
new file mode 100644
index 000000000..3e9ed8f14
--- /dev/null
+++ b/benchmark/src/serial/knapsack.cpp
@@ -0,0 +1,65 @@
+#include <benchmark/benchmark.h>
+
+#include "knapsack.hpp"
+#include "macros.hpp"
+
+import std;
+
+namespace {
+
+// Optimistic (linear-relaxation) value estimate: items from `idx` on are taken whole, in order,
+// while they fit; the first one that does not fit contributes a pro-rated share of its value.
+auto upper_bound(std::vector<knapsack_item> const &items,
+                 std::size_t idx,
+                 int remaining_cap,
+                 int current_value) -> double {
+  double estimate = current_value;
+  int room = remaining_cap;
+  for (std::size_t pos = idx; pos < items.size(); ++pos) {
+    auto const &item = items[pos];
+    if (item.weight > room) {
+      // Fractional piece of the first item that no longer fits, then stop.
+      return estimate + static_cast<double>(item.value) * room / item.weight;
+    }
+    room -= item.weight;
+    estimate += item.value;
+  }
+  return estimate;
+}
+
+void knapsack_bb(std::vector<knapsack_item> const &items,
+                 std::size_t idx,
+                 int remaining_cap,
+                 int current_value,
+                 int &best) {
+  // Branch-and-bound over include/exclude decisions for items[idx..]; `best` is the incumbent.
+  best = std::max(best, current_value);
+
+  // Stop at a leaf, or when even the optimistic bound cannot beat the incumbent.
+  bool const exhausted = idx == items.size();
+  if (exhausted || upper_bound(items, idx, remaining_cap, current_value) <= best) {
+    return;
+  }
+
+  auto const &item = items[idx];
+  std::size_t const next = idx + 1;
+
+  // Include branch first (only when the item fits), then the exclude branch.
+  if (item.weight <= remaining_cap) {
+    knapsack_bb(items, next, remaining_cap - item.weight, current_value + item.value, best);
+  }
+  knapsack_bb(items, next, remaining_cap, current_value, best);
+}
+
+template <typename = void>
+void knapsack_serial(benchmark::State &state) {
+  run_knapsack(state, [](knapsack_problem const &problem) {
+    int best = 0; // incumbent starts at zero value
+    knapsack_bb(problem.items, 0, problem.capacity, 0, best); // search from item 0 with the full capacity
+    return best;
+  });
+}
+
+} // namespace
+
+BENCH_ALL(knapsack_serial, serial, knapsack, knapsack)
diff --git a/benchmark/src/serial/mandelbrot.cpp b/benchmark/src/serial/mandelbrot.cpp
new file mode 100644
index 000000000..518d547cf
--- /dev/null
+++ b/benchmark/src/serial/mandelbrot.cpp
@@ -0,0 +1,17 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "mandelbrot.hpp"
+
+import std;
+
+namespace {
+
+template <typename = void>
+void mandelbrot_serial(benchmark::State &state) {
+  run_mandelbrot(state, mandelbrot_checksum); // mandelbrot_checksum is passed through as the kernel
+}
+
+} // namespace
+
+BENCH_ALL(mandelbrot_serial, serial, mandelbrot, mandelbrot)
diff --git a/benchmark/src/serial/matmul.cpp b/benchmark/src/serial/matmul.cpp
new file mode 100644
index 000000000..d9753c5c0
--- /dev/null
+++ b/benchmark/src/serial/matmul.cpp
@@ -0,0 +1,44 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "matmul.hpp"
+
+import std;
+
+namespace {
+
+template <bool Add>
+void matmul_dc(float const *A, float const *B, float *R, unsigned n, unsigned s) {
+  if (n <= matmul_basecase) {
+    matmul_basecase_multiply<Add>(A, B, R, n, s);
+    return;
+  }
+  // Split each n x n matrix (row stride s) into four m x m quadrants.
+  unsigned m = n / 2;
+  // Element offsets of the quadrants, named o<row><col>.
+  unsigned o00 = 0;
+  unsigned o01 = m;
+  unsigned o10 = m * s;
+  unsigned o11 = m * s + m;
+  // First product for each quadrant of R, using the caller's Add mode.
+  matmul_dc<Add>(A + o00, B + o00, R + o00, m, s);
+  matmul_dc<Add>(A + o00, B + o01, R + o01, m, s);
+  matmul_dc<Add>(A + o10, B + o00, R + o10, m, s);
+  matmul_dc<Add>(A + o10, B + o01, R + o11, m, s);
+  // Second product for each quadrant of R, always with Add = true.
+  matmul_dc<true>(A + o01, B + o10, R + o00, m, s);
+  matmul_dc<true>(A + o01, B + o11, R + o01, m, s);
+  matmul_dc<true>(A + o11, B + o10, R + o10, m, s);
+  matmul_dc<true>(A + o11, B + o11, R + o11, m, s);
+}
+
+template <typename = void>
+void matmul_serial(benchmark::State &state) {
+  run_matmul(state, 1e-5f, [](float const *A, float const *B, float *C, unsigned n) {
+    matmul_dc<false>(A, B, C, n, n); // top-level call: Add = false, row stride equal to n
+  });
+}
+
+} // namespace
+
+BENCH_ALL(matmul_serial, serial, matmul, matmul)
diff --git a/benchmark/src/serial/nqueens.cpp b/benchmark/src/serial/nqueens.cpp
new file mode 100644
index 000000000..9d8bb715d
--- /dev/null
+++ b/benchmark/src/serial/nqueens.cpp
@@ -0,0 +1,36 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "nqueens.hpp"
+
+import std;
+
+namespace {
+
+auto nqueens(int j, int n, char *a) -> std::int64_t {
+  if (j == n) {
+    return 1; // all n entries placed: one complete solution
+  }
+  // Try every value for a[j]; recurse only when queens_ok accepts the first j + 1 entries.
+  std::int64_t solutions = 0;
+  for (int candidate = 0; candidate < n; ++candidate) {
+    a[j] = static_cast<char>(candidate);
+    if (!queens_ok(j + 1, a)) {
+      continue;
+    }
+    solutions += nqueens(j + 1, n, a);
+  }
+
+  return solutions;
+}
+
+template <typename = void>
+void nqueens_serial(benchmark::State &state) {
+  run_nqueens(state, [](int n, char *board) {
+    return nqueens(0, n, board); // start with no entries placed
+  });
+}
+
+} // namespace
+
+BENCH_ALL(nqueens_serial, serial, nqueens, nqueens)
diff --git a/benchmark/src/serial/primes.cpp b/benchmark/src/serial/primes.cpp
new file mode 100644
index 000000000..cb0f05005
--- /dev/null
+++ b/benchmark/src/serial/primes.cpp
@@ -0,0 +1,23 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "primes.hpp"
+
+import std;
+
+namespace {
+
+template <typename = void>
+void primes_serial(benchmark::State &state) {
+  run_primes(state, [](std::int64_t lim) { // counts the i in [2, lim) for which is_prime(i) holds
+    std::int64_t count = 0;
+    for (std::int64_t i = 2; i < lim; ++i) {
+      count += is_prime(i) ? 1 : 0;
+    }
+    return count;
+  });
+}
+
+} // namespace
+
+BENCH_ALL(primes_serial, serial, primes, primes)
diff --git a/benchmark/src/serial/quicksort.cpp b/benchmark/src/serial/quicksort.cpp
new file mode 100644
index 000000000..58fa023cc
--- /dev/null
+++ b/benchmark/src/serial/quicksort.cpp
@@ -0,0 +1,54 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "quicksort.hpp"
+
+import std;
+
+namespace {
+
+// Sorts [first, last) ascending by inserting each element into the sorted prefix before it.
+void insertion_sort(std::uint32_t *first, std::uint32_t *last) {
+  for (std::uint32_t *cur = first + 1; cur < last; ++cur) {
+    std::uint32_t const key = *cur;
+    std::uint32_t *hole = cur;
+    for (; hole > first && hole[-1] > key; --hole) {
+      *hole = hole[-1]; // shift larger elements one slot to the right
+    }
+    *hole = key;
+  }
+}
+
+// Partitions [first, last) around its middle element and returns the pivot's final position.
+auto partition(std::uint32_t *first, std::uint32_t *last) -> std::uint32_t * {
+  std::uint32_t *const back = last - 1;
+  std::uint32_t *const middle = first + (last - first) / 2;
+  std::uint32_t const pivot = *middle;
+  std::swap(*middle, *back); // park the pivot at the end while scanning
+  std::uint32_t *boundary = first; // everything left of `boundary` is < pivot
+  for (std::uint32_t *scan = first; scan < back; ++scan) {
+    if (*scan < pivot) {
+      std::swap(*scan, *boundary++);
+    }
+  }
+  std::swap(*boundary, *back);
+  return boundary;
+}
+
+void quicksort(std::uint32_t *first, std::uint32_t *last) {
+  auto const cutoff = static_cast<std::ptrdiff_t>(quicksort_basecase);
+  while (cutoff < last - first) {
+    std::uint32_t *const split = partition(first, last);
+    quicksort(split + 1, std::exchange(last, split)); // recurse right of the pivot, keep looping on the left
+  }
+  insertion_sort(first, last); // ranges no longer than the cutoff are finished here
+}
+
+template <typename = void>
+void quicksort_serial(benchmark::State &state) {
+  run_quicksort(state, quicksort); // the serial quicksort is passed through as the sorter
+}
+
+} // namespace
+
+BENCH_ALL(quicksort_serial, serial, quicksort, quicksort)
diff --git a/benchmark/src/serial/scan.cpp b/benchmark/src/serial/scan.cpp
new file mode 100644
index 000000000..298cab7d1
--- /dev/null
+++ b/benchmark/src/serial/scan.cpp
@@ -0,0 +1,21 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "scan.hpp"
+
+import std;
+
+namespace {
+
+template <typename = void>
+void scan_serial(benchmark::State &state) {
+  run_scan(state, [](std::vector<unsigned> const &in, std::vector<unsigned> &out, std::size_t reps) {
+    for (std::size_t i = 0; i < reps; ++i) { // the same scan is repeated `reps` times, overwriting `out` each time
+      std::inclusive_scan(in.begin(), in.end(), out.begin(), std::plus<>{});
+    }
+  });
+}
+
+} // namespace
+
+BENCH_ALL(scan_serial, serial, scan, scan)
diff --git a/benchmark/src/serial/skynet.cpp b/benchmark/src/serial/skynet.cpp
new file mode 100644
index 000000000..37079b444
--- /dev/null
+++ b/benchmark/src/serial/skynet.cpp
@@ -0,0 +1,30 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "skynet.hpp"
+
+import std;
+
+namespace {
+
+auto skynet_recurse(std::int64_t num, int depth) -> std::int64_t {
+  if (depth == 0) {
+    return num; // depth 0 returns its own number
+  }
+  // Child i starts at num + i * stride, where stride is skynet_leaves(depth - 1).
+  std::int64_t const stride = skynet_leaves(depth - 1);
+  std::int64_t total = 0;
+  for (int child = 0; child < skynet_branching; ++child) {
+    total += skynet_recurse(num + child * stride, depth - 1);
+  }
+  return total;
+}
+
+template <typename = void>
+void skynet_serial(benchmark::State &state) {
+  run_skynet(state, skynet_recurse); // the serial recursion is passed through as the kernel
+}
+
+} // namespace
+
+BENCH_ALL(skynet_serial, serial, skynet, skynet)
diff --git a/benchmark/src/serial/strassen.cpp b/benchmark/src/serial/strassen.cpp
new file mode 100644
index 000000000..c5918e7af
--- /dev/null
+++ b/benchmark/src/serial/strassen.cpp
@@ -0,0 +1,131 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "matmul.hpp"
+
+import std;
+
+namespace {
+
+inline constexpr unsigned strassen_cutoff = 64; // sizes at or below this are handled by naive_multiply
+
+// Element-wise sum Out = A + B of m x m blocks; sa, sb and so are the row strides of A, B and Out.
+void mat_add(float const *A, unsigned sa, float const *B, unsigned sb, float *Out, unsigned so, unsigned m) {
+  for (unsigned row = 0; row < m; ++row) {
+    for (unsigned col = 0; col < m; ++col) {
+      Out[so * row + col] = A[sa * row + col] + B[sb * row + col];
+    }
+  }
+}
+
+void mat_sub(float const *A, unsigned sa, float const *B, unsigned sb, float *Out, unsigned so, unsigned m) {
+  for (unsigned row = 0; row < m; ++row) {
+    for (unsigned col = 0; col < m; ++col) {
+      Out[so * row + col] = A[sa * row + col] - B[sb * row + col]; // element-wise A - B, each with its own stride
+    }
+  }
+}
+
+void naive_multiply(
+    float const *A, unsigned sa, float const *B, unsigned sb, float *C, unsigned sc, unsigned n) {
+  for (unsigned row = 0; row < n; ++row) {
+    for (unsigned col = 0; col < n; ++col) {
+      float dot = 0; // row `row` of A times column `col` of B, accumulated in increasing k
+      for (unsigned k = 0; k < n; ++k) {
+        dot += A[sa * row + k] * B[sb * k + col];
+      }
+      C[sc * row + col] = dot;
+    }
+  }
+}
+
+void strassen(float const *A, unsigned sa, float const *B, unsigned sb, float *C, unsigned sc, unsigned n) {
+  // Computes C = A * B for n x n matrices with row strides sa, sb and sc, via seven recursive products.
+  if (n <= strassen_cutoff) {
+    naive_multiply(A, sa, B, sb, C, sc, n);
+    return;
+  }
+  // NOTE(review): n / 2 truncates, so an odd n above the cutoff would skip the last row/column — confirm callers.
+  unsigned m = n / 2;
+  // block(p, s, i, j): pointer to quadrant (i, j) of a matrix stored with row stride s.
+  auto block = [m](auto *p, unsigned s, unsigned i, unsigned j) {
+    return p + i * m * s + j * m;
+  };
+
+  float const *A11 = block(A, sa, 0, 0);
+  float const *A12 = block(A, sa, 0, 1);
+  float const *A21 = block(A, sa, 1, 0);
+  float const *A22 = block(A, sa, 1, 1);
+  float const *B11 = block(B, sb, 0, 0);
+  float const *B12 = block(B, sb, 0, 1);
+  float const *B21 = block(B, sb, 1, 0);
+  float const *B22 = block(B, sb, 1, 1);
+  float *C11 = block(C, sc, 0, 0);
+  float *C12 = block(C, sc, 0, 1);
+  float *C21 = block(C, sc, 1, 0);
+  float *C22 = block(C, sc, 1, 1);
+  // One allocation holds nine contiguous m x m scratch matrices (row stride m): T1, T2 and M1..M7.
+  std::vector<float> buf(static_cast<std::size_t>(m) * m * 9);
+  float *T1 = buf.data();
+  float *T2 = T1 + static_cast<std::size_t>(m) * m;
+  float *M1 = T2 + static_cast<std::size_t>(m) * m;
+  float *M2 = M1 + static_cast<std::size_t>(m) * m;
+  float *M3 = M2 + static_cast<std::size_t>(m) * m;
+  float *M4 = M3 + static_cast<std::size_t>(m) * m;
+  float *M5 = M4 + static_cast<std::size_t>(m) * m;
+  float *M6 = M5 + static_cast<std::size_t>(m) * m;
+  float *M7 = M6 + static_cast<std::size_t>(m) * m;
+
+  // M1 = (A11 + A22)(B11 + B22)
+  mat_add(A11, sa, A22, sa, T1, m, m);
+  mat_add(B11, sb, B22, sb, T2, m, m);
+  strassen(T1, m, T2, m, M1, m, m);
+
+  // M2 = (A21 + A22) B11
+  mat_add(A21, sa, A22, sa, T1, m, m);
+  strassen(T1, m, B11, sb, M2, m, m);
+
+  // M3 = A11 (B12 - B22)
+  mat_sub(B12, sb, B22, sb, T2, m, m);
+  strassen(A11, sa, T2, m, M3, m, m);
+
+  // M4 = A22 (B21 - B11)
+  mat_sub(B21, sb, B11, sb, T2, m, m);
+  strassen(A22, sa, T2, m, M4, m, m);
+
+  // M5 = (A11 + A12) B22
+  mat_add(A11, sa, A12, sa, T1, m, m);
+  strassen(T1, m, B22, sb, M5, m, m);
+
+  // M6 = (A21 - A11)(B11 + B12)
+  mat_sub(A21, sa, A11, sa, T1, m, m);
+  mat_add(B11, sb, B12, sb, T2, m, m);
+  strassen(T1, m, T2, m, M6, m, m);
+
+  // M7 = (A12 - A22)(B21 + B22)
+  mat_sub(A12, sa, A22, sa, T1, m, m);
+  mat_add(B21, sb, B22, sb, T2, m, m);
+  strassen(T1, m, T2, m, M7, m, m);
+
+  // Combine the seven products into the four quadrants of C (k indexes the stride-m scratch matrices).
+  for (unsigned i = 0; i < m; ++i) {
+    for (unsigned j = 0; j < m; ++j) {
+      std::size_t k = static_cast<std::size_t>(i) * m + j;
+      C11[i * sc + j] = M1[k] + M4[k] - M5[k] + M7[k];
+      C12[i * sc + j] = M3[k] + M5[k];
+      C21[i * sc + j] = M2[k] + M4[k];
+      C22[i * sc + j] = M1[k] - M2[k] + M3[k] + M6[k];
+    }
+  }
+}
+
+template <typename = void>
+void strassen_serial(benchmark::State &state) {
+  run_matmul(state, 1e-3f, [](float const *A, float const *B, float *C, unsigned n) {
+    strassen(A, n, B, n, C, n, n); // all three matrices use row stride n at the top level
+  });
+}
+
+} // namespace
+
+BENCH_ALL(strassen_serial, serial, strassen, strassen)
diff --git a/benchmark/src/serial/uts.cpp b/benchmark/src/serial/uts.cpp
new file mode 100644
index 000000000..f404e806a
--- /dev/null
+++ b/benchmark/src/serial/uts.cpp
@@ -0,0 +1,101 @@
+#include <benchmark/benchmark.h>
+
+#include "macros.hpp"
+#include "uts.hpp"
+
+import std;
+
+namespace {
+
+auto uts_traverse(int depth, Node *parent) -> result {
+  // Serial count of the subtree rooted at `parent`, with one heap-allocated vector of child slots per node.
+  result r{.maxdepth = static_cast<counter_t>(depth), .size = counter_t{1}, .leaves = counter_t{0}};
+
+  int num_children = uts_numChildren(parent);
+  int child_type = uts_childType(parent);
+
+  parent->numChildren = num_children;
+
+  if (num_children > 0) {
+    std::vector<pair> cs(static_cast<std::size_t>(num_children));
+    // Each slot of cs holds one child node together with that child's result.
+    for (std::size_t i = 0; i < static_cast<std::size_t>(num_children); ++i) {
+      cs[i].child.type = child_type;
+      cs[i].child.height = parent->height + 1;
+      cs[i].child.numChildren = -1;
+      // rng_spawn is invoked computeGranularity times with the same arguments for child i.
+      for (int j = 0; j < computeGranularity; ++j) {
+        rng_spawn(parent->state.state, cs[i].child.state.state, static_cast<int>(i));
+      }
+
+      cs[i].res = uts_traverse(depth + 1, &cs[i].child);
+    }
+    // Fold the children's results into this node's result.
+    for (auto &&elem : cs) {
+      r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth);
+      r.size += elem.res.size;
+      r.leaves += elem.res.leaves;
+    }
+  } else {
+    r.leaves = 1;
+  }
+
+  return r;
+}
+
+void uts_serial(benchmark::State &state, uts_tree tree) {
+  run_uts(state, tree, [](Node *root) {
+    return uts_traverse(0, root); // the root is at depth 0
+  });
+}
+
+} // namespace
+
+UTS_BENCH_ALL(uts_serial, serial)
+
+namespace {
+
+auto uts_traverse_no_alloc(int depth, Node *parent) -> result {
+  // Serial count of the subtree rooted at `parent` that keeps each child in a stack-local `pair`.
+  result r{.maxdepth = static_cast<counter_t>(depth), .size = counter_t{1}, .leaves = counter_t{0}};
+
+  int num_children = uts_numChildren(parent);
+  int child_type = uts_childType(parent);
+
+  parent->numChildren = num_children;
+
+  if (num_children > 0) {
+    for (std::size_t i = 0; i < static_cast<std::size_t>(num_children); ++i) {
+      // One child at a time: its node and result live only for this iteration.
+      pair cs;
+
+      cs.child.type = child_type;
+      cs.child.height = parent->height + 1;
+      cs.child.numChildren = -1;
+      // rng_spawn is invoked computeGranularity times with the same arguments for child i.
+      for (int j = 0; j < computeGranularity; ++j) {
+        rng_spawn(parent->state.state, cs.child.state.state, static_cast<int>(i));
+      }
+      // Recurse into this function, not uts_traverse, so no level of the tree allocates a vector.
+      cs.res = uts_traverse_no_alloc(depth + 1, &cs.child);
+
+      r.maxdepth = std::max(r.maxdepth, cs.res.maxdepth);
+      r.size += cs.res.size;
+      r.leaves += cs.res.leaves;
+    }
+  } else {
+    r.leaves = 1;
+  }
+
+  return r;
+}
+
+void uts_serial_no_alloc(benchmark::State &state, uts_tree tree) {
+  run_uts(state, tree, [](Node *root) {
+    return uts_traverse_no_alloc(0, root); // the root is at depth 0
+  });
+}
+
+} // namespace
+
+UTS_BENCH_ALL(uts_serial_no_alloc, serial / no_alloc)
diff --git a/cmake/gcc-brew-toolchain.cmake b/cmake/gcc-brew-toolchain.cmake
new file mode 100644
index 000000000..aa67ccaaf
--- /dev/null
+++ b/cmake/gcc-brew-toolchain.cmake
@@ -0,0 +1,92 @@
+cmake_minimum_required(VERSION 4.2.1)
+
+# Set up Homebrew GCC (the *-HEAD binaries under `brew --prefix gcc`) & Ninja toolchain for CMake
+
+find_program(BREW_EXE brew)
+
+if(NOT BREW_EXE)
+  message(FATAL_ERROR "Could not find 'brew' executable. Please install Homebrew.")
+endif()
+
+# --- Ninja
+
+execute_process(
+  COMMAND ${BREW_EXE} --prefix ninja
+  OUTPUT_VARIABLE NINJA_PREFIX
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND_ERROR_IS_FATAL ANY
+)
+# NO_DEFAULT_PATH: only the Homebrew prefix is searched, never the system PATH.
+find_program(CMAKE_MAKE_PROGRAM
+  NAMES ninja
+  HINTS "${NINJA_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+# --- GCC
+
+execute_process(
+  COMMAND ${BREW_EXE} --prefix gcc
+  OUTPUT_VARIABLE GCC_PREFIX
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND_ERROR_IS_FATAL ANY
+)
+# Compiler drivers and the gcc-* wrappers for ar / ranlib / nm, all taken from the same prefix.
+find_program(CMAKE_C_COMPILER
+  NAMES gcc-HEAD
+  HINTS "${GCC_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_CXX_COMPILER
+  NAMES g++-HEAD
+  HINTS "${GCC_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_AR
+  NAMES gcc-ar-HEAD
+  HINTS "${GCC_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_RANLIB
+  NAMES gcc-ranlib-HEAD
+  HINTS "${GCC_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_NM
+  NAMES gcc-nm-HEAD
+  HINTS "${GCC_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+# --- Binutils
+
+execute_process(
+  COMMAND ${BREW_EXE} --prefix binutils
+  OUTPUT_VARIABLE BINUTILS_PREFIX
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND_ERROR_IS_FATAL ANY
+)
+# -B adds the Homebrew binutils directory to the compiler driver's program search path.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -B${BINUTILS_PREFIX}/bin/" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -B${BINUTILS_PREFIX}/bin/" CACHE STRING "" FORCE)
+# NOTE(review): FORCE-appending to the cached flags looks like it adds another -B on every
+# re-configure — consider CMAKE_<LANG>_FLAGS_INIT; confirm before changing.
+# Get macOS SDK path (only on macOS)
+if(APPLE)
+  execute_process(
+    COMMAND xcrun --show-sdk-path
+    OUTPUT_VARIABLE CMAKE_OSX_SYSROOT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    COMMAND_ERROR_IS_FATAL ANY
+  )
+endif()
diff --git a/cmake/llvm-brew-toolchain.cmake b/cmake/llvm-brew-toolchain.cmake
new file mode 100644
index 000000000..199bdae34
--- /dev/null
+++ b/cmake/llvm-brew-toolchain.cmake
@@ -0,0 +1,88 @@
+cmake_minimum_required(VERSION 4.2.1)
+
+# Set up Homebrew LLVM & Ninja toolchain for CMake
+
+find_program(BREW_EXE brew) # every prefix lookup below shells out to this executable
+
+if(NOT BREW_EXE)
+  message(FATAL_ERROR "Could not find 'brew' executable. Please install Homebrew.")
+endif()
+
+# --- Ninja: ask brew for ninja's prefix and use only that copy as the build tool (no PATH fallback)
+
+execute_process(
+  COMMAND ${BREW_EXE} --prefix ninja
+  OUTPUT_VARIABLE NINJA_PREFIX
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND_ERROR_IS_FATAL ANY
+)
+
+find_program(CMAKE_MAKE_PROGRAM
+  NAMES ninja
+  HINTS "${NINJA_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+# --- LLVM: clang/clang++ plus llvm-ar/ranlib/nm, all taken only from brew's llvm prefix (no PATH fallback)
+
+execute_process(
+  COMMAND ${BREW_EXE} --prefix llvm
+  OUTPUT_VARIABLE LLVM_PREFIX
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND_ERROR_IS_FATAL ANY
+)
+
+find_program(CMAKE_C_COMPILER
+  NAMES clang
+  HINTS "${LLVM_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_CXX_COMPILER
+  NAMES clang++
+  HINTS "${LLVM_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_AR
+  NAMES llvm-ar
+  HINTS "${LLVM_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_RANLIB
+  NAMES llvm-ranlib
+  HINTS "${LLVM_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+find_program(CMAKE_NM
+  NAMES llvm-nm
+  HINTS "${LLVM_PREFIX}/bin"
+  NO_DEFAULT_PATH
+  REQUIRED
+)
+
+
+# Dynamically find the standard library modules JSON, brew puts it in the wrong place.
+# The glob yields a list, so take a single entry: CMake expects exactly one path here.
+file(GLOB_RECURSE LIBCXX_MODULES_JSON "${LLVM_PREFIX}/lib/**/libc++.modules.json")
+if(LIBCXX_MODULES_JSON)
+  list(GET LIBCXX_MODULES_JSON 0 CMAKE_CXX_STDLIB_MODULES_JSON)
+else()
+  message(FATAL_ERROR "Could not automatically find libc++.modules.json in ${LLVM_PREFIX}")
+endif()
+
+# Set CMAKE_OSX_SYSROOT to the SDK path printed by `xcrun --show-sdk-path` (skipped unless APPLE is set)
+if(APPLE)
+  execute_process(
+    COMMAND xcrun --show-sdk-path
+    OUTPUT_VARIABLE CMAKE_OSX_SYSROOT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    COMMAND_ERROR_IS_FATAL ANY
+  )
+endif()
diff --git a/.legacy/ChangeLog.md b/docs/ChangeLog.md
similarity index 98%
rename from .legacy/ChangeLog.md
rename to docs/ChangeLog.md
index 90e1a3606..c186137ab 100644
--- a/.legacy/ChangeLog.md
+++ b/docs/ChangeLog.md
@@ -1,3 +1,7 @@
+---
+icon: lucide/notebook-pen
+---
+
 # Changelog
 
 <!-- ## [**Version x.x.x**](https://github.com/ConorWilliams/libfork/compare/v2.1.0...dev)
@@ -19,7 +23,6 @@
 - More benchmarks (TooManyCooks).
 - Benchmarks graphs in readme [Issue #14](https://github.com/ConorWilliams/libfork/issues/14).
 
-
 ### Changed
 
 ### Removed
@@ -28,9 +31,9 @@
 
 ### Bugfixes
 
-- Fixed checks in benchmarks (UTS was reported as failing but the results were correct). 
+- Fixed checks in benchmarks (UTS was reported as failing but the results were correct).
 
-### Meta 
+### Meta
 
 - Moved vcpkg submodule to https [PR #12](https://github.com/ConorWilliams/libfork/pull/12)
 
diff --git a/docs/api/algorithm.md b/docs/api/algorithm.md
new file mode 100644
index 000000000..fff952a3a
--- /dev/null
+++ b/docs/api/algorithm.md
@@ -0,0 +1,121 @@
+---
+icon: lucide/workflow
+---
+
+# Algorithm
+
+The algorithm module provides fork-join algorithms over sized random-access
+ranges. Algorithms are async functions and are normally launched with
+`lf::schedule` or called from another task through a scope.
+
+## Concepts
+
+### `sized_random_access_range<T>`
+
+`T` satisfies both `std::ranges::random_access_range` and
+`std::ranges::sized_range`.
+
+## `for_each`
+
+### `inline constexpr for_each_impl for_each`
+
+Applies a function to every element in a sized random-access range or
+iterator/sentinel pair.
+
+Overload shapes:
+
+```cpp
+lf::for_each(env, first, last, chunk_size, fn);
+lf::for_each(env, first, last, fn);
+lf::for_each(env, range, chunk_size, fn);
+lf::for_each(env, range, fn);
+```
+
+When `fn` is an async invocable, `for_each` calls it through libfork scopes.
+Otherwise it invokes `fn` synchronously.
+
+```cpp
+std::vector<int> values{1, 2, 3};
+auto recv = lf::schedule(pool, lf::for_each, std::span(values), [](int& x) {
+  x *= 2;
+});
+std::move(recv).get();
+```
+
+The chunk-size overload assumes `chunk_size > 0`. Use the overload without a
+chunk size for the single-element base case.
+
+## `fold`
+
+### `fold_chunk_error`
+
+Thrown when a public `fold` overload receives a non-positive chunk size.
+
+### `inline constexpr fold_fn fold`
+
+Reduces a sized random-access range or iterator/sentinel pair with a semigroup
+operation. The result is `std::optional<R>`: empty input returns `std::nullopt`,
+and non-empty input returns the reduced value.
+
+Overload shapes:
+
+```cpp
+lf::fold(env, first, last, chunk_size, binary_op, projection = {});
+lf::fold(env, first, last, binary_op, projection = {});
+lf::fold(env, range, chunk_size, binary_op, projection = {});
+lf::fold(env, range, binary_op, projection = {});
+```
+
+The binary operation may be synchronous or async. The projection may also be
+synchronous or async. The projected value and binary operation must satisfy
+libfork's indirect semigroup constraints.
+
+```cpp
+std::vector<int> values{1, 2, 3, 4};
+
+auto recv = lf::schedule(pool, lf::fold, std::span(values), std::plus<>{});
+std::optional<int> sum = std::move(recv).get();
+```
+
+With projection:
+
+```cpp
+struct record {
+  int value;
+};
+
+std::vector<record> records{{1}, {2}, {3}};
+auto recv = lf::schedule(
+    pool,
+    lf::fold,
+    std::span(records),
+    std::plus<>{},
+    &record::value);
+```
+
+For overloads with an explicit chunk size, `chunk_size <= 0` throws
+`fold_chunk_error` before checking whether the range is empty.
+
+## Async operations
+
+Both algorithms understand libfork async callables. For example, an async
+projection can be used with `fold`:
+
+```cpp
+struct square {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>, int value) -> lf::task<int, Context> {
+    co_return value * value;
+  }
+};
+
+auto recv = lf::schedule(
+    pool,
+    lf::fold,
+    std::span(values),
+    std::plus<>{},
+    square{});
+```
+
+If a callable is both synchronously and asynchronously invocable, the algorithm
+implementation may prefer the async path where its constraints select it.
diff --git a/docs/api/batteries.md b/docs/api/batteries.md
new file mode 100644
index 000000000..381a040e8
--- /dev/null
+++ b/docs/api/batteries.md
@@ -0,0 +1,171 @@
+---
+icon: lucide/battery-charging
+---
+
+# Batteries
+
+Batteries provide ready-to-use stacks, deques, context policies, and worker
+contexts.
+
+## Stacks
+
+All stack classes are allocator-aware templates over `std::byte` allocators and
+satisfy `worker_stack` when the allocator's void pointer type is `void*`.
+
+### `geometric_stack<Allocator = std::allocator<std::byte>>`
+
+Segmented user-space stack with geometric growth and one cached segment to
+avoid hot splitting. This is the general-purpose stack choice.
+
+Main operations:
+
+- `empty() const noexcept -> bool`
+- `checkpoint() noexcept`
+- `push(std::size_t) -> void*`
+- `pop(void*, std::size_t) noexcept -> void`
+- `prepare_release() noexcept`
+- `release(release_t) noexcept`
+- `acquire(checkpoint_t) noexcept`
+
+### `slab_stack<Allocator = std::allocator<std::byte>>`
+
+Fixed-size slab-backed stack. It allocates one slab at construction and throws
+`std::bad_alloc` if a push exceeds the slab.
+
+Constructors:
+
+- `slab_stack()`
+- `explicit slab_stack(diff_type num_nodes, Allocator const& = Allocator())`
+
+Main operations match `geometric_stack`.
+
+### `adaptor_stack<Allocator = std::allocator<std::byte>>`
+
+Thin wrapper over an allocator. Every push allocates and every pop deallocates.
+It is simple and useful for testing or comparison.
+
+Main operations match `geometric_stack`, except release/acquire only propagate
+allocator state when needed.
+
+## Work-stealing deque
+
+### `concept dequeable<T>`
+
+`T` can be stored in `lf::deque`. It must be lock-free and default-initializable.
+
+### `deque_full_error`
+
+Thrown when `deque::push` cannot add another element because the deque is full.
+
+### `enum class err`
+
+Result code for stealing:
+
+- `none`: steal succeeded.
+- `lost`: another thief won the race.
+- `empty`: the deque was empty.
+
+### `steal_t<T>`
+
+Return type of `thief_handle::steal`. It supports `operator bool`, `operator*`,
+and `operator->`.
+
+Fields:
+
+- `err code`
+- `T val`
+
+`val` is valid only when `code == err::none`.
+
+### `deque<T, Allocator = std::allocator<std::atomic<T>>>`
+
+Bounded Chase-Lev single-producer multiple-consumer work-stealing deque.
+
+The owner thread uses:
+
+- `empty() const noexcept -> bool`
+- `size() const noexcept -> size_type`
+- `ssize() const noexcept -> diff_type`
+- `capacity() const noexcept -> diff_type`
+- `push(T) -> diff_type`
+- `pop(Fn when_empty = {}) -> invoke_result_t<Fn>`
+- `thief() noexcept -> thief_handle`
+
+Non-owner threads steal through `deque::thief_handle`, which provides:
+
+- `empty() noexcept -> bool`
+- `size() noexcept -> size_type`
+- `ssize() noexcept -> diff_type`
+- `capacity() noexcept -> diff_type`
+- `steal() noexcept -> steal_t<T>`
+
+All threads must stop using a deque before it is destroyed.
+
+## Deque adaptors
+
+### `adapt_vector<Allocator = std::allocator<unsafe_steal_handle>>`
+
+`std::vector`-backed LIFO context policy. It supports:
+
+- `push(unsafe_steal_handle)`
+- `pop() noexcept -> unsafe_steal_handle`
+
+Use it for inline/single-threaded contexts where stealing is unnecessary.
+
+### `adapt_deque<Allocator = std::allocator<std::atomic<unsafe_steal_handle>>>`
+
+Lock-free deque-backed context policy. It supports:
+
+- `push(unsafe_steal_handle)`
+- `pop() noexcept -> unsafe_steal_handle`
+- `steal() noexcept -> unsafe_steal_handle`
+
+The default capacity is `32 * 1024` handles. The explicit constructor accepts a
+capacity and allocator.
+
+## Context policies
+
+### `concept deque_policy<T>`
+
+A LIFO stack over `unsafe_steal_handle`. Used by contexts to store work without
+naming the full context type.
+
+### `concept stealable_deque_policy<T>`
+
+Extends `deque_policy` with:
+
+```cpp
+auto steal() -> lf::unsafe_steal_handle;
+```
+
+Use a stealable policy for multi-worker schedulers.
+
+## Contexts
+
+### `derived_poly_context<Stack, Deque>`
+
+Polymorphic worker context composed of a `Stack` and `Deque`. It derives from
+`poly_context<Stack>`, implements virtual `push`/`pop`, and exposes `steal()`
+when `Deque` is stealable.
+
+Aliases:
+
+- `context_type = poly_context<Stack>`
+
+### `mono_context<Stack, Deque>`
+
+Monomorphic worker context composed of a `Stack` and `Deque`. It implements
+`push`/`pop` directly and exposes `steal()` when `Deque` is stealable.
+
+Aliases:
+
+- `context_type = mono_context`
+
+Both context classes support piecewise construction of their stack and deque:
+
+```cpp
+lf::mono_context<lf::geometric_stack<>, lf::adapt_deque<>> ctx{
+    std::piecewise_construct,
+    std::tuple{},
+    std::tuple{1024}};
+```
diff --git a/docs/api/core/cancellation.md b/docs/api/core/cancellation.md
new file mode 100644
index 000000000..f6d24316d
--- /dev/null
+++ b/docs/api/core/cancellation.md
@@ -0,0 +1,128 @@
+---
+icon: lucide/octagon-x
+---
+
+# Cancellation
+
+Libfork cancellation is cooperative. `stop_source` records a request, and tasks
+observe it through stop tokens propagated through scopes. Cancellation never
+asynchronously interrupts running user code.
+
+## `stop_source`
+
+```cpp
+class stop_source {
+ public:
+  class stop_token;
+
+  constexpr stop_source() noexcept;
+  constexpr explicit stop_source(stop_token parent) noexcept;
+
+  stop_source(stop_source const&) = delete;
+  stop_source(stop_source&&) = delete;
+  auto operator=(stop_source const&) -> stop_source& = delete;
+  auto operator=(stop_source&&) -> stop_source& = delete;
+
+  auto token() const noexcept -> stop_token;
+  auto stop_requested() const noexcept -> bool;
+  auto request_stop() noexcept -> void;
+  auto race_request_stop() noexcept -> bool;
+};
+```
+
+`stop_source` is a small linked cancellation source. A source may be constructed
+with a parent token; `stop_requested()` then checks this source and every parent
+source in the chain.
+
+Root stoppability is created with [`recv_state<T, true>`](receiver.md#recv_state).
+Nested stoppability is created with [`child_scope()`](scope.md#child_scope).
+
+```cpp
+template <lf::worker_context Context>
+auto root(lf::env<Context>) -> lf::task<void, Context> {
+  auto sc = co_await lf::child_scope();
+
+  co_await sc.fork_drop(worker{});
+  sc.request_stop();
+  co_await sc.join();
+}
+```
+
+### `token`
+
+```cpp
+auto token() const noexcept -> stop_token;
+```
+
+Returns a non-owning token referring to this source. Child scopes store tokens
+to propagate cancellation.
+
+### `stop_requested`
+
+```cpp
+auto stop_requested() const noexcept -> bool;
+```
+
+Returns true if this source or any ancestor source has been stopped. The check
+is linear in the parent-chain depth.
+
+### `request_stop`
+
+```cpp
+auto request_stop() noexcept -> void;
+```
+
+Requests cancellation for this source and its descendants. Calling it more than
+once is allowed.
+
+### `race_request_stop`
+
+```cpp
+auto race_request_stop() noexcept -> bool;
+```
+
+Requests cancellation and returns true only for the first caller that changed
+the source from not-stopped to stopped. Use `request_stop()` when the return
+value is not needed.
+
+## `stop_source::stop_token`
+
+```cpp
+class stop_source::stop_token {
+ public:
+  constexpr stop_token() noexcept;
+
+  auto stop_possible() const noexcept -> bool;
+  auto stop_requested() const noexcept -> bool;
+};
+```
+
+`stop_token` is a pointer-sized non-owning handle to a stop-source chain. A
+default-constructed token is unstoppable.
+
+### `stop_possible`
+
+```cpp
+auto stop_possible() const noexcept -> bool;
+```
+
+Returns true when the token refers to a stop source.
+
+### `stop_requested`
+
+```cpp
+auto stop_requested() const noexcept -> bool;
+```
+
+Returns true if any source in the chain has been stopped. A null token always
+returns false.
+
+## Propagation rules
+
+Normal [`scope()`](scope.md#scope) children inherit the parent's current stop
+token. [`child_scope()`](scope.md#child_scope) inserts a new stop source between
+the parent token and any children launched by that scope.
+
+If a child task has not started and its stop token is already stopped, libfork
+destroys the child frame without resuming it. At `join`, a cancelled task may
+stop resuming the cancelled subtree after required cleanup.
diff --git a/docs/api/core/concepts.md b/docs/api/core/concepts.md
new file mode 100644
index 000000000..b0b4a1555
--- /dev/null
+++ b/docs/api/core/concepts.md
@@ -0,0 +1,276 @@
+---
+icon: lucide/puzzle
+---
+
+# Concepts
+
+`libfork.core` exposes the concepts used by tasks, schedulers, algorithms, and
+custom integrations. They are public API because user-defined schedulers,
+contexts, stacks, awaitables, projections, and algorithms are expected to use
+the same constraints as libfork itself.
+
+## Returnable
+
+```cpp
+template <typename T>
+concept returnable = std::is_void_v<T> || (/*plain-object*/<T> && std::movable<T>);
+```
+
+Types suitable for use as the `T` in [`task<T, Context>`](task.md). `void` is
+allowed. Non-void return types must be movable plain object types.
+
+### Plain-object
+
+An exposition-only concept used throughout libfork:
+
+```cpp
+template <typename T>
+concept /*plain-object*/ =
+    std::is_object_v<T> &&
+    std::same_as<T, std::remove_reference_t<T>>;
+```
+
+## Worker stacks
+
+```cpp
+template <typename T>
+concept worker_stack = /* ... */;
+```
+
+`worker_stack` defines the stack API used to allocate coroutine frames. A stack
+implementation must provide:
+
+```cpp
+auto push(std::size_t bytes) -> void*;
+auto pop(void* ptr, std::size_t bytes) noexcept -> void;
+auto checkpoint() noexcept -> std::regular auto;
+auto prepare_release() noexcept -> std::movable auto;
+auto release(decltype(prepare_release())) noexcept -> void;
+auto acquire(decltype(checkpoint()) const&) noexcept -> void;
+```
+
+The fast path is `push`, `pop`, and `checkpoint`. `prepare_release`, `release`,
+and `acquire` are used when continuation stealing transfers stack ownership
+between workers.
+
+Stack checkpoints must be cheap to copy. A default-constructed checkpoint is a
+null state that compares equal only to other null checkpoints. Non-null
+checkpoints compare equal when they refer to the same underlying stack allocation source.
+
+## Worker contexts
+
+```cpp
+template <typename T, typename U>
+concept lifo_stack = /* ... */;
+
+template <typename T>
+concept worker_context = /* ... */;
+
+template <worker_context T>
+using stack_t = std::remove_reference_t<decltype(std::declval<T&>().stack())>;
+```
+
+`lifo_stack<T, U>` requires:
+
+```cpp
+context.push(value); // -> void
+context.pop();       // noexcept -> U
+```
+
+`worker_context<T>` refines that shape for `steal_handle<T>` and also requires
+access to a worker stack:
+
+```cpp
+auto stack() noexcept -> Stack&; // Stack models worker_stack
+```
+
+Contexts are the per-worker execution state. They store stealable
+continuations, expose the coroutine frame stack, and are temporarily bound to a
+thread while `execute` resumes a task.
+
+## Schedulers
+
+```cpp
+template <typename T>
+concept has_context_typedef = requires {
+  typename std::remove_cvref_t<T>::context_type;
+};
+
+template <has_context_typedef T>
+using context_t = typename std::remove_cvref_t<T>::context_type;
+
+template <typename Sch>
+concept scheduler = /* ... */;
+```
+
+A scheduler provides a `context_type` and can post a scheduled root task:
+
+```cpp
+void post(lf::sched_handle<context_type>);
+```
+
+`post` must provide the strong exception guarantee. If it returns normally, the
+task associated with the handle must eventually be resumed by a worker using
+[`execute`](scheduling.md#execute).
+
+## Async invocables
+
+```cpp
+template <typename Fn, typename Context, typename... Args>
+concept async_invocable = /* ... */;
+
+template <typename Fn, typename Context, typename... Args>
+concept async_regular_invocable = async_invocable<Fn, Context, Args...>;
+
+template <typename Fn, typename Context, typename... Args>
+concept async_nothrow_invocable = /* ... */;
+
+template <typename Fn, typename Context, typename... Args>
+using async_result_t = /* task value_type */;
+
+template <typename Fn, typename R, typename Context, typename... Args>
+concept async_invocable_to = /* ... */;
+
+template <typename Fn, typename R, typename Context, typename... Args>
+concept async_nothrow_invocable_to = /* ... */;
+```
+
+`async_invocable` checks whether `Fn` can be called as a libfork task for the
+given `Context` and arguments. The result must be `task<R, Context>` for some
+`R`.
+
+When overload resolution is checked, libfork first tries to call:
+
+```cpp
+fn(lf::env<Context>{}, args...);
+```
+
+If that is not viable, it checks:
+
+```cpp
+fn(args...);
+```
+
+This is what makes [`env<Context>`](env.md) useful for context-generic tasks.
+`async_result_t` is the `value_type` of the returned task.
+
+## Awaitables
+
+```cpp
+template <awaitable_acquirable T>
+constexpr auto acquire_awaitable(T&& t);
+
+template <typename T, typename Context>
+concept awaitable = /* ... */;
+```
+
+`acquire_awaitable` implements the same acquisition rule used by `co_await`:
+
+- if `t.operator co_await()` is available, use it;
+- otherwise, if `operator co_await(t)` is available, use it;
+- otherwise, treat `t` itself as the awaitable.
+
+`awaitable<T, Context>` is libfork's context-switching awaitable concept. The
+acquired awaitable must be storable and provide:
+
+```cpp
+auto await_ready() -> bool-convertible;
+auto await_suspend(lf::sched_handle<Context>, Context&) -> void;
+auto await_resume();
+```
+
+!!! warning
+    `await_suspend` must not complete the same coroutine inline. A custom
+    awaitable hands the suspended task to another scheduling path, which must
+    later resume it with `execute`.
+
+## Indirect invocables
+
+```cpp
+namespace sync {
+  template <typename Fn, typename I>
+  concept indirectly_unary_invocable = /* ... */;
+
+  template <typename Fn, typename I>
+  concept indirectly_regular_unary_invocable = /* ... */;
+}
+
+namespace async {
+  template <typename Fn, typename Context, typename I>
+  concept indirectly_unary_invocable = /* ... */;
+
+  template <typename Fn, typename Context, typename I>
+  concept indirectly_regular_unary_invocable = /* ... */;
+}
+
+template <typename Fn, typename Context, typename I>
+concept indirectly_unary_invocable =
+    sync::indirectly_unary_invocable<Fn, I> ||
+    async::indirectly_unary_invocable<Fn, Context, I>;
+
+template <typename Fn, typename Context, typename I>
+concept indirectly_regular_unary_invocable =
+    sync::indirectly_regular_unary_invocable<Fn, I> ||
+    async::indirectly_regular_unary_invocable<Fn, Context, I>;
+```
+
+These concepts mirror the standard indirect callable concepts, but also support
+libfork's async projections and the `indirect_value_t` customization used by
+[`projected`](projected.md).
+
+The combined concepts accept either a synchronous callable or an asynchronous
+callable. If both are viable, libfork algorithms generally prefer the async
+form.
+
+## Semigroups
+
+```cpp
+namespace sync {
+  template <typename Fn, typename I>
+  concept indirect_semigroup = /* ... */;
+}
+
+namespace async {
+  template <typename Fn, typename Context, typename I>
+  concept indirect_semigroup = /* ... */;
+}
+
+template <typename Fn, typename Context, typename I>
+concept indirect_semigroup =
+    sync::indirect_semigroup<Fn, I> ||
+    async::indirect_semigroup<Fn, Context, I>;
+
+template <typename Fn, typename Context, typename I>
+concept indirect_commutative_semigroup = indirect_semigroup<Fn, Context, I>;
+
+template <typename Fn, typename Context, typename I>
+using indirect_semigroup_t = /* operation result type */;
+```
+
+An indirect semigroup is an indirectly readable input plus an associative binary
+operation that is closed over all combinations of:
+
+- the projected indirect value type;
+- the iterator reference type;
+- the operation result type.
+
+The async variant requires those combinations to be valid libfork task
+invocations. `indirect_commutative_semigroup` is a semantic refinement: the
+type system cannot prove commutativity, so users are responsible for supplying
+an operation where `a op b == b op a`.
+
+`indirect_semigroup_t` names the result type of applying the operation to two
+elements.
+
+## Projections
+
+```cpp
+template <typename Fn, typename Context, typename I>
+concept projectable = /* ... */;
+
+template <worker_context Context, std::indirectly_readable I, projectable<Context, I> Fn>
+using projected = /* ... */;
+```
+
+See [Projections](projected.md) for the public projection alias and the extra
+default-initialization rule for async projections.
diff --git a/docs/api/core/context.md b/docs/api/core/context.md
new file mode 100644
index 000000000..09daefa6f
--- /dev/null
+++ b/docs/api/core/context.md
@@ -0,0 +1,83 @@
+---
+icon: lucide/cpu
+---
+
+# Contexts
+
+Contexts are the worker-local objects used by `execute`. They own a coroutine
+frame stack and a LIFO collection of stealable continuations.
+
+The core module exposes reusable context base classes. Concrete contexts are in
+`libfork.batteries` and `libfork.schedulers`, but custom schedulers can build on
+these types directly.
+
+## `base_context`
+
+```cpp
+template <worker_stack Stack>
+class base_context {
+ public:
+  auto stack() noexcept -> Stack&;
+
+ protected:
+  constexpr base_context();
+
+  template <typename... Args>
+    requires std::constructible_from<Stack, Args...>
+  explicit constexpr base_context(Args&&... args);
+};
+```
+
+`base_context` stores a worker stack and exposes it through `stack()`. It is a
+convenience base for context implementations that want stack storage without
+committing to a particular queue or scheduler policy.
+
+```cpp
+class my_context : public lf::base_context<my_stack> {
+ public:
+  using base_context::base_context;
+
+  void push(lf::steal_handle<my_context>);
+  auto pop() noexcept -> lf::steal_handle<my_context>;
+};
+```
+
+`base_context<Stack>` does not itself model
+[`worker_context`](concepts.md#worker-contexts); derived types must add
+`push` and `pop`.
+
+## `poly_context`
+
+```cpp
+template <worker_stack Stack>
+class poly_context : public base_context<Stack> {
+ public:
+  using base_context<Stack>::base_context;
+
+  virtual void push(steal_handle<poly_context>) = 0;
+  virtual auto pop() noexcept -> steal_handle<poly_context> = 0;
+  virtual void post(sched_handle<poly_context> handle);
+
+  virtual ~poly_context() noexcept = default;
+};
+```
+
+`poly_context` is the standard polymorphic context base. It is polymorphic over
+`push`, `pop`, and optionally `post`, while the stack type remains part of the
+static type.
+
+The default `post` throws [`post_error`](#post_error). Scheduler-like derived
+contexts override it when they can accept externally scheduled root work.
+
+Use `poly_context` when a scheduler or adapter needs dynamic dispatch but still
+wants libfork's typed handle discipline.
+
+## `post_error`
+
+```cpp
+struct post_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown by `poly_context::post` when a derived context does not override posting.
diff --git a/docs/api/core/env.md b/docs/api/core/env.md
new file mode 100644
index 000000000..3307df7f9
--- /dev/null
+++ b/docs/api/core/env.md
@@ -0,0 +1,45 @@
+---
+icon: lucide/earth
+---
+
+# Env
+
+```cpp
+template <worker_context>
+struct env {};
+```
+
+See associated:
+
+- [worker_context](./concepts.md#worker-contexts)
+
+A tag type that helps you write context-generic code. Use it as the (templated)
+first parameter of a coroutine so that the
+[worker_context](./concepts.md#worker-contexts) type can be deduced. Libfork
+constructs this argument and passes it to the coroutine automatically.
+
+!!! note
+    This type is not user-constructible.
+
+!!! example
+    For example:
+    ```cpp
+    struct context_generic {
+      template <typename Context>
+      auto operator()(lf::env<Context>, int param) -> lf::task<void, Context> {
+        // ...
+      }
+    };
+    ```
+    Then this can be called from any context:
+    ```cpp linenums="1"
+    auto some_coro(/* args */) -> lf::task<void, some_context> {
+      // ...
+      co_await lf::invoke(context_generic{}, 42);
+      // ...
+    }
+    ```
+    !!! note
+        Here we defined `context_generic` as a function object so that we could
+        pass it to [lf::invoke](./invoke.md) without needing to specify template
+        parameters.
diff --git a/docs/api/core/exceptions.md b/docs/api/core/exceptions.md
new file mode 100644
index 000000000..a4397c8da
--- /dev/null
+++ b/docs/api/core/exceptions.md
@@ -0,0 +1,30 @@
+---
+icon: lucide/triangle-alert
+---
+
+# Exceptions
+
+Core exceptions derive from `libfork_exception`.
+
+## `libfork_exception`
+
+```cpp
+struct libfork_exception : std::exception {};
+```
+
+The base class for exceptions thrown by `libfork.core`.
+
+## Derived exceptions
+
+| Exception | Thrown by |
+| --- | --- |
+| `schedule_error` | `schedule` called from a worker thread already bound to the same context type |
+| `execute_error` | `execute` called recursively on a thread already bound to the same context type |
+| `steal_overflow_error` | a single task exceeds libfork's internal steal counter |
+| `root_alloc_error` | a root coroutine frame exceeds the receiver state's embedded buffer |
+| `broken_receiver_error` | receiver operations on an invalid receiver |
+| `operation_cancelled_error` | `receiver<T, true>::get()` after cancellation was requested |
+| `post_error` | default `poly_context::post` implementation |
+
+See the pages for [scheduling](scheduling.md), [receivers](receiver.md), and
+[contexts](context.md) for the exact operation-level behavior.
diff --git a/docs/api/core/handles.md b/docs/api/core/handles.md
new file mode 100644
index 000000000..35123557f
--- /dev/null
+++ b/docs/api/core/handles.md
@@ -0,0 +1,86 @@
+---
+icon: lucide/key-round
+---
+
+# Handles
+
+Handles are lightweight wrappers around libfork coroutine frames. They are used
+by context and scheduler implementations, not by ordinary task code.
+
+All handle types are nullable and testable with explicit `operator bool`.
+
+## `unsafe_steal_handle`
+
+```cpp
+struct unsafe_steal_handle {
+  constexpr unsafe_steal_handle() = default;
+  explicit constexpr operator bool() const noexcept;
+  auto operator==(unsafe_steal_handle const&) const noexcept -> bool = default;
+};
+```
+
+An untyped handle to a stealable continuation. It exists for erased storage
+policies such as generic deques.
+
+Prefer [`steal_handle<T>`](#steal_handle) whenever the context type is known.
+
+## `unsafe_sched_handle`
+
+```cpp
+struct unsafe_sched_handle {
+  constexpr unsafe_sched_handle() = default;
+  explicit constexpr operator bool() const noexcept;
+  auto operator==(unsafe_sched_handle const&) const noexcept -> bool = default;
+};
+```
+
+An untyped handle to scheduled work. It exists for erased storage policies.
+
+Prefer [`sched_handle<T>`](#sched_handle) whenever the context type is known.
+
+## `steal_handle`
+
+```cpp
+template <typename T>
+struct steal_handle : unsafe_steal_handle {
+  using unsafe_steal_handle::unsafe_steal_handle;
+};
+```
+
+A typed handle to a continuation that may be resumed with:
+
+```cpp
+lf::execute(context, handle);
+```
+
+The coroutine behind a `steal_handle<T>` is suspended at a fork point. Contexts
+store these handles in LIFO order and thieves may take them through a stealing
+policy.
+
+## `sched_handle`
+
+```cpp
+template <typename T>
+struct sched_handle : unsafe_sched_handle {
+  using unsafe_sched_handle::unsafe_sched_handle;
+};
+```
+
+A typed handle to scheduled work. The coroutine behind a `sched_handle<T>` is
+either a not-yet-started root task or a task suspended at a custom
+context-switching awaitable.
+
+Schedulers receive `sched_handle<context_type>` in `post`, store it as needed,
+and eventually resume it with `execute`.
+
+## Safety
+
+Handles do not own coroutine frames. They are only valid under the protocol that
+created them:
+
+- `sched_handle<T>` must eventually be resumed by a scheduler compatible with
+  `T`;
+- `steal_handle<T>` must be treated as consumed once a worker starts executing
+  it;
+- untyped handles should only be used inside storage adapters that restore the
+  correct typed handle before execution.
diff --git a/docs/api/core/index.md b/docs/api/core/index.md
new file mode 100644
index 000000000..629a65e2d
--- /dev/null
+++ b/docs/api/core/index.md
@@ -0,0 +1,54 @@
+---
+icon: lucide/blocks
+---
+
+# Core
+
+All public symbols documented here live in namespace `lf` and are reachable via:
+
+```cpp
+import libfork.core;
+```
+
+`libfork.core` is the minimal public module for writing libfork tasks and for
+building schedulers, contexts, stacks, and higher-level algorithms. It does not
+include concrete schedulers or stack implementations; those are provided by
+`libfork.batteries` and `libfork.schedulers`.
+
+Use this module directly when you want the core coroutine protocol without the
+standard batteries. Most applications can instead import the meta-module:
+
+```cpp
+import libfork;
+```
+
+## What core provides
+
+- [Tasks](task.md): `task<T, Context>` and `env<Context>`.
+- [Scopes](scope.md): `scope()`, `child_scope()`, `fork`, `call`, and `join`.
+- [Scheduling](scheduling.md): `schedule`, `execute`, and root-task errors.
+- [Receivers](receiver.md): `recv_state`, `receiver`, waiting, result retrieval,
+  and root cancellation.
+- [Cancellation](cancellation.md): `stop_source` and `stop_token`.
+- [Contexts](context.md): `base_context`, `poly_context`, and `post_error`.
+- [Handles](handles.md): typed and untyped task handles.
+- [Projections](projected.md): async-aware `projectable` and `projected`.
+- [Concepts](concepts.md): the constraints used by the public API.
+- [Exceptions](exceptions.md): the common exception hierarchy.
+
+## Execution model
+
+Core tasks are coroutines arranged into a strict fork-join tree. A task may
+create child tasks with `fork` or `call`, but those children are always joined
+before the parent can return. Libfork uses continuation stealing: a forked child
+runs immediately, while the parent's continuation becomes stealable work.
+
+The core module deliberately separates three roles:
+
+- a **task** is a coroutine with a `task<T, Context>` return type;
+- a **context** owns the worker-local stack and a LIFO queue of stealable
+  continuations;
+- a **scheduler** accepts root work and eventually resumes it with `execute`.
+
+That split is what lets `libfork.core` stay independent from any specific
+thread pool or work-stealing queue.
diff --git a/docs/api/core/invoke.md b/docs/api/core/invoke.md
new file mode 100644
index 000000000..4153dd373
--- /dev/null
+++ b/docs/api/core/invoke.md
@@ -0,0 +1,7 @@
+---
+icon: lucide/circle-play
+---
+
+# Invoke
+
+TODO: waiting on implementation
diff --git a/docs/api/core/projected.md b/docs/api/core/projected.md
new file mode 100644
index 000000000..77531e0db
--- /dev/null
+++ b/docs/api/core/projected.md
@@ -0,0 +1,58 @@
+---
+icon: lucide/filter
+---
+
+# Projections
+
+Libfork algorithms support projections that may be synchronous or asynchronous.
+The core module exposes the concepts and aliases used to model those projected
+iterator values.
+
+## `projectable`
+
+```cpp
+template <typename Fn, typename Context, typename I>
+concept projectable = /* ... */;
+```
+
+`projectable<Fn, Context, I>` checks that `Fn` can be used as a projection for
+the indirectly readable type `I`.
+
+Synchronous projections follow the usual standard-library shape. Async
+projections are libfork tasks and may receive `env<Context>`:
+
+```cpp
+struct square_async {
+  template <lf::worker_context Context>
+  auto operator()(lf::env<Context>, int x) const -> lf::task<int, Context> {
+    co_return x * x;
+  }
+};
+```
+
+Async projections must also produce default-initializable result types. Libfork
+algorithms need scratch storage for async projected values before child tasks
+write their results.
+
+## `projected`
+
+```cpp
+template <worker_context Context, std::indirectly_readable I, projectable<Context, I> Fn>
+using projected = /* exposition-only projected iterator type */;
+```
+
+`projected` is libfork's async-aware analogue of `std::projected`. It provides
+the associated value and reference types that indirect concepts use when a range
+is viewed through a projection.
+
+For synchronous callables, `projected` behaves like the standard projection
+machinery. For async callables, its value and reference types come from
+[`async_result_t`](concepts.md#async-invocables).
+
+```cpp
+using P = lf::projected<Context, std::vector<int>::iterator, square_async>;
+```
+
+Most users do not name `projected` directly. It is primarily useful when
+writing algorithms that should accept the same sync and async projection forms
+as libfork's built-in algorithms.
diff --git a/docs/api/core/receiver.md b/docs/api/core/receiver.md
new file mode 100644
index 000000000..a7d8e0400
--- /dev/null
+++ b/docs/api/core/receiver.md
@@ -0,0 +1,167 @@
+---
+icon: lucide/inbox
+---
+
+# Receivers
+
+Receivers are the completion handles returned by
+[`schedule`](scheduling.md#schedule). They are intentionally separate from
+tasks: a `task` belongs to the coroutine tree, while a `receiver` belongs to
+the outside caller waiting for a root task.
+
+## `recv_state`
+
+```cpp
+template <typename T, bool Stoppable = false>
+class recv_state {
+ public:
+  recv_state();
+
+  template <typename... Args>
+    requires std::constructible_from</*state*/, Args...>
+  explicit recv_state(Args&&... args);
+
+  template <simple_allocator Alloc>
+  recv_state(std::allocator_arg_t, Alloc const& alloc);
+
+  template <simple_allocator Alloc, typename... Args>
+    requires std::constructible_from</*state*/, Args...>
+  recv_state(std::allocator_arg_t, Alloc const& alloc, Args&&... args);
+
+  recv_state(recv_state&&) noexcept;
+  auto operator=(recv_state&&) noexcept -> recv_state&;
+
+  recv_state(recv_state const&) = delete;
+  auto operator=(recv_state const&) -> recv_state& = delete;
+};
+```
+
+`recv_state` owns the shared state used by a scheduled root task and its
+receiver. It contains the result storage, exception storage, ready flag, optional
+root stop source, and the embedded root-frame buffer.
+
+Most users can rely on the convenience `schedule` overload, which creates a
+non-stoppable state automatically:
+
+```cpp
+auto recv = lf::schedule(pool, root_task{});
+```
+
+Construct `recv_state` yourself when you need stoppability, allocator-aware
+state allocation, or a non-default initial result value:
+
+```cpp
+lf::recv_state<int, true> state{0};
+auto recv = lf::schedule(pool, std::move(state), root_task{});
+```
+
+`recv_state` is move-only and is consumed by `schedule`.
+
+## `receiver`
+
+```cpp
+template <typename T, bool Stoppable = false>
+class receiver {
+ public:
+  receiver(receiver&&) noexcept;
+  auto operator=(receiver&&) noexcept -> receiver&;
+
+  receiver(receiver const&) = delete;
+  auto operator=(receiver const&) -> receiver& = delete;
+
+  auto valid() const noexcept -> bool;
+  auto ready() const -> bool;
+  void wait() const;
+
+  auto stop_source() -> lf::stop_source& requires Stoppable;
+
+  auto get() && -> T;
+};
+```
+
+`receiver<T, Stoppable>` is a move-only handle to scheduled root-task
+completion.
+
+### `valid`
+
+```cpp
+auto valid() const noexcept -> bool;
+```
+
+Returns whether this receiver still refers to shared state. A receiver becomes
+invalid after `std::move(receiver).get()`.
+
+### `ready`
+
+```cpp
+auto ready() const -> bool;
+```
+
+Returns whether the root task has completed, either with a value, with an
+exception, or through cancellation. Throws `broken_receiver_error` if the
+receiver is invalid.
+
+### `wait`
+
+```cpp
+void wait() const;
+```
+
+Blocks until the root task completes. `wait` may be called multiple times.
+Throws `broken_receiver_error` if the receiver is invalid.
+
+### `stop_source`
+
+```cpp
+auto stop_source() -> lf::stop_source& requires Stoppable;
+```
+
+Returns the root stop source for stoppable receivers. Requesting stop prevents
+not-yet-started cancellable work from running and causes cancellation-aware join
+paths to stop resuming cancelled subtrees.
+
+```cpp
+lf::recv_state<void, true> state;
+auto recv = lf::schedule(pool, std::move(state), root_task{});
+recv.stop_source().request_stop();
+```
+
+### `get`
+
+```cpp
+auto get() && -> T;
+```
+
+Waits for completion, consumes the receiver state, and returns the result. For
+`T = void`, it returns nothing.
+
+If the task completed with an exception, `get` rethrows it. If `Stoppable` is
+`true` and a stop has been requested on the receiver's stop source, `get`
+throws `operation_cancelled_error`.
+
+!!! warning
+    `get` is rvalue-qualified and may only be called once:
+    ```cpp
+    auto value = std::move(recv).get();
+    ```
+
+## `broken_receiver_error`
+
+```cpp
+struct broken_receiver_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown by `ready`, `wait`, or `stop_source` when called on an invalid receiver.
+
+## `operation_cancelled_error`
+
+```cpp
+struct operation_cancelled_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown by `std::move(receiver).get()` for a stoppable receiver on whose root
+stop source a stop has been requested.
diff --git a/docs/api/core/scheduling.md b/docs/api/core/scheduling.md
new file mode 100644
index 000000000..ed5fe753f
--- /dev/null
+++ b/docs/api/core/scheduling.md
@@ -0,0 +1,135 @@
+---
+icon: lucide/calendar-clock
+---
+
+# Scheduling
+
+Scheduling bridges outside code and libfork tasks. `schedule` creates a root
+task and submits it to a scheduler; `execute` resumes a task handle on a worker
+context.
+
+## `schedule`
+
+```cpp
+template <scheduler Sch, typename R, bool Stoppable, typename Fn, typename... Args>
+  requires async_invocable_to<std::decay_t<Fn>, R, context_t<Sch>, std::decay_t<Args>...>
+[[nodiscard]]
+constexpr auto schedule(
+    Sch&& sch,
+    recv_state<R, Stoppable> state,
+    Fn&& fn,
+    Args&&... args) -> receiver<R, Stoppable>;
+```
+
+Schedules a root task using caller-provided receiver state. `Fn` and `Args...`
+are decayed into the root coroutine frame. The returned
+[`receiver`](receiver.md#receiver) observes completion, exceptions, and the
+result.
+
+Use this overload when you need a custom allocator for receiver state, a
+non-default initial result value, or a stoppable root task:
+
+```cpp
+lf::recv_state<int, true> state;
+auto recv = lf::schedule(pool, std::move(state), root_task{}, 42);
+
+recv.stop_source().request_stop();
+int result = std::move(recv).get();
+```
+
+!!! warning
+    `schedule` must not be called from inside a worker thread that is already
+    executing the same context type. Doing so throws `schedule_error`.
+
+### Convenience overload
+
+```cpp
+template <scheduler Sch, typename Fn, typename... Args>
+  requires /* async invocable with default-schedulable result */
+[[nodiscard]]
+constexpr auto schedule(Sch&& sch, Fn&& fn, Args&&... args)
+    -> receiver<async_result_t<std::decay_t<Fn>, context_t<Sch>, std::decay_t<Args>...>>;
+```
+
+This overload creates a non-stoppable `recv_state` with the default allocator.
+The task result must be `void` or default-initializable and movable.
+
+```cpp
+auto recv = lf::schedule(pool, root_task{}, 42);
+auto value = std::move(recv).get();
+```
+
+## `execute`
+
+```cpp
+template <worker_context Context>
+constexpr void execute(Context& context, sched_handle<Context> handle);
+
+template <worker_context Context>
+constexpr void execute(Context& context, steal_handle<Context> handle);
+```
+
+Binds the current thread to `context`, resumes the task represented by
+`handle`, and unbinds the thread before returning.
+
+Scheduler implementations call `execute` after taking a handle from their work
+source:
+
+```cpp
+if (auto h = queue.pop()) {
+  lf::execute(context, h);
+}
+```
+
+The `sched_handle` overload resumes root tasks and tasks suspended by custom
+awaitables. The `steal_handle` overload resumes a stolen continuation and marks
+the frame as stolen before execution.
+
+!!! warning
+    `execute` must not be called recursively on a thread already bound to a
+    context of the same type. Doing so throws `execute_error`.
+
+## `schedule_error`
+
+```cpp
+struct schedule_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown when `schedule` is called from inside a worker thread for the same
+context type.
+
+## `execute_error`
+
+```cpp
+struct execute_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown when `execute` is called while the current thread is already executing a
+task for the same context type.
+
+## `steal_overflow_error`
+
+```cpp
+struct steal_overflow_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown if a single task is stolen enough times to overflow libfork's internal
+steal counter.
+
+## `root_alloc_error`
+
+```cpp
+struct root_alloc_error final : libfork_exception {
+  auto what() const noexcept -> const char* override;
+};
+```
+
+Thrown when the root coroutine frame does not fit into the buffer embedded in
+the receiver state. This usually means the scheduled callable or its arguments
+are too large to store directly in the root frame.
diff --git a/docs/api/core/scope.md b/docs/api/core/scope.md
new file mode 100644
index 000000000..2f1cc6a78
--- /dev/null
+++ b/docs/api/core/scope.md
@@ -0,0 +1,166 @@
+---
+icon: lucide/git-branch
+---
+
+# Scopes
+
+```cpp
+[[nodiscard]]
+constexpr auto scope() noexcept;
+
+[[nodiscard]]
+constexpr auto child_scope() noexcept;
+```
+
+Scopes are acquired inside a libfork task:
+
+```cpp
+auto sc = co_await lf::scope();
+```
+
+The returned scope object is the public way to create children and join them.
+The exact scope type is intentionally unnamed, but its member functions are part
+of the API.
+
+## `scope`
+
+`scope()` creates a normal child-launching scope. It inherits the current task's
+stop token.
+
+```cpp
+auto sc = co_await lf::scope();
+```
+
+The scope object is immovable and should be used locally. It exposes `fork`,
+`fork_drop`, `call`, `call_drop`, and `join`.
+
+## `child_scope`
+
+`child_scope()` creates a normal scope plus a new embedded
+[`stop_source`](cancellation.md#stop_source). Children launched from the scope
+receive that stop source's token, chained to the parent's token.
+
+```cpp
+auto sc = co_await lf::child_scope();
+sc.request_stop();
+co_await sc.join();
+```
+
+Because the returned object derives from `stop_source`, it also exposes:
+
+```cpp
+auto token() const noexcept -> stop_source::stop_token;
+auto stop_requested() const noexcept -> bool;
+auto request_stop() noexcept -> void;
+auto race_request_stop() noexcept -> bool;
+```
+
+## `fork`
+
+```cpp
+template <typename R, typename... Args, async_invocable_to<R, Context, Args...> Fn>
+auto fork(R* ret, Fn&& fn, Args&&... args) noexcept;
+
+template <typename... Args, async_invocable_to<void, Context, Args...> Fn>
+auto fork(Fn&& fn, Args&&... args) noexcept;
+```
+
+`fork` creates a child task and makes the parent continuation stealable. The
+child starts running immediately on the current worker. The parent continues
+later, either on this worker or on a worker that stole the continuation.
+
+Use the pointer overload to receive a non-void result:
+
+```cpp
+template <lf::worker_context Context>
+auto parent(lf::env<Context>) -> lf::task<int, Context> {
+  int value = 0;
+  auto sc = co_await lf::scope();
+
+  co_await sc.fork(&value, child{});
+  co_await sc.join();
+
+  co_return value;
+}
+```
+
+The pointed-to object must remain alive until `join` completes. That is
+normally achieved by using a local variable in the parent task.
+
+## `fork_drop`
+
+```cpp
+template <typename... Args, async_invocable<Context, Args...> Fn>
+auto fork_drop(Fn&& fn, Args&&... args) noexcept;
+```
+
+`fork_drop` launches a child and discards its result. It accepts both `void` and
+non-void child tasks.
+
+```cpp
+co_await sc.fork_drop(write_log{}, item);
+```
+
+## `call`
+
+```cpp
+template <typename R, typename... Args, async_invocable_to<R, Context, Args...> Fn>
+auto call(R* ret, Fn&& fn, Args&&... args) noexcept;
+
+template <typename... Args, async_invocable_to<void, Context, Args...> Fn>
+auto call(Fn&& fn, Args&&... args) noexcept;
+```
+
+`call` invokes a child task as a direct child but does not make the parent
+continuation stealable. It behaves like an async function call in the fork-join
+tree.
+
+Use it when the parent cannot profitably continue in parallel with the child:
+
+```cpp
+int value = 0;
+co_await sc.call(&value, child{}, input);
+```
+
+## `call_drop`
+
+```cpp
+template <typename... Args, async_invocable<Context, Args...> Fn>
+auto call_drop(Fn&& fn, Args&&... args) noexcept;
+```
+
+`call_drop` is the direct-call equivalent of `fork_drop`.
+
+## `join`
+
+```cpp
+auto join() noexcept;
+```
+
+`join` waits for all forked children in the current scope to complete. A task
+that forks children must join before returning.
+
+```cpp
+co_await sc.join();
+```
+
+If no child continuation was stolen, `join` is a fast local operation. If one or
+more continuations were stolen, `join` participates in the join race and may
+suspend the current coroutine until the last child completes.
+
+## Exceptions
+
+Exceptions thrown by children are stashed in the parent and rethrown from
+`join`. If several children throw, libfork preserves one exception.
+
+If a task is already cancelled when a child would start, the child frame is
+destroyed without running. Exceptions already recorded in a cancelled subtree
+may be dropped while cancellation unwinds that subtree.
+
+## Choosing `fork` or `call`
+
+Use `fork` when the parent has useful work that can run in parallel with the
+child or when multiple independent children should race toward a later `join`.
+
+Use `call` when the parent needs the result immediately and exposing the parent
+continuation as stealable work would only add scheduler traffic.
diff --git a/docs/api/core/task.md b/docs/api/core/task.md
new file mode 100644
index 000000000..2e5e5e90d
--- /dev/null
+++ b/docs/api/core/task.md
@@ -0,0 +1,45 @@
+---
+icon: lucide/box
+---
+
+# Task
+
+```cpp
+template <returnable T, worker_context Context>
+class task {
+ public:
+  using value_type = T;
+  using context_type = Context;
+};
+```
+
+See also:
+
+- [returnable](./concepts.md#returnable)
+- [worker_context](./concepts.md#worker-contexts)
+
+The return type for all coroutines/async-functions in libfork. This type exists
+so that users can mark their functions as coroutines. Other than its typedefs
+it has no public interface.
+
+!!! warning
+    No consumer of this library should ever touch an instance of this type;
+    it is used only for specifying the return type of a coroutine.
+
+!!! example
+    With a simple type alias, coroutines can be written more compactly:
+
+    ```cpp
+    template <lf::returnable T>
+    using task = lf::task<T, some_worker_context>;
+    ```
+
+    Now you can write coroutines like this:
+
+    ```cpp
+    auto my_coroutine() -> task<int> {
+      co_return 42;
+    }
+    ```
+
+    See [env](env.md) for writing context-generic coroutines.
diff --git a/docs/api/index.md b/docs/api/index.md
new file mode 100644
index 000000000..9c6dc013c
--- /dev/null
+++ b/docs/api/index.md
@@ -0,0 +1,27 @@
+---
+icon: lucide/book-open
+---
+
+# API reference
+
+All public symbols documented here live in namespace `lf` and are reachable via:
+
+```cpp
+import libfork;
+```
+
+The meta-module re-exports the following modules:
+
+- `libfork.core`
+- `libfork.batteries`
+- `libfork.schedulers`
+- `libfork.algorithm`
+
+Each of these modules is documented in its own section:
+
+- [Core](core/index.md): tasks, scopes, scheduling, receivers, cancellation, and contexts.
+- [Batteries](batteries.md): worker stacks, deque, context policies, and context implementations.
+- [Schedulers](schedulers.md): inline and busy-pool schedulers.
+- [Algorithm](algorithm.md): fork-join algorithms over random-access ranges.
+
+`libfork.utils` is a support module and is not documented as user-facing API.
diff --git a/docs/api/schedulers.md b/docs/api/schedulers.md
new file mode 100644
index 000000000..bfc098844
--- /dev/null
+++ b/docs/api/schedulers.md
@@ -0,0 +1,111 @@
+---
+icon: lucide/network
+---
+
+# Schedulers
+
+Schedulers map libfork task handles to worker contexts and OS threads. User
+task code is independent of the scheduler, as long as the scheduler satisfies
+the `scheduler` concept.
+
+## Inline scheduler
+
+### `concept derived_worker_context<Context>`
+
+`Context` has a `context_type` alias and derives from that context type. This is
+the shape required by `inline_scheduler`.
+
+### `inline_scheduler<Context>`
+
+Single-threaded scheduler that owns one context. `post` immediately calls
+`execute` on the calling thread.
+
+Constructor forms:
+
+- `inline_scheduler()`
+- `explicit inline_scheduler(Args&&...)`, forwarded to `Context`
+
+Public API:
+
+- `using context_type = Context::context_type`
+- `post(sched_handle<context_type>) -> void`
+
+Inline schedulers are useful for tests, debugging, and measuring stack/context
+overhead without worker-thread scheduling.
+
+### `mono_inline_scheduler<Stack, Deque>`
+
+Alias for:
+
+```cpp
+lf::inline_scheduler<lf::mono_context<Stack, Deque>>
+```
+
+### `poly_inline_scheduler<Stack, Deque>`
+
+Alias for:
+
+```cpp
+lf::inline_scheduler<lf::derived_poly_context<Stack, Deque>>
+```
+
+## Busy pool
+
+### `enum class pool_kind`
+
+Selects the context implementation used by `basic_busy_pool`.
+
+- `pool_kind::mono`
+- `pool_kind::poly`
+
+### `basic_busy_pool<Kind, Stack, Deque = adapt_deque<>, Alloc = std::allocator<std::byte>>`
+
+Busy-waiting work-stealing thread pool. It creates `n` `std::jthread` workers,
+posts root tasks into a shared queue, and lets idle workers steal from other
+worker contexts.
+
+Constructor:
+
+```cpp
+explicit basic_busy_pool(
+    std::size_t n = std::thread::hardware_concurrency(),
+    Alloc const& alloc = Alloc());
+```
+
+Public API:
+
+- `using context_type`
+- `post(sched_handle<context_type>) -> void`
+
+The pool is non-copyable and non-movable. Destruction requests stop on all
+workers and joins them through `std::jthread`.
+
+!!! note
+
+    `basic_busy_pool` currently busy-waits for work. It is useful for
+    benchmarking and development scenarios where low latency matters more than
+    idle power.
+
+### `mono_busy_pool<Stack, Deque = adapt_deque<>, Alloc = std::allocator<std::byte>>`
+
+Alias for:
+
+```cpp
+lf::basic_busy_pool<lf::pool_kind::mono, Stack, Deque, Alloc>
+```
+
+This is the usual pool shape for concrete contexts:
+
+```cpp
+lf::mono_busy_pool<lf::geometric_stack<>> pool{4};
+```
+
+### `poly_busy_pool<Stack, Deque = adapt_deque<>, Alloc = std::allocator<std::byte>>`
+
+Alias for:
+
+```cpp
+lf::basic_busy_pool<lf::pool_kind::poly, Stack, Deque, Alloc>
+```
+
+Use the polymorphic variant when scheduler/context code needs the
+`poly_context<Stack>` abstraction.
diff --git a/docs/benchmarks/benchmarks/fib.md b/docs/benchmarks/benchmarks/fib.md
new file mode 100644
index 000000000..5d75b0654
--- /dev/null
+++ b/docs/benchmarks/benchmarks/fib.md
@@ -0,0 +1,60 @@
+---
+icon: lucide/git-fork
+---
+
+# Fibonacci
+
+The Fibonacci benchmark computes `fib(n)` with the deliberately inefficient
+binary recursion. It is a tasking microbenchmark: each internal node creates two
+subproblems, does almost no arithmetic, and joins the results. The serial result
+is checked against an iterative reference implementation.
+
+Source:
+
+- [shared input and reference](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/fib.hpp)
+- [serial variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/fib.cpp)
+- [libfork variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/libfork/fib.cpp)
+- [OpenMP variant](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/openmp/fib.cpp)
+- [bare-metal variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/baremetal/fib.cpp)
+
+## What It Measures
+
+`test` uses `n = 8`; `base` uses `n = 37`. The useful arithmetic per recursive
+node is tiny, so elapsed time is mostly frame creation, coroutine/task
+scheduling, deque or stack traffic, and join overhead.
+
+The benchmark includes:
+
+- serial recursion that writes into an out-parameter;
+- serial recursion that returns the value directly;
+- libfork inline schedulers over vector or deque adaptors and multiple stack
+  types;
+- libfork busy pools using mono and type-erased schedulers;
+- OpenMP `task`/`taskwait`;
+- bare-metal coroutine and deque-overhead probes.
+
+## Scaling
+
+The work grows exponentially with `n`, while the span is linear in `n`, so the
+algorithm exposes huge theoretical parallelism at the base size. In practice,
+tasks are so fine-grained that scaling is limited by scheduler throughput,
+worker wake-up/steal costs, cache traffic on queues or stacks, and join
+coordination.
+
+Good scaling means the runtime can process many small strict fork-join tasks per
+second. Poor scaling does not say much about Fibonacci itself; it usually means
+the per-task overhead is larger than the useful work.
+
+## Bottlenecks And Granularity
+
+The benchmark is compute-light and allocation/scheduling-heavy. Memory pressure
+comes from coroutine frames, recursion frames, and per-worker stack/deque
+storage, not from input data. Granularity is intentionally too small for a real
+parallel Fibonacci implementation. A production algorithm would stop spawning
+below a cutoff and compute the remaining subtree serially.
+
+## References
+
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
+- [Cilk publications and tasking references](https://cilk.mit.edu/publications/)
+- [OpenMP 5.2 task construct](https://www.openmp.org/spec-html/5.2/openmpse73.html)
diff --git a/docs/benchmarks/benchmarks/fold.md b/docs/benchmarks/benchmarks/fold.md
new file mode 100644
index 000000000..8174d426d
--- /dev/null
+++ b/docs/benchmarks/benchmarks/fold.md
@@ -0,0 +1,58 @@
+---
+icon: lucide/sigma
+---
+
+# Fold
+
+The fold benchmark reduces a range with addition and verifies the result against
+a closed-form sum. It exercises libfork's `fold` algorithm across input
+representation, chunk size, projection mode, element type, and scheduler
+variants.
+
+Source:
+
+- [shared fold helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/fold.hpp)
+- [serial `std::reduce` variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/fold.cpp)
+- [libfork variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/libfork/fold.cpp)
+- [fold API docs](../../api/algorithm.md#fold)
+
+## What It Measures
+
+Inputs are either memory-backed vectors or lazy iota/transform ranges. Values
+are `int32` or `float32`, accumulated into a wider integer or double type where
+appropriate. Libfork variants compare:
+
+- fixed chunks of 4096 elements;
+- a single-element explicit chunk;
+- the algorithm's deduced chunking path;
+- synchronous and asynchronous projections;
+- mono and type-erased busy-pool schedulers.
+
+## Scaling
+
+For memory-backed ranges, large reductions should scale until memory bandwidth,
+cache hierarchy, or reduction-tree overhead dominates. Lazy ranges remove input
+loads and emphasize iterator/projection arithmetic plus scheduling overhead.
+
+Single-element chunks are expected to scale poorly except as an overhead stress
+test. Fixed chunks provide enough arithmetic per task to amortize scheduling at
+large sizes. Async projections add coroutine overhead and are useful for
+checking whether the algorithm handles async work correctly, not for raw
+throughput.
+
+## Bottlenecks And Granularity
+
+Memory-backed `1024^3` inputs can require several GiB of storage and should be
+treated as machine-size-sensitive. Lazy inputs avoid that allocation but still
+perform one projected addition per element. Floating-point reductions may differ
+in order from serial code, so the benchmark checks the result against a
+tolerance rather than for exact bit identity.
+
+Granularity should be large enough that each task streams many cache lines. Too
+small a chunk turns the benchmark into a scheduler test; too large a chunk
+leaves workers idle near the end of the reduction tree.
+
+## References
+
+- [C++ `std::reduce`](https://en.cppreference.com/w/cpp/algorithm/reduce)
+- [libfork `fold` API](../../api/algorithm.md#fold)
+- [Google Benchmark counters and complexity](https://github.com/google/benchmark/blob/main/docs/user_guide.md)
diff --git a/docs/benchmarks/benchmarks/heat.md b/docs/benchmarks/benchmarks/heat.md
new file mode 100644
index 000000000..0f140fbdf
--- /dev/null
+++ b/docs/benchmarks/benchmarks/heat.md
@@ -0,0 +1,44 @@
+---
+icon: lucide/flame
+---
+
+# Heat
+
+The heat benchmark runs a fixed number of Jacobi sweeps over a square grid.
+Interior cells are updated from their four direct neighbors, while boundary
+cells remain clamped. The final grid is compared against a reference run.
+
+Source:
+
+- [shared heat setup](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/heat.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/heat.cpp)
+
+## What It Measures
+
+`test` uses a `64 x 64` grid; `base` uses `1024 x 1024`. Each benchmark run
+performs 16 sweeps using two grids and swaps the active buffers between sweeps.
+The initial condition is a deterministic analytic profile.
+
+## Scaling
+
+A parallel version should scale by splitting rows, tiles, or bands among
+workers. Each sweep has a global phase boundary because the next sweep depends
+on the previous sweep's full output. Good scaling is expected within a sweep
+until memory bandwidth dominates, but the per-sweep synchronization point limits
+the benefit of very small grids.
+
+## Bottlenecks And Granularity
+
+This is a memory-bandwidth benchmark. Each interior update reads four doubles
+and writes one double, with little arithmetic per byte. Cache blocking can help
+larger grids by improving locality, but the simple serial baseline streams rows.
+
+Granularity should be coarse enough that each task owns many contiguous rows or
+tiles. Per-cell tasks would be dominated by scheduling overhead. Boundary
+handling is small and should not be parallelized separately.
+
+## References
+
+- [Jacobi method overview](https://en.wikipedia.org/wiki/Jacobi_method)
+- [Stencil computation overview](https://en.wikipedia.org/wiki/Stencil_code)
+- [OpenMP loop scheduling background](https://www.openmp.org/spec-html/5.2/openmpse66.html)
diff --git a/docs/benchmarks/benchmarks/index.md b/docs/benchmarks/benchmarks/index.md
new file mode 100644
index 000000000..1d762e4c5
--- /dev/null
+++ b/docs/benchmarks/benchmarks/index.md
@@ -0,0 +1,183 @@
+---
+icon: lucide/list
+---
+
+# Benchmarks
+
+This section documents each benchmark family and the common registration
+patterns used by the benchmark sources.
+
+The benchmark suite is built on Google Benchmark. Shared family definitions live
+in `benchmark/lib/`, while implementation variants live under
+`benchmark/src/<implementation>/`.
+
+## Shared Benchmark Loop
+
+The common loop is `lf_bench::bench` in `benchmark/lib/bench.hpp`. It owns the
+central Google Benchmark iteration loop, result checking, error reporting, and
+`DoNotOptimize` call.
+
+Use the overload without a thread count for serial or single-worker variants:
+
+```cpp
+lf_bench::bench(state, expected, [] {
+  return run_work();
+});
+```
+
+Use the overload with a thread count for benchmark registrations whose argument
+list includes `p`:
+
+```cpp
+lf_bench::bench(state, threads, expected, [] {
+  return run_parallel_work();
+});
+```
+
+The callable is always the last argument. Pass callables by value; benchmark
+wrappers copy lambdas rather than forwarding them.
+
+If equality is not the right correctness check, pass a custom predicate before
+the workload callable:
+
+```cpp
+lf_bench::bench(state, expected, result_is_close, [] {
+  return run_work();
+});
+```
+
+For threaded benchmarks, `bench` also reports:
+
+- `state.counters["p"]`, the thread count.
+- `SetComplexityN(p)`, used with the inverse-complexity reporter configured by
+  the multi-threaded registration macros.
+
+## Family Wrappers
+
+Each benchmark family should hide input construction, expected-result
+calculation, and family-specific counters in a helper in `benchmark/lib/`.
+
+For example, `fib.hpp` exposes `run_fib`:
+
+```cpp
+template <typename Fn>
+void run_fib(benchmark::State &state, std::int64_t threads, Fn fn);
+
+template <typename Fn>
+void run_fib(benchmark::State &state, Fn fn);
+```
+
+The implementation-specific source then supplies only the work:
+
+```cpp
+void fib_run(benchmark::State &state) {
+  run_fib(state, [](std::int64_t n) {
+    return fib_impl(n);
+  });
+}
+```
+
+Threaded variants put `threads` before the callable so the workload remains the
+last argument:
+
+```cpp
+void fib_run(benchmark::State &state) {
+  auto threads = state.range(1);
+  run_fib(state, threads, [threads](std::int64_t n) {
+    return fib_impl(n, threads);
+  });
+}
+```
+
+The same pattern is used by:
+
+- `run_fib` for Fibonacci.
+- `run_fold_input` for fold input construction and item reporting.
+- `run_heat` for heat-grid construction and convergence checking.
+- `run_integrate` for integration bounds and tolerance checking.
+- `run_knapsack` for problem generation and optimum checking.
+- `run_mandelbrot` for image-size counters and checksum checking.
+- `run_matmul` for matrix input construction and relative-error checking.
+- `run_nqueens` for board allocation and known solution counts.
+- `run_primes` for prime-count reference values.
+- `run_quicksort` for input generation and sorted-order checking.
+- `run_scan` for scan input construction and tail checking.
+- `run_skynet` for depth, leaf-count, and expected sum setup.
+- `run_uts` for UTS tree setup and root construction.
+
+Benchmarks that do not need a family wrapper can still call `lf_bench::bench`
+directly once they have set their counters and built their expected result.
+
+## Registration Macros
+
+Registration macros live in `benchmark/lib/macros.hpp`. They wrap
+`benchmark::RegisterBenchmark`, build the benchmark name, and attach the
+appropriate argument sets.
+
+Standard single-argument benchmarks use:
+
+```cpp
+BENCH_ONE(bench_fn, category, name, mode, prefix, ...);
+BENCH_ALL(bench_fn, category, name, prefix, ...);
+```
+
+`BENCH_ONE` registers one size. The size comes from `prefix##_##mode`; for
+example, `BENCH_ONE(fib_run, serial, fib, test, fib)` uses `fib_test`.
+
+`BENCH_ALL` registers the `test` and `base` sizes:
+
+```cpp
+BENCH_ALL(fib_run, serial, fib, fib);
+```
+
+Multi-threaded standard benchmarks use:
+
+```cpp
+BENCH_ONE_MT(bench_fn, category, name, mode, prefix, ...);
+BENCH_ALL_MT(bench_fn, category, name, prefix, ...);
+```
+
+These register argument pairs `{size, p}`. The size still comes from
+`prefix##_##mode`; `p` is generated from hardware-supported thread counts:
+`1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96`, stopping once the value exceeds
+`std::thread::hardware_concurrency()`.
+
+UTS (Unbalanced Tree Search) has separate macros because its first benchmark
+argument is the thread count and the tree selection is captured in the
+registration lambda:
+
+```cpp
+UTS_BENCH_ONE(bench_fn, category, mode, tree_name, tree_id, ...);
+UTS_BENCH_ALL(bench_fn, category, ...);
+UTS_BENCH_ONE_MT(bench_fn, category, mode, tree_name, tree_id, ...);
+UTS_BENCH_ALL_MT(bench_fn, category, ...);
+```
+
+`UTS_BENCH_ALL` registers the mini, base, and large tree presets. The
+multi-threaded form registers the same tree presets for each supported thread
+count.
+
+## Names And Template Arguments
+
+Benchmark names are formatted as:
+
+```text
+<mode>/<category>/<name>[/template-or-argument-tags]
+```
+
+The variadic macro arguments serve both as template arguments for the benchmark
+function and as a readable suffix in the benchmark name. Spaces are stripped from
+the formatted name, so template-heavy libfork scheduler names remain usable in
+Google Benchmark filters.
+
+For example:
+
+```cpp
+BENCH_ALL(run, libfork, fib, fib, mono_busy_pool);
+```
+
+registers `run<mono_busy_pool>` and gives the benchmark a suffix containing
+`mono_busy_pool`.
+
+Family-specific macros can layer on top of the standard macros. Fold uses
+`LF_FOLD_BENCH_SIZES` and `LF_FOLD_BENCH_SIZES_MT` to register its chosen input
+sizes while still delegating to `BENCH_ONE` and `BENCH_ONE_MT`.
diff --git a/docs/benchmarks/benchmarks/integrate.md b/docs/benchmarks/benchmarks/integrate.md
new file mode 100644
index 000000000..e2b6d96dd
--- /dev/null
+++ b/docs/benchmarks/benchmarks/integrate.md
@@ -0,0 +1,43 @@
+---
+icon: lucide/chart-area
+---
+
+# Integrate
+
+The integrate benchmark computes an adaptive trapezoidal integral of
+`(x * x + 1) * x` over `[0, n]`. The recursion subdivides until the split
+trapezoids agree with the parent area within a fixed epsilon. The result is
+checked against the exact polynomial integral.
+
+Source:
+
+- [shared integration helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/integrate.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/integrate.cpp)
+
+## What It Measures
+
+`test` uses `n = 100`; `base` uses `n = 10000`. The benchmark stresses recursive
+adaptive control flow rather than dense numeric kernels. Work is concentrated in
+intervals where the trapezoid approximation needs further subdivision.
+
+## Scaling
+
+A parallel version can fork the two subintervals whenever an interval fails the
+error test. Scaling depends on how quickly the recursion exposes enough
+independent intervals. Smooth functions with balanced subdivision are easier to
+parallelize than functions where one side of the domain keeps subdividing much
+more deeply than the other.
+
+## Bottlenecks And Granularity
+
+The computation is mostly floating-point arithmetic and branch-heavy recursion.
+There is almost no shared memory traffic. Task granularity should use a cutoff
+or depth threshold because near the leaves each interval performs only a few
+floating-point operations. Without a cutoff, scheduling can cost more than the
+quadrature step.
+
+## References
+
+- [Adaptive quadrature overview](https://en.wikipedia.org/wiki/Adaptive_quadrature)
+- [Trapezoidal rule overview](https://en.wikipedia.org/wiki/Trapezoidal_rule)
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
diff --git a/docs/benchmarks/benchmarks/knapsack.md b/docs/benchmarks/benchmarks/knapsack.md
new file mode 100644
index 000000000..f46331497
--- /dev/null
+++ b/docs/benchmarks/benchmarks/knapsack.md
@@ -0,0 +1,42 @@
+---
+icon: lucide/package
+---
+
+# Knapsack
+
+The knapsack benchmark solves a deterministic 0/1 knapsack instance exactly
+with branch and bound. Items are sorted by value density, and a fractional
+relaxation bound prunes subtrees that cannot beat the current best value.
+
+Source:
+
+- [shared knapsack setup](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/knapsack.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/knapsack.cpp)
+
+## What It Measures
+
+`test` uses 16 items; `base` uses 28 items. The item weights and values are
+generated from a fixed random seed. The benchmark verifies the branch-and-bound
+answer against a dynamic-programming optimum.
+
+## Scaling
+
+Parallel branch and bound can expose large amounts of independent search, but
+speedup depends on pruning order and on how quickly workers see improved
+incumbent values. If the best value is shared globally, synchronization on that
+incumbent can become visible. If it is not shared aggressively, workers may
+waste time exploring subtrees that a newer bound would prune.
+
+## Bottlenecks And Granularity
+
+The serial benchmark is branch-heavy and cache-light. Most time goes into
+recursive search and repeated bound calculations over the remaining items.
+Granularity is irregular: high-level branches are valuable tasks, while leaf
+subtrees are too small to schedule individually. A parallel implementation
+should stop forking below a depth or estimated-subtree cutoff.
+
+## References
+
+- [Knapsack problem overview](https://en.wikipedia.org/wiki/Knapsack_problem)
+- [Branch and bound overview](https://en.wikipedia.org/wiki/Branch_and_bound)
+- [OR-Tools knapsack solver reference implementation](https://developers.google.com/optimization/pack/knapsack)
diff --git a/docs/benchmarks/benchmarks/mandelbrot.md b/docs/benchmarks/benchmarks/mandelbrot.md
new file mode 100644
index 000000000..b8161526c
--- /dev/null
+++ b/docs/benchmarks/benchmarks/mandelbrot.md
@@ -0,0 +1,43 @@
+---
+icon: lucide/aperture
+---
+
+# Mandelbrot
+
+The Mandelbrot benchmark computes an escape-time checksum over an `n x n` grid
+covering the rectangle `[-2, 1] x [-1.5, 1.5]`. Each pixel iterates the standard
+quadratic recurrence up to 256 iterations.
+
+Source:
+
+- [shared Mandelbrot helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/mandelbrot.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/mandelbrot.cpp)
+
+## What It Measures
+
+`test` uses `n = 128`; `base` uses `n = 1024`. The output is not stored as an
+image. Instead, the benchmark sums all per-pixel iteration counts and checks the
+checksum against a reference computation.
+
+## Scaling
+
+Pixels are independent, so a parallel version should scale well with row, tile,
+or chunk partitioning. Load balance matters because pixels near the set boundary
+usually run more iterations than pixels that escape quickly. Static row
+partitioning can be uneven; tiles or dynamic chunks are usually better.
+
+## Bottlenecks And Granularity
+
+This benchmark is compute-bound for large grids. It has little memory traffic
+because the serial version only accumulates a checksum. Branch divergence is the
+main irregularity: different pixels exit after different iteration counts.
+
+Granularity should group many pixels per task. Per-pixel tasks are too small,
+while very large row blocks can leave workers idle if one block contains more
+boundary-heavy pixels.
+
+## References
+
+- [Mandelbrot set overview](https://en.wikipedia.org/wiki/Mandelbrot_set)
+- [Escape-time algorithm overview](https://en.wikipedia.org/wiki/Plotting_algorithms_for_the_Mandelbrot_set)
+- [OpenMP loop scheduling background](https://www.openmp.org/spec-html/5.2/openmpse66.html)
diff --git a/docs/benchmarks/benchmarks/matmul.md b/docs/benchmarks/benchmarks/matmul.md
new file mode 100644
index 000000000..464ee9fc4
--- /dev/null
+++ b/docs/benchmarks/benchmarks/matmul.md
@@ -0,0 +1,48 @@
+---
+icon: lucide/grid-3x3
+---
+
+# Matrix Multiply
+
+The matrix multiply benchmark computes `C = A * B` for square `float` matrices.
+The serial benchmark uses a recursive divide-and-conquer implementation with
+eight half-size multiplications and a conventional cubic base case.
+
+Source:
+
+- [shared matrix helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/matmul.hpp)
+- [serial matrix multiply](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/matmul.cpp)
+
+## What It Measures
+
+`test` uses `64 x 64`; `base` uses `1024 x 1024`. Inputs are deterministic
+random matrices. A straightforward iterative multiply computes a reference
+matrix, and the benchmark checks the maximum relative error after the recursive
+multiply.
+
+The recursion stops at a `32 x 32` base case.
+
+## Scaling
+
+Matrix multiply has high arithmetic intensity at large sizes, so a tuned
+parallel implementation can scale well until cache, memory bandwidth, or core
+floating-point throughput saturates. This benchmark is not a tuned BLAS kernel;
+it is useful as a structured divide-and-conquer workload.
+
+Parallelizing the eight recursive products exposes regular fork-join work.
+However, base-case size matters: too small a base case increases task overhead
+and loses cache efficiency; too large a base case underuses workers near the top
+of the recursion.
+
+## Bottlenecks And Granularity
+
+The naive `i, k, j` base multiply and row-major layout drive cache behavior.
+The recursive split improves structure but does not implement packing,
+vectorized microkernels, or cache-tuned blocking. Memory pressure is dominated
+by the input, output, and reference matrices.
+
+## References
+
+- [BLAS reference implementation](https://www.netlib.org/blas/)
+- [Matrix multiplication overview](https://en.wikipedia.org/wiki/Matrix_multiplication_algorithm)
+- [GotoBLAS paper](https://www.cs.utexas.edu/users/flame/pubs/GotoTOMS_revision.pdf)
diff --git a/docs/benchmarks/benchmarks/mergesort.md b/docs/benchmarks/benchmarks/mergesort.md
new file mode 100644
index 000000000..0d1b0fe42
--- /dev/null
+++ b/docs/benchmarks/benchmarks/mergesort.md
@@ -0,0 +1,5 @@
+---
+icon: lucide/merge
+---
+
+# Merge Sort
diff --git a/docs/benchmarks/benchmarks/nqueens.md b/docs/benchmarks/benchmarks/nqueens.md
new file mode 100644
index 000000000..0cc2206f7
--- /dev/null
+++ b/docs/benchmarks/benchmarks/nqueens.md
@@ -0,0 +1,45 @@
+---
+icon: lucide/crown
+---
+
+# N-Queens
+
+The N-Queens benchmark counts all valid placements of `n` queens on an `n x n`
+board. It uses recursive backtracking and checks the result against known
+solution counts.
+
+Source:
+
+- [shared N-Queens helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/nqueens.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/nqueens.cpp)
+
+## What It Measures
+
+`test` uses `n = 8`; `base` uses `n = 14`. The board is represented as one
+column choice per row. At each level, the benchmark tries every column and calls
+`queens_ok` over the current prefix.
+
+## Scaling
+
+Backtracking search can scale well when high-level row choices are distributed
+across workers. The search tree is irregular because many partial boards are
+pruned early while others continue deeply. Static splitting by the first row is
+often not enough for large worker counts; deeper dynamic work sharing improves
+balance.
+
+## Bottlenecks And Granularity
+
+The benchmark is branch-heavy and uses a small working set. It stresses
+recursive control flow and pruning rather than memory bandwidth. The simple
+validity check scans queen pairs in the prefix, so per-node cost grows with
+depth.
+
+Parallel granularity should fork high in the tree and switch to serial
+backtracking below a cutoff. Creating a task for every candidate placement would
+be much more expensive than the placement check itself.
+
+## References
+
+- [N-Queens problem overview](https://en.wikipedia.org/wiki/Eight_queens_puzzle)
+- [Known N-Queens solution counts](https://oeis.org/A000170)
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
diff --git a/docs/benchmarks/benchmarks/primes.md b/docs/benchmarks/benchmarks/primes.md
new file mode 100644
index 000000000..ebf03465a
--- /dev/null
+++ b/docs/benchmarks/benchmarks/primes.md
@@ -0,0 +1,42 @@
+---
+icon: lucide/hash
+---
+
+# Primes
+
+The primes benchmark counts primes below a limit using trial division. The
+predicate uses the standard `6k +/- 1` candidate pattern after handling small
+and even divisors.
+
+Source:
+
+- [shared prime helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/primes.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/primes.cpp)
+
+## What It Measures
+
+`test` counts primes below `100000`; `base` counts primes below `10000000`.
+Results are checked against known prime-counting values for those sizes.
+
+## Scaling
+
+A parallel version can partition the candidate range across workers. Static
+equal-size ranges are not perfectly balanced because larger candidates require
+more trial divisions on average, and primes do more work than composites that
+hit a small divisor. Dynamic chunking or weighted ranges improve balance.
+
+## Bottlenecks And Granularity
+
+The benchmark is integer compute-bound with branch-heavy early exits. It has
+almost no shared memory traffic beyond the final reduction. Chunk sizes should
+contain many candidates so each worker amortizes task overhead and local count
+reduction.
+
+The algorithm is intentionally simple. Sieve-based prime counting would have a
+very different memory profile and should not be compared as the same workload.
+
+## References
+
+- [Primality test overview](https://en.wikipedia.org/wiki/Primality_test)
+- [Prime-counting function values](https://oeis.org/A006880)
+- [Segmented sieve implementation background](https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes#Segmented_sieve)
diff --git a/docs/benchmarks/benchmarks/quicksort.md b/docs/benchmarks/benchmarks/quicksort.md
new file mode 100644
index 000000000..654af90ef
--- /dev/null
+++ b/docs/benchmarks/benchmarks/quicksort.md
@@ -0,0 +1,43 @@
+---
+icon: lucide/list-ordered
+---
+
+# Quicksort
+
+The quicksort benchmark sorts a deterministic random array of 32-bit unsigned
+integers. It uses an in-place partition, a middle-element pivot, tail recursion
+on one side, and insertion sort for small partitions.
+
+Source:
+
+- [shared quicksort input](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/quicksort.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/quicksort.cpp)
+
+## What It Measures
+
+`test` uses 10000 elements; `base` uses 10000000 elements. Each iteration copies
+the original input to a work buffer, sorts it, and validates against
+`std::sort` output.
+
+## Scaling
+
+Parallel quicksort can fork the two partitions after each split. Scaling depends
+on pivot quality and partition balance. Random input with a middle-element pivot
+usually produces enough parallel work, but bad partitions reduce parallelism and
+increase span.
+
+## Bottlenecks And Granularity
+
+Partitioning is memory-bandwidth-sensitive and branch-heavy. Sorting small
+partitions is better done inline with insertion sort. A parallel implementation
+should stop spawning when a partition is below a size cutoff, both for task
+overhead and cache locality.
+
+The benchmark includes input-copy time inside the measured loop, so memory
+bandwidth for copying is part of the result.
+
+## References
+
+- [Quicksort overview](https://en.wikipedia.org/wiki/Quicksort)
+- [C++ `std::sort`](https://en.cppreference.com/w/cpp/algorithm/sort)
+- [OpenMP task construct](https://www.openmp.org/spec-html/5.2/openmpse73.html)
diff --git a/docs/benchmarks/benchmarks/scan.md b/docs/benchmarks/benchmarks/scan.md
new file mode 100644
index 000000000..f0dc6b201
--- /dev/null
+++ b/docs/benchmarks/benchmarks/scan.md
@@ -0,0 +1,42 @@
+---
+icon: lucide/scan-line
+---
+
+# Scan
+
+The scan benchmark repeatedly computes an inclusive prefix sum over a vector of
+unsigned integers using `std::inclusive_scan`. The last output value is checked
+against the arithmetic-series sum.
+
+Source:
+
+- [shared scan helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/scan.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/scan.cpp)
+
+## What It Measures
+
+`test` uses 1000 elements; `base` uses 8000 elements. Each benchmark iteration
+performs 1000 scans to make the small configured sizes measurable.
+
+## Scaling
+
+Prefix scan has less trivial parallelism than a map or reduction because each
+output depends on all earlier inputs. Parallel algorithms usually perform an
+upsweep over block totals followed by a downsweep or offset pass. That means at
+least two global phases and more synchronization than a simple reduction.
+
+For small vectors, the serial implementation should be hard to beat. A parallel
+version needs larger inputs or many batched scans to amortize phase overhead.
+
+## Bottlenecks And Granularity
+
+This benchmark is memory-bandwidth-sensitive and synchronization-sensitive. The
+working set is small at current sizes, so cache effects and loop overhead are
+important. Useful task granularity is a block of contiguous elements, not
+individual prefix operations.
+
+## References
+
+- [Blelloch, "Prefix Sums and Their Applications"](https://www.cs.cmu.edu/~scandal/papers/CMU-CS-90-190.html)
+- [C++ `std::inclusive_scan`](https://en.cppreference.com/w/cpp/algorithm/inclusive_scan)
+- [Prefix sum overview](https://en.wikipedia.org/wiki/Prefix_sum)
diff --git a/docs/benchmarks/benchmarks/skynet.md b/docs/benchmarks/benchmarks/skynet.md
new file mode 100644
index 000000000..badfda116
--- /dev/null
+++ b/docs/benchmarks/benchmarks/skynet.md
@@ -0,0 +1,40 @@
+---
+icon: lucide/network
+---
+
+# Skynet
+
+The Skynet benchmark traverses a regular recursive tree with a branching factor
+of 10. Leaves are numbered consecutively, and the recursion returns the sum of
+all leaf numbers. The expected value is the arithmetic-series sum over the leaves.
+
+Source:
+
+- [shared Skynet helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/skynet.hpp)
+- [serial implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/skynet.cpp)
+
+## What It Measures
+
+`test` uses depth 4, or 10000 leaves. `base` uses depth 6, or 1000000 leaves.
+Unlike UTS, the tree is perfectly regular, so this is a fan-out/fan-in overhead
+benchmark with predictable load.
+
+## Scaling
+
+A parallel version should expose abundant balanced work at the top levels. The
+span is proportional to depth, while work is proportional to `10^depth`. Because
+each node does very little work, scaling depends on grouping subtrees into tasks
+large enough to amortize scheduling.
+
+## Bottlenecks And Granularity
+
+The workload has almost no memory pressure and little arithmetic per node. It is
+therefore a scheduler and recursion-overhead benchmark. Forking every child at
+every depth would create many tiny tasks; a practical version should stop
+forking below a depth cutoff and sum the remaining subtree serially.
+
+## References
+
+- [Skynet 1M concurrency microbenchmark](https://github.com/atemerev/skynet)
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
+- [Strict fork-join background in libfork paper](https://arxiv.org/abs/2402.18480)
diff --git a/docs/benchmarks/benchmarks/strassen.md b/docs/benchmarks/benchmarks/strassen.md
new file mode 100644
index 000000000..04d3ef2de
--- /dev/null
+++ b/docs/benchmarks/benchmarks/strassen.md
@@ -0,0 +1,50 @@
+---
+icon: lucide/blocks
+---
+
+# Strassen
+
+The Strassen benchmark computes square `float` matrix multiplication using the
+classic seven-product recursive algorithm. It reduces the number of recursive
+multiplications from eight to seven, at the cost of extra additions,
+subtractions, temporary storage, and more complicated memory access.
+
+Source:
+
+- [shared matrix helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/matmul.hpp)
+- [serial Strassen implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/strassen.cpp)
+- [matrix multiply comparison](matmul.md)
+
+## What It Measures
+
+`test` uses `64 x 64`; `base` uses `1024 x 1024`. The benchmark compares against
+a conventional iterative reference multiply and accepts a larger relative-error
+tolerance than the divide-and-conquer cubic benchmark.
+
+The implementation stops recursing at `64 x 64`, where it uses a naive cubic
+multiply. Each recursive level allocates temporary buffers for two sums and
+seven products.
+
+## Scaling
+
+Strassen exposes seven independent recursive products per level, so a parallel
+implementation can scale well above the cutoff. Scaling is often constrained by
+temporary allocation, additions and subtractions around each product, cache
+locality, and numeric overhead.
+
+The algorithm has lower asymptotic arithmetic count than cubic multiply, but it
+is not automatically faster at moderate sizes. Cutoff selection is critical.
+
+## Bottlenecks And Granularity
+
+Memory pressure is higher than regular matrix multiply because each level
+allocates nine `m x m` temporary matrices. The extra matrix additions are
+bandwidth-sensitive. Tasks below the cutoff are too small to schedule, while
+tasks above the cutoff should represent full submatrix products or groups of
+matrix additions.
+
+## References
+
+- [Strassen, "Gaussian Elimination is not Optimal"](https://eudml.org/doc/131927)
+- [DOI: 10.1007/BF02165411](https://doi.org/10.1007/BF02165411)
+- [BLAS reference implementation](https://www.netlib.org/blas/)
diff --git a/docs/benchmarks/benchmarks/switch-io-pool.md b/docs/benchmarks/benchmarks/switch-io-pool.md
new file mode 100644
index 000000000..be0fd2512
--- /dev/null
+++ b/docs/benchmarks/benchmarks/switch-io-pool.md
@@ -0,0 +1,54 @@
+---
+icon: lucide/arrow-left-right
+---
+
+# I/O Pool Switch
+
+The I/O pool switch benchmark models request fan-out where each request does
+CPU work, hops to an I/O pool, does a smaller amount of work, then hops back to
+the compute pool. A baseline variant performs the same total work without pool
+switches.
+
+Source:
+
+- [benchmark implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/libfork/switch_io_pool.cpp)
+- [libfork scheduling API](../../api/core/scheduling.md)
+- [libfork scheduler docs](../../api/schedulers.md)
+
+## What It Measures
+
+Each request performs deterministic busy-loop work. The `request_io` variants
+use a custom awaitable to post the continuation to another scheduler, while
+`request_baseline` stays on the compute pool. A parent task forks many requests,
+joins them, and sums their return values.
+
+The benchmark compares mono and type-erased busy pools. It records request
+count, compute workers, and I/O workers. The I/O pool worker count is
+`max(2, hardware_concurrency / 8)`.
+
+## Scaling
+
+The baseline should scale like a regular fan-out/fan-in computation until
+request tasks become too small or the join reduction dominates. The I/O variant
+adds two cross-pool posts per request, so speedup depends on whether useful
+work per request amortizes those hops.
+
+If compute workers scale but I/O workers are saturated, the I/O pool can become
+the bottleneck. If the I/O pool is underused, the extra switches mostly measure
+posting and cache-migration overhead.
+
+## Bottlenecks And Granularity
+
+The benchmark is compute-bound by construction, but it is intended to expose
+scheduler overhead rather than memory bandwidth. Granularity is controlled by
+request count and fixed busy-loop units. Small request counts underfill the
+workers; very large counts amplify queue traffic and result-vector writes.
+
+Because the I/O work is simulated, this benchmark should not be interpreted as
+network or disk throughput. It isolates continuation migration costs.
+
+## References
+
+- [libfork scheduler docs](../../api/schedulers.md)
+- [OpenMP task scheduling background](https://www.openmp.org/spec-html/5.2/openmpch12.html)
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
diff --git a/docs/benchmarks/benchmarks/switch-random.md b/docs/benchmarks/benchmarks/switch-random.md
new file mode 100644
index 000000000..7f177e438
--- /dev/null
+++ b/docs/benchmarks/benchmarks/switch-random.md
@@ -0,0 +1,50 @@
+---
+icon: lucide/shuffle
+---
+
+# Random Scheduler Switch
+
+The random scheduler-switch benchmark runs recursive Fibonacci while randomly
+migrating continuations between two scheduler pools. It is a libfork-specific
+stress test for cross-pool posting, continuation resumption, and type-erased
+scheduler overhead.
+
+Source:
+
+- [benchmark implementation](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/libfork/switch_random.cpp)
+- [shared Fibonacci reference](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/fib.hpp)
+- [libfork scheduling API](../../api/core/scheduling.md)
+
+## What It Measures
+
+The workload is still recursive Fibonacci, checked against the iterative
+reference value. At each internal node, a SplitMix64-derived state gives an
+approximately 10 percent chance of switching to the other pool before forking
+children. The total worker count is split between the two pools.
+
+Variants compare mono and type-erased busy pools. The benchmark records the
+worker split as counters.
+
+## Scaling
+
+This benchmark should scale worse than the ordinary Fibonacci task benchmark
+because some continuations must be posted to another pool. Good results indicate
+that cross-pool scheduling remains cheap relative to coroutine creation and
+join overhead. Poor results can point to posting contention, cache migration, or
+type-erasure overhead.
+
+## Bottlenecks And Granularity
+
+Tasks are intentionally tiny. The useful compute per node is small, so random
+pool switches expose scheduler mechanics rather than application throughput.
+The benchmark requires at least two workers because a single-worker run cannot
+split work between pools.
+
+This is not an I/O model. It is a controlled migration probe with deterministic
+randomness and strict fork-join structure.
+
+## References
+
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
+- [libfork scheduler docs](../../api/schedulers.md)
+- [SplitMix64 reference implementation](https://prng.di.unimi.it/splitmix64.c)
diff --git a/docs/benchmarks/benchmarks/uts.md b/docs/benchmarks/benchmarks/uts.md
new file mode 100644
index 000000000..14ad4ec21
--- /dev/null
+++ b/docs/benchmarks/benchmarks/uts.md
@@ -0,0 +1,58 @@
+---
+icon: lucide/tree-pine
+---
+
+# Unbalanced Tree Search
+
+Unbalanced Tree Search, or UTS, traverses a deterministic random tree and
+returns the maximum depth, node count, and leaf count. It is designed to stress
+dynamic load balancing because the amount of work below each node is not known
+until traversal reaches that node.
+
+Source:
+
+- [shared UTS helpers](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/uts.hpp)
+- [shared UTS setup](https://github.com/conorwilliams/libfork/blob/main/benchmark/lib/uts.cpp)
+- [bundled C UTS code](https://github.com/conorwilliams/libfork/tree/main/benchmark/external/uts)
+- [serial variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/serial/uts.cpp)
+- [libfork variants](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/libfork/uts.cpp)
+- [OpenMP variant](https://github.com/conorwilliams/libfork/blob/main/benchmark/src/openmp/uts.cpp)
+
+## What It Measures
+
+The suite registers small `T1_mini` and `T3_mini` smoke inputs, base `T1` and
+`T3` inputs, and large `T1L` and `T3L` inputs. T1 is a geometric tree with more
+regular branching than T3; T3 is binomial and more irregular. The traversal is
+checked against known result triples for each tree.
+
+The serial implementation has an allocation-heavy version that stores child
+results in a vector and a `serial/no_alloc` version that traverses one child at
+a time. The libfork and OpenMP implementations fork child subtrees and join
+their result triples.
+
+## Scaling
+
+UTS should scale well when stealing balances the irregular frontier and the tree
+is large enough to keep all workers busy. Scaling is normally worse on tiny
+trees because random-number setup, task creation, and joins dominate. On very
+large trees, memory allocation, cache locality, and steal traffic can become
+visible.
+
+The expected work is proportional to the generated node count. Span is driven by
+the deepest path plus scheduling delays, so unlucky imbalance near the root can
+limit speedup.
+
+## Bottlenecks And Granularity
+
+Each node performs random child generation plus a small amount of reduction
+work. High fan-out creates many short tasks; low fan-out creates long serial
+paths. The vector-based implementations allocate per internal node, which makes
+allocator behavior and cache locality part of the measurement. The no-allocation
+serial baseline isolates traversal work from child-result storage cost.
+
+## References
+
+- [UTS paper DOI](https://doi.org/10.1007/978-3-540-72521-3_18)
+- [UTS publication record](https://scholars.uky.edu/en/publications/uts-an-unbalanced-tree-search-benchmark/)
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
+- [OpenMP 5.2 task construct](https://www.openmp.org/spec-html/5.2/openmpse73.html)
diff --git a/docs/benchmarks/index.md b/docs/benchmarks/index.md
new file mode 100644
index 000000000..795f4d3ec
--- /dev/null
+++ b/docs/benchmarks/index.md
@@ -0,0 +1,87 @@
+---
+icon: lucide/timer
+---
+
+# Benchmarks
+
+The benchmark suite measures fork-join task overhead, scheduler behavior, and
+classic recursive or data-parallel kernels. It is built on Google Benchmark and
+is organized around benchmark families. Each family page explains the workload,
+expected scaling, bottlenecks, and available implementations.
+
+## Running
+
+Build benchmarks in release mode:
+
+```sh
+cmake --preset ci-release -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+cmake --build --preset ci-release
+```
+
+On Linux, pass `cmake/gcc-brew-toolchain.cmake` as the toolchain file instead.
+
+Benchmark names use this shape:
+
+```text
+<mode>/<category>/<name>[/template-or-argument-tags]
+```
+
+`mode` is normally `test`, `base`, or `large`. Test inputs are for correctness
+and smoke runs. Base inputs are the default comparison sizes. Large inputs are
+intended for machines where the working set and runtime are acceptable.
+
+## Implementations
+
+The source tree separates shared benchmark data from implementation variants:
+
+- [`benchmark/lib/`](https://github.com/conorwilliams/libfork/tree/main/benchmark/lib)
+  contains shared kernels, input sizes, and correctness helpers.
+- [`benchmark/src/libfork/`](https://github.com/conorwilliams/libfork/tree/main/benchmark/src/libfork)
+  contains libfork coroutine and scheduler benchmarks.
+- [`benchmark/src/serial/`](https://github.com/conorwilliams/libfork/tree/main/benchmark/src/serial)
+  contains single-threaded baselines.
+- [`benchmark/src/openmp/`](https://github.com/conorwilliams/libfork/tree/main/benchmark/src/openmp)
+  contains OpenMP tasking comparisons where present.
+- [`benchmark/src/baremetal/`](https://github.com/conorwilliams/libfork/tree/main/benchmark/src/baremetal)
+  contains low-level coroutine or data-structure baselines.
+
+## Families
+
+- [Fibonacci](benchmarks/fib.md): recursive task overhead and frame allocation.
+- [Fold](benchmarks/fold.md): reductions over memory-backed and lazy ranges.
+- [Unbalanced Tree Search](benchmarks/uts.md): irregular search-tree traversal.
+- [Random Scheduler Switch](benchmarks/switch-random.md): cross-pool coroutine
+  migration during recursive Fibonacci.
+- [I/O Pool Switch](benchmarks/switch-io-pool.md): request fan-out with explicit
+  compute-pool and I/O-pool hops.
+- [Heat](benchmarks/heat.md): Jacobi heat-diffusion stencil.
+- [Integrate](benchmarks/integrate.md): adaptive recursive quadrature.
+- [Knapsack](benchmarks/knapsack.md): exact branch-and-bound search.
+- [Mandelbrot](benchmarks/mandelbrot.md): per-pixel escape-time computation.
+- [Matrix Multiply](benchmarks/matmul.md): recursive cubic matrix multiply.
+- [Strassen](benchmarks/strassen.md): recursive seven-product matrix multiply.
+- [N-Queens](benchmarks/nqueens.md): recursive backtracking search.
+- [Primes](benchmarks/primes.md): trial-division prime counting.
+- [Quicksort](benchmarks/quicksort.md): in-place divide-and-conquer sorting.
+- [Scan](benchmarks/scan.md): repeated inclusive prefix scan.
+- [Skynet](benchmarks/skynet.md): regular recursive fan-out reduction.
+
+## Interpreting Results
+
+For scheduler benchmarks, near-linear speedup is possible only while there is
+enough ready work to keep all workers busy and each task has enough work to
+amortize fork, scheduling, and join costs. Recursive microbenchmarks such as
+Fibonacci intentionally create tiny tasks, so they are most useful for comparing
+runtime overhead rather than end-user algorithms.
+
+For data-parallel kernels, memory bandwidth, cache locality, allocation rate,
+and reduction or synchronization costs usually dominate before raw worker count
+does. Compare variants at the same input size, compiler, CPU frequency policy,
+and thread count.
+
+Useful background:
+
+- [libfork paper](https://arxiv.org/abs/2402.18480)
+- [Google Benchmark user guide](https://github.com/google/benchmark/blob/main/docs/user_guide.md)
+- [OpenMP 5.2 task construct](https://www.openmp.org/spec-html/5.2/openmpse73.html)
+- [Scheduling multithreaded computations by work stealing](https://doi.org/10.1145/324133.324234)
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100644
index 000000000..4b7e96e54
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,86 @@
+---
+icon: lucide/git-pull-request
+---
+
+# Contributing
+
+This repository currently targets C++26, CMake module file sets, and C++23
+`import std`. Build configuration must use the platform toolchain file.
+
+## Dependencies
+
+Install development dependencies with Homebrew.
+
+macOS:
+
+```sh
+brew install cmake ninja catch2 google-benchmark clang-format codespell llvm
+```
+
+Linux:
+
+```sh
+brew install cmake ninja catch2 google-benchmark clang-format codespell gcc binutils
+```
+
+## Configure, build, test
+
+Use `ci-hardened` for normal development. On macOS:
+
+```sh
+cmake --preset ci-hardened -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+cmake --build --preset ci-hardened
+ctest --preset ci-hardened
+```
+
+On Linux, use:
+
+```sh
+cmake --preset ci-hardened -DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake
+```
+
+Use `ci-release` for benchmarks (on Linux, substitute `cmake/gcc-brew-toolchain.cmake`):
+
+```sh
+cmake --preset ci-release -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+cmake --build --preset ci-release
+```
+
+Expected warnings include CMake's experimental `import std` warning and the
+benchmark warning about release mode when building benchmarks through
+`ci-hardened`.
+
+## Documentation
+
+The docs site is built with zensical. With the Python project environment:
+
+```sh
+uv sync
+uv run zensical serve
+uv run zensical build --clean
+```
+
+Without `uv`, install zensical directly:
+
+```sh
+python -m pip install zensical
+zensical serve
+zensical build --clean
+```
+
+The configured output directory is `build/site`.
+
+## Source changes
+
+Module files live under `src/`. If a source or public header file is added or
+removed, update the root `CMakeLists.txt` module/header file sets. Tests live in
+`test/src/` and are discovered recursively by CMake.
+
+For API changes, update the matching docs page under `docs/api/` and add or
+adjust tests that exercise the behavior.
+
+## Benchmarks
+
+Benchmark sources live under `benchmark/`. Use the release preset before
+comparing timings. The benchmark target depends on Google Benchmark and is
+enabled when `libfork_DEV_MODE=ON`, which is set by the CI presets.
diff --git a/docs/favicon/android-chrome-192x192.png b/docs/favicon/android-chrome-192x192.png
new file mode 100644
index 000000000..fbcb1b81d
Binary files /dev/null and b/docs/favicon/android-chrome-192x192.png differ
diff --git a/docs/favicon/android-chrome-512x512.png b/docs/favicon/android-chrome-512x512.png
new file mode 100644
index 000000000..43037a571
Binary files /dev/null and b/docs/favicon/android-chrome-512x512.png differ
diff --git a/docs/favicon/apple-touch-icon.png b/docs/favicon/apple-touch-icon.png
new file mode 100644
index 000000000..b2cdc1b3a
Binary files /dev/null and b/docs/favicon/apple-touch-icon.png differ
diff --git a/docs/favicon/favicon-16x16.png b/docs/favicon/favicon-16x16.png
new file mode 100644
index 000000000..1ba8d0c14
Binary files /dev/null and b/docs/favicon/favicon-16x16.png differ
diff --git a/docs/favicon/favicon-32x32.png b/docs/favicon/favicon-32x32.png
new file mode 100644
index 000000000..b282eff9d
Binary files /dev/null and b/docs/favicon/favicon-32x32.png differ
diff --git a/docs/favicon/favicon.ico b/docs/favicon/favicon.ico
new file mode 100644
index 000000000..06e59e7a9
Binary files /dev/null and b/docs/favicon/favicon.ico differ
diff --git a/.legacy/docs/_static/site.webmanifest b/docs/favicon/site.webmanifest
similarity index 100%
rename from .legacy/docs/_static/site.webmanifest
rename to docs/favicon/site.webmanifest
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..89382b5df
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,90 @@
+---
+icon: lucide/utensils
+---
+
+# libfork
+
+`libfork` is a C++ coroutine-tasking library for strict fork-join parallelism.
+It gives programs a small async-function vocabulary, a scheduler-independent
+execution model, and worker stacks designed for fine-grained parallel tasks.
+
+At the top level, users import the library as a C++ module:
+
+```cpp
+import libfork;
+```
+
+The core idea is simple: a task may fork child tasks, continue with local work,
+and then join before reading child results or returning to its own parent. The
+runtime uses continuation stealing: the worker that performs a fork continues
+with the child, while another worker may steal the parent continuation.
+
+```cpp
+import std;
+import libfork;
+
+struct fib {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>, std::int64_t n)
+      -> lf::task<std::int64_t, Context> {
+    if (n < 2) {
+      co_return n;
+    }
+
+    std::int64_t lhs = 0;
+    std::int64_t rhs = 0;
+
+    auto sc = co_await lf::scope();
+    co_await sc.fork(&rhs, fib{}, n - 2);
+    co_await sc.call(&lhs, fib{}, n - 1);
+    co_await sc.join();
+
+    co_return lhs + rhs;
+  }
+};
+
+auto main() -> int {
+  lf::mono_busy_pool<lf::geometric_stack<>> pool{4};
+  auto result = lf::schedule(pool, fib{}, 20).get();
+  return result == 6765 ? 0 : 1;
+}
+```
+
+## Start here
+
+- [Installation](installation.md) covers prerequisites, configuration,
+  building, and a first program.
+- [Quickstart](quickstart.md) explains the fork-join model, scheduling, cancellation,
+  exceptions, algorithms, and the stack model.
+- [API reference](api/index.md) documents the exported `libfork` modules.
+- [Benchmarks](benchmarks/index.md) describes the benchmark suite.
+- [Contributing](contributing.md) lists the local development workflow.
+
+## Design in one page
+
+`libfork` tasks are C++ coroutines returning `lf::task<T, Context>`. The first
+argument is normally `lf::env<Context>`, which lets libfork pass context through
+the task graph without constructing user-visible runtime objects.
+
+Tasks run inside a strict fork-join tree. A fork starts a child that may run in
+parallel with the parent continuation. A call starts a child inline and is useful
+when there is no profitable continuation left to steal. A join waits for all
+outstanding children in the current scope.
+
+Schedulers are separate from task code. The same task can run on the synchronous
+`inline_scheduler`, a monomorphic busy-waiting pool, or a polymorphic pool. The
+default practical choice for parallel work today is:
+
+```cpp
+using pool_type = lf::mono_busy_pool<lf::geometric_stack<>>;
+```
+
+The module surface is intentionally split:
+
+- `libfork.core` defines tasks, scopes, scheduling, receivers, cancellation,
+  contexts, handles, projections, and concepts.
+- `libfork.batteries` provides worker stacks, deques, context policies, and
+  context implementations.
+- `libfork.schedulers` provides scheduler implementations.
+- `libfork.algorithm` provides higher-level fork-join algorithms such as
+  `for_each` and `fold`.
diff --git a/docs/installation.md b/docs/installation.md
new file mode 100644
index 000000000..84049d375
--- /dev/null
+++ b/docs/installation.md
@@ -0,0 +1,133 @@
+---
+icon: lucide/package-check
+---
+
+# Installation
+
+`libfork` is a C++26 module-based library. The current tree uses CMake module
+file sets and C++23 `import std`, so the compiler and CMake invocation matter.
+
+## Requirements
+
+Use Homebrew-provided build tools and the repository toolchain file.
+
+On macOS:
+
+```sh
+brew install cmake ninja catch2 google-benchmark clang-format codespell llvm
+```
+
+On Linux:
+
+```sh
+brew install cmake ninja catch2 google-benchmark clang-format codespell gcc binutils
+```
+
+The required configure command differs by platform:
+
+```sh
+# macOS
+cmake --preset ci-hardened -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+
+# Linux
+cmake --preset ci-hardened -DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake
+```
+
+!!! warning
+
+    Always pass the toolchain file. Without it, CMake may fail to discover
+    `import std` support or the standard library module metadata.
+
+## Build and test
+
+For normal development use the hardened preset:
+
+```sh
+cmake --preset ci-hardened -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+cmake --build --preset ci-hardened
+ctest --preset ci-hardened
+```
+
+For benchmark builds use the release preset:
+
+```sh
+cmake --preset ci-release -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake
+cmake --build --preset ci-release
+```
+
+On Linux, replace `cmake/llvm-brew-toolchain.cmake` with
+`cmake/gcc-brew-toolchain.cmake`.
+
+## First task
+
+A libfork async function is a function object that returns `lf::task<T, Context>`.
+The first argument is usually `lf::env<Context>`.
+
+```cpp
+import std;
+import libfork;
+
+struct answer {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>) -> lf::task<int, Context> {
+    co_return 42;
+  }
+};
+
+auto main() -> int {
+  lf::mono_busy_pool<lf::geometric_stack<>> pool{2};
+  int value = lf::schedule(pool, answer{}).get();
+  return value == 42 ? 0 : 1;
+}
+```
+
+`schedule` returns an `lf::receiver<T>`. Calling `.get()` on the receiver waits
+for completion and consumes the receiver; it then returns the task result or
+rethrows an exception stored by the task.
+
+## First fork-join scope
+
+Use `lf::scope()` inside a task when it needs children:
+
+```cpp
+struct sum_pair {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>, int a, int b) -> lf::task<int, Context> {
+    co_return a + b;
+  }
+};
+
+struct parent {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>) -> lf::task<int, Context> {
+    int left = 0;
+    int right = 0;
+
+    auto sc = co_await lf::scope();
+    co_await sc.fork(&left, sum_pair{}, 1, 2);
+    co_await sc.call(&right, sum_pair{}, 3, 4);
+    co_await sc.join();
+
+    co_return left + right;
+  }
+};
+```
+
+Use `fork` for work that may run in parallel with the continuation. Use `call`
+when running the child inline is the better fit. Results written through return
+addresses are safe to read only after the matching `join`.
+
+## Consuming from another project
+
+The current project is module-based. A consuming CMake project should link the
+library target and compile with a compiler/toolchain that supports C++ module
+file sets and `import std`.
+
+```cmake
+find_package(libfork CONFIG REQUIRED)
+
+target_link_libraries(app PRIVATE libfork::libfork)
+target_compile_features(app PRIVATE cxx_std_26)
+```
+
+When building libfork from this source tree, prefer the repository presets above.
diff --git a/docs/quickstart.md b/docs/quickstart.md
new file mode 100644
index 000000000..f405b9b5c
--- /dev/null
+++ b/docs/quickstart.md
@@ -0,0 +1,176 @@
+---
+icon: lucide/rocket
+---
+
+# Quickstart
+
+This quickstart explains how the current module-based libfork API fits together.
+
+## Fork-join tasks
+
+Libfork models work as a strict fork-join tree. A task may create children, but
+it must join those children before returning. This restriction keeps the task
+graph structured and lets the runtime move continuations between workers without
+requiring users to manage task lifetimes manually.
+
+An async function is any callable that returns `lf::task<T, Context>` when
+invoked in a worker context:
+
+```cpp
+struct work {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>, int input) -> lf::task<int, Context> {
+    co_return input * 2;
+  }
+};
+```
+
+The `lf::env<Context>` argument is supplied by libfork. It identifies the worker
+context type and allows the same callable to be used with different schedulers.
+
+## Fork, call, join
+
+Inside a task, `co_await lf::scope()` returns a scope object with `fork`, `call`,
+`fork_drop`, `call_drop`, and `join`.
+
+```cpp
+auto sc = co_await lf::scope();
+co_await sc.fork(&left, child{}, 1);
+co_await sc.call(&right, child{}, 2);
+co_await sc.join();
+```
+
+`fork` exposes the parent continuation for stealing and immediately starts the
+child on the current worker. `call` starts the child inline and is useful when
+there is no useful continuation left to steal. `join` waits until all children
+created through the scope have finished.
+
+Use the `_drop` variants when the child result is intentionally ignored:
+
+```cpp
+co_await sc.fork_drop(side_effect{}, item);
+co_await sc.call_drop(cleanup{}, item);
+```
+
+## Result storage
+
+Non-void child results are written into caller-provided storage:
+
+```cpp
+int result = 0;
+co_await sc.fork(&result, compute{}, input);
+co_await sc.join();
+```
+
+The pointer must remain valid until after the join. A common pattern is to keep
+child result variables in the parent coroutine frame and read them only after
+`join`.
+
+## Continuation stealing
+
+On a fork, libfork pushes a handle to the parent continuation into the worker
+context and runs the child immediately. Another worker may steal that
+continuation and resume it. This differs from child-stealing runtimes, where the
+new child is normally offered to other workers.
+
+The important user-facing consequence is that execution may resume on a
+different OS thread after any `co_await`. Code inside tasks should not assume
+thread affinity unless it uses an explicit scheduling awaitable.
+
+## Worker stacks
+
+Coroutine frames created by fork/call are allocated from a worker stack. The
+provided stacks trade off simplicity, speed, and bounded memory:
+
+- `geometric_stack` is the general-purpose segmented stack.
+- `slab_stack` uses a fixed-capacity slab and throws `std::bad_alloc` on
+  overflow.
+- `adaptor_stack` delegates every push/pop to an allocator.
+
+Schedulers combine a stack with a context policy, such as `adapt_vector` for a
+single-threaded inline scheduler or `adapt_deque` for stealing between workers.
+
+## Exceptions
+
+If a child task exits with an exception, libfork stores the exception in the
+parent and rethrows it at `join`.
+
+```cpp
+auto sc = co_await lf::scope();
+co_await sc.fork_drop(may_throw{}, input);
+co_await sc.join(); // rethrows if the child failed
+```
+
+Because libfork is strict fork-join, task code should structure potentially
+throwing work so outstanding children are still joined before the task exits.
+When in doubt, join in the same lexical region that created the children.
+
+`receiver::get()` also rethrows exceptions from a scheduled root task.
+
+## Cancellation
+
+Use `lf::child_scope()` to create a scope with its own `stop_source`. Children
+launched through that scope inherit its stop token.
+
+```cpp
+struct maybe_run {
+  template <lf::worker_context Context>
+  static auto operator()(lf::env<Context>) -> lf::task<void, Context> {
+    auto sc = co_await lf::child_scope();
+    sc.request_stop();
+    co_await sc.fork_drop(child_work{});
+    co_await sc.join();
+  }
+};
+```
+
+For root tasks, construct `lf::recv_state<T, true>` and pass it to `schedule`.
+The returned `lf::receiver<T, true>` exposes `stop_source()`:
+
+```cpp
+lf::recv_state<int, true> state;
+auto recv = lf::schedule(pool, std::move(state), root_task{});
+recv.stop_source().request_stop();
+```
+
+When a cancellable receiver is consumed after cancellation,
+`receiver::get()` throws `lf::operation_cancelled_error`.
+
+## Explicit scheduling
+
+Libfork supports context-switching awaitables. A type is an `lf::awaitable<T,
+Context>` when an awaiter can be acquired from it through `operator co_await`
+and that awaiter has:
+
+```cpp
+auto await_ready() -> bool;
+auto await_suspend(lf::sched_handle<Context>, Context&) -> void;
+auto await_resume();
+```
+
+`await_suspend` receives a schedulable handle and the current context. It may
+post that handle to another scheduler, allowing a task to hop between pools.
+
+## Algorithms
+
+The algorithm module provides fork-join operations over random-access ranges.
+
+`lf::for_each` applies a synchronous or asynchronous function to every element:
+
+```cpp
+auto recv = lf::schedule(pool, lf::for_each, std::span(values), [](int& x) {
+  x *= 2;
+});
+std::move(recv).get();
+```
+
+`lf::fold` reduces a range to a `std::optional<T>`: the folded value for a
+non-empty range, or `std::nullopt` for empty input:
+
+```cpp
+auto recv = lf::schedule(pool, lf::fold, std::span(values), std::plus<>{});
+auto sum = std::move(recv).get();
+```
+
+Both algorithms accept iterator/sentinel pairs or ranges. Overloads with an
+explicit chunk size require that size to be positive.
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 000000000..5d18bf78f
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,9 @@
+.md-typeset .admonition,
+.md-typeset details {
+  font-size: inherit;
+}
+
+.md-typeset .admonition-title,
+.md-typeset summary {
+  font-size: inherit;
+}
diff --git a/include/libfork/__impl/assume.hpp b/include/libfork/__impl/assume.hpp
new file mode 100644
index 000000000..b40cd4fab
--- /dev/null
+++ b/include/libfork/__impl/assume.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "libfork/__impl/exception.hpp"
+
+/**
+ * @file assume.hpp
+ *
+ * @brief A collection of internal macros.
+ *
+ * These macros are not safe to use unless `import std` is in scope.
+ */
+
+/**
+ * @brief If expr evaluates to `false`, terminates the program with an error message.
+ *
+ * This macro is always active, regardless of optimization settings or `NDEBUG`.
+ */
+#define LF_ENSURE(...)                                                                                       \
+  do {                                                                                                       \
+    if (!(__VA_ARGS__)) {                                                                                    \
+      LF_TERMINATE("Assumption '" #__VA_ARGS__ "' failed!");                                                 \
+    }                                                                                                        \
+  } while (false)
+
+/**
+ * @brief Invokes undefined behavior if ``expr`` is `false` (without `NDEBUG`: checked via `LF_ENSURE`).
+ *
+ * \rst
+ *
+ *  .. warning::
+ *
+ *    This has different semantics than ``[[assume(expr)]]`` as it WILL evaluate the
+ *    expression at runtime. Hence you should conservatively only use this macro
+ *    if ``expr`` is side-effect free and cheap to evaluate.
+ *
+ * \endrst
+ */
+#ifdef NDEBUG
+  #define LF_ASSUME(...)                                                                                     \
+    do {                                                                                                     \
+      if (!(__VA_ARGS__)) {                                                                                  \
+        ::std::unreachable();                                                                                \
+      }                                                                                                      \
+    } while (false)
+#else
+  #define LF_ASSUME(...) LF_ENSURE(__VA_ARGS__)
+#endif
+
+#ifdef NDEBUG
+  #define LF_UNREACHABLE()                                                                                   \
+    do {                                                                                                     \
+      ::std::unreachable();                                                                                  \
+    } while (false)
+#else
+  #define LF_UNREACHABLE() LF_TERMINATE("This code should be unreachable!");
+#endif
diff --git a/include/libfork/__impl/compiler.hpp b/include/libfork/__impl/compiler.hpp
new file mode 100644
index 000000000..8e71083fc
--- /dev/null
+++ b/include/libfork/__impl/compiler.hpp
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "libfork/__impl/exception.hpp"
+
+/**
+ * @file compiler.hpp
+ *
+ * @brief A collection of internal macros.
+ *
+ * These macros are standalone i.e. they can be used without importing/including anything else.
+ */
+
+// =============== Inlining/optimization =============== //
+
+/**
+ * @brief Macro to use next to 'inline' to force a function to be inlined.
+ *
+ * \rst
+ *
+ * .. note::
+ *
+ *    This does not imply C++'s `inline` keyword, which also has an effect on linkage.
+ *
+ * \endrst
+ */
+#if !defined(LF_FORCE_INLINE) // User-overridable: an existing definition is left untouched.
+  #if defined(_MSC_VER) && !defined(__clang__)
+    #define LF_FORCE_INLINE __forceinline
+  #elif defined(__GNUC__) && __GNUC__ > 3
+  // Clang also defines __GNUC__ (as 4)
+    #define LF_FORCE_INLINE __attribute__((__always_inline__))
+  #else
+    #define LF_FORCE_INLINE
+  #endif
+#endif
+
+/**
+ * @brief Macro to prevent a function from being inlined.
+ */
+#if !defined(LF_NO_INLINE) // User-overridable: an existing definition is left untouched.
+  #if defined(_MSC_VER) && !defined(__clang__)
+    #define LF_NO_INLINE __declspec(noinline)
+  #elif defined(__GNUC__) && __GNUC__ > 3
+    // Clang also defines __GNUC__ (as 4)
+    #if defined(__CUDACC__)
+      // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392
+      #define LF_NO_INLINE __attribute__((noinline))
+    #elif defined(__HIP__)
+      // See https://github.com/boostorg/config/issues/392
+      #define LF_NO_INLINE __attribute__((noinline))
+    #else
+      #define LF_NO_INLINE __attribute__((__noinline__))
+    #endif
+  #else
+    #define LF_NO_INLINE
+  #endif
+#endif
diff --git a/include/libfork/__impl/exception.hpp b/include/libfork/__impl/exception.hpp
new file mode 100644
index 000000000..9868d7e75
--- /dev/null
+++ b/include/libfork/__impl/exception.hpp
@@ -0,0 +1,78 @@
+#pragma once
+
+/**
+ * @file exception.hpp
+ *
+ * @brief A collection of internal macros for exception handling.
+ *
+ * These macros are standalone i.e. they can be used without importing/including anything else.
+ */
+
+/**
+ * @brief Detects if the compiler has exceptions enabled.
+ *
+ * Overridable by defining `LF_COMPILER_EXCEPTIONS` globally.
+ */
+#ifndef LF_COMPILER_EXCEPTIONS
+  #if defined(__cpp_exceptions) || (defined(_MSC_VER) && defined(_CPPUNWIND)) || defined(__EXCEPTIONS)
+    #define LF_COMPILER_EXCEPTIONS 1
+  #else
+    #define LF_COMPILER_EXCEPTIONS 0
+  #endif
+#endif
+
+namespace lf::impl {
+
+/**
+ * @brief Calls `std::terminate` after printing `message`; `file` and `line` identify the call site.
+ */
+[[noreturn]]
+void terminate_with(char const *message, char const *file, int line) noexcept;
+
+} // namespace lf::impl
+
+#define LF_TERMINATE(message) ::lf::impl::terminate_with((message), __FILE__, __LINE__)
+
+#if LF_COMPILER_EXCEPTIONS
+  /**
+   * @brief Expands to ``try`` if exceptions are enabled, otherwise expands to ``if constexpr (true)``.
+   */
+  #define LF_TRY try
+  /**
+   * @brief Expands to ``catch (...)`` if exceptions are enabled, otherwise ``if constexpr (false)``.
+   */
+  #define LF_CATCH_ALL catch (...)
+  /**
+   * @brief Expands to ``catch (__VA_ARGS__)`` if exceptions are enabled, otherwise ``if constexpr (false)``.
+   */
+  #define LF_CATCH(...) catch (__VA_ARGS__)
+  /**
+   * @brief Expands to ``throw X`` if exceptions are enabled, otherwise terminates the program.
+   */
+  #define LF_THROW(X) throw X
+  /**
+   * @brief Expands to ``throw`` if exceptions are enabled, otherwise terminates the program.
+   */
+  #define LF_RETHROW throw
+#else
+  /**
+   * @brief Expands to ``try`` if exceptions are enabled, otherwise expands to ``if constexpr (true)``.
+   */
+  #define LF_TRY if constexpr (true)
+  /**
+   * @brief Expands to ``catch (...)`` if exceptions are enabled, otherwise ``if constexpr (false)``.
+   */
+  #define LF_CATCH_ALL if constexpr (false)
+  /**
+   * @brief Expands to ``catch (__VA_ARGS__)`` if exceptions are enabled, otherwise ``if constexpr (false)``.
+   */
+  #define LF_CATCH(...) if constexpr (false)
+  /**
+   * @brief Expands to ``throw X`` if exceptions are enabled, otherwise terminates the program.
+   */
+  #define LF_THROW(X) LF_TERMINATE("Tried to throw '" #X "' without compiler exceptions")
+  /**
+   * @brief Expands to ``throw`` if exceptions are enabled, otherwise terminates the program.
+   */
+  #define LF_RETHROW LF_TERMINATE("Tried to rethrow without compiler exceptions")
+#endif
diff --git a/include/libfork/__impl/utils.hpp b/include/libfork/__impl/utils.hpp
new file mode 100644
index 000000000..2adbf49c9
--- /dev/null
+++ b/include/libfork/__impl/utils.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+/**
+ * @file utils.hpp
+ *
+ * @brief A collection of internal utility macros.
+ *
+ * These macros are not safe to use unless `import std` is in scope.
+ */
+
+// =============== Utility =============== //
+
+// clang-format off
+
+/**
+ * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/decltype specifiers.
+ *
+ * This macro is not truly variadic but the ``...`` allows commas in the macro argument.
+ */
+#define LF_HOF(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) { return __VA_ARGS__;}
+
+// clang-format on
+
+/**
+ * @brief Use like `std::forward` to perfectly forward an expression.
+ */
+#define LF_FWD(...) ::std::forward<decltype(__VA_ARGS__)>(__VA_ARGS__)
+
+/**
+ * @brief Use to define a `T` that is aligned to the required alignment of `std::atomic_ref<T>`.
+ */
+#define ATOMIC_ALIGN(T) alignas(::std::atomic_ref<T>::required_alignment) T
diff --git a/include/libfork/version.hpp b/include/libfork/version.hpp
index d048ad622..03f18744e 100644
--- a/include/libfork/version.hpp
+++ b/include/libfork/version.hpp
@@ -1,3 +1,5 @@
+#pragma once
+
 /**
  * @brief __[public]__ The major version of libfork.
  *
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..9bc1e7df1
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "libfork"
+version = "0.1.0"
+description = "Documentation for libfork"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "zensical>=0.0.41",
+]
diff --git a/src/algorithm/algorithm.cxx b/src/algorithm/algorithm.cxx
new file mode 100644
index 000000000..ad83a0302
--- /dev/null
+++ b/src/algorithm/algorithm.cxx
@@ -0,0 +1,5 @@
+export module libfork.algorithm;
+
+export import :concepts;
+export import :for_each;
+export import :fold;
diff --git a/src/algorithm/concepts.cxx b/src/algorithm/concepts.cxx
new file mode 100644
index 000000000..962417c0c
--- /dev/null
+++ b/src/algorithm/concepts.cxx
@@ -0,0 +1,15 @@
+export module libfork.algorithm:concepts;
+
+import std;
+
+import libfork.core;
+
+namespace lf {
+
+export template <typename T>
+concept sized_random_access_range = std::ranges::random_access_range<T> && std::ranges::sized_range<T>;
+
+template <typename T>
+concept default_movable = std::default_initializable<T> && std::movable<T>;
+
+} // namespace lf
diff --git a/src/algorithm/fold.cxx b/src/algorithm/fold.cxx
new file mode 100644
index 000000000..76a18e46d
--- /dev/null
+++ b/src/algorithm/fold.cxx
@@ -0,0 +1,320 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.algorithm:fold;
+
+import std;
+
+import libfork.core;
+
+import :concepts;
+
+namespace lf {
+
+// TODO: this needs lf::just!
+
+template <typename Fn, typename Context, typename I>
+concept indirectly_foldable =
+    indirect_semigroup<Fn, Context, I> && default_movable<indirect_semigroup_t<Fn, Context, I>>;
+
+export struct fold_chunk_error final : libfork_exception {
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "fold chunk size must be positive";
+  }
+};
+
+struct fold_impl { // Recursive fork-join fold over [head, tail); both overloads require a non-empty range.
+ private:
+  template <typename T>
+  using iter_diff_t = std::iter_difference_t<T>;
+
+  template <typename Context, typename I, typename Proj, typename Bop>
+  using result_t = lf::indirect_semigroup_t<Bop, Context, projected<Context, I, Proj>>;
+
+  template <typename Context, typename I, typename Proj, typename Bop>
+  using task_t = lf::task<result_t<Context, I, Proj, Bop>, Context>;
+
+ public:
+  // (1) iterator-pair, chunk size n >= 1: ranges of at most n elements are folded sequentially
+  template <worker_context X,                                 // worker context type
+            std::random_access_iterator I,                    // iterator into the range
+            std::sized_sentinel_for<I> S,                     // sentinel; `tail - head` is the length
+            projectable<X, I> Proj = std::identity,           // projection applied to each element
+            indirectly_foldable<X, projected<X, I, Proj>> Bop // binary operation combining results
+            >
+  static auto
+  operator()(env<X>, I head, S tail, iter_diff_t<I> n, Bop bop, Proj proj = {}) -> task_t<X, I, Proj, Bop> {
+
+    using result_type = result_t<X, I, Proj, Bop>;
+
+    LF_ASSUME(n > 0); // precondition: positive chunk size
+
+    auto len = tail - head;
+
+    LF_ASSUME(len > 0); // precondition: non-empty range
+
+    if (len <= n) { // leaf: fold this chunk sequentially
+
+      if constexpr (async::indirectly_regular_unary_invocable<Proj, X, I>) { // async projection
+
+        // TODO: could optimize if this type == accumulator type
+        async_result_t<Proj &, X, std::iter_reference_t<I>> init;
+        {
+          auto sc = co_await scope();
+          co_await sc.call(&init, proj, *head);
+          co_await sc.join();
+        }
+
+        result_type acc(std::move(init)); // seed the accumulator with the first projected element
+
+        for (++head; head != tail; ++head) {
+
+          async_result_t<Proj &, X, std::iter_reference_t<I>> tmp;
+          {
+            auto sc = co_await scope();
+            co_await sc.call(&tmp, proj, *head);
+            co_await sc.join();
+          }
+
+          if constexpr (async::indirect_semigroup<Bop, X, projected<X, I, Proj>>) { // async binary op
+            auto sc = co_await scope();
+            co_await sc.call(&acc, bop, std::move(acc), std::move(tmp));
+            co_await sc.join();
+          } else {
+            acc = std::invoke(bop, std::move(acc), std::move(tmp));
+          }
+        }
+
+        co_return acc;
+
+      } else { // synchronous projection: invoke it directly
+
+        result_type acc(std::invoke(proj, *head));
+
+        for (++head; head != tail; ++head) {
+          if constexpr (async::indirect_semigroup<Bop, X, projected<X, I, Proj>>) {
+            auto sc = co_await scope();
+            co_await sc.call(&acc, bop, std::move(acc), std::invoke(proj, *head));
+            co_await sc.join();
+          } else {
+            acc = std::invoke(bop, std::move(acc), std::invoke(proj, *head));
+          }
+        }
+
+        co_return acc;
+      }
+    }
+
+    auto mid = head + (len / 2); // recursive case: split the range in half
+
+    result_type lhs;
+    result_type rhs;
+
+    {
+      auto sc = co_await scope();
+      co_await sc.fork(&lhs, fold_impl{}, head, mid, n, bop, proj);            // left half: forked child
+      co_await sc.call(&rhs, fold_impl{}, mid, tail, n, bop, std::move(proj)); // right half: called child
+      co_await sc.join(); // lhs and rhs are read only after this join
+    }
+
+    if constexpr (async::indirect_semigroup<Bop, X, projected<X, I, Proj>>) {
+      // TODO: can this be a tail call / use current return address?
+      auto sc = co_await scope();
+      co_await sc.call(&lhs, bop, std::move(lhs), std::move(rhs));
+      co_await sc.join();
+      co_return lhs;
+    } else {
+      co_return std::invoke(bop, std::move(lhs), std::move(rhs));
+    }
+  }
+
+  // (2) iterator-pair, n == 1 specialization (no n parameter): ranges of length 1 or 2 are the leaves
+  template <worker_context X,                                 // worker context type
+            std::random_access_iterator I,                    // iterator into the range
+            std::sized_sentinel_for<I> S,                     // sentinel; `tail - head` is the length
+            projectable<X, I> Proj = std::identity,           // projection applied to each element
+            indirectly_foldable<X, projected<X, I, Proj>> Bop // binary operation combining results
+            >
+  static auto operator()(env<X>, I head, S tail, Bop bop, Proj proj = {}) -> task_t<X, I, Proj, Bop> {
+
+    using result_type = result_t<X, I, Proj, Bop>;
+
+    auto len = tail - head;
+
+    switch (len) { // lengths 1 and 2 return directly; longer ranges leave the switch and split below
+      case 0: // an empty range violates the precondition
+        LF_UNREACHABLE();
+      case 1:
+        if constexpr (async::indirectly_regular_unary_invocable<Proj, X, I>) {
+
+          async_result_t<Proj &, X, std::iter_reference_t<I>> init;
+          auto sc = co_await scope();
+          co_await sc.call(&init, proj, *head);
+          co_await sc.join();
+
+          co_return result_type(std::move(init));
+        } else {
+          co_return result_type(std::invoke(proj, *head));
+        }
+      case 2:
+        if constexpr (async::indirectly_regular_unary_invocable<Proj, X, I>) {
+
+          using proj_result_type = async_result_t<Proj &, X, std::iter_reference_t<I>>;
+
+          proj_result_type lhs;
+          proj_result_type rhs;
+
+          {
+            auto sc = co_await scope();
+            co_await sc.call(&lhs, proj, *head);
+            co_await sc.call(&rhs, proj, *(head + 1));
+            co_await sc.join();
+          }
+
+          if constexpr (async::indirect_semigroup<Bop, X, projected<X, I, Proj>>) {
+            result_type ret;
+            auto sc = co_await scope();
+            co_await sc.call(&ret, bop, std::move(lhs), std::move(rhs));
+            co_await sc.join();
+            co_return ret;
+          } else {
+            co_return std::invoke(bop, std::move(lhs), std::move(rhs));
+          }
+        } else {
+          if constexpr (async::indirect_semigroup<Bop, X, projected<X, I, Proj>>) {
+            result_type ret;
+            auto sc = co_await scope();
+            co_await sc.call(&ret, bop, std::invoke(proj, *head), std::invoke(proj, *(head + 1)));
+            co_await sc.join();
+            co_return ret;
+          } else {
+            co_return std::invoke(bop, std::invoke(proj, *head), std::invoke(proj, *(head + 1)));
+          }
+        }
+    }
+
+    auto mid = head + (len / 2); // recursive case (len > 2): split the range in half
+
+    result_type lhs;
+    result_type rhs;
+
+    {
+      auto sc = co_await scope();
+      co_await sc.fork(&lhs, fold_impl{}, head, mid, bop, proj);            // left half: forked child
+      co_await sc.call(&rhs, fold_impl{}, mid, tail, bop, std::move(proj)); // right half: called child
+      co_await sc.join(); // lhs and rhs are read only after this join
+    }
+
+    if constexpr (async::indirect_semigroup<Bop, X, projected<X, I, Proj>>) {
+      auto sc = co_await scope();
+      co_await sc.call(&lhs, bop, std::move(lhs), std::move(rhs));
+      co_await sc.join();
+      co_return lhs;
+    } else {
+      co_return std::invoke(bop, std::move(lhs), std::move(rhs));
+    }
+  }
+};
+
+struct fold_fn {
+ private:
+  template <typename T>
+  using iter_diff_t = std::iter_difference_t<T>;
+
+  template <typename T>
+  using range_diff_t = std::ranges::range_difference_t<T>;
+
+  template <typename Range>
+  using iterator_t = std::ranges::iterator_t<Range>;
+  // Element type of the optional that every overload returns (see task_t below).
+  template <typename Context, typename I, typename Proj, typename Bop>
+  using result_t = lf::indirect_semigroup_t<Bop, Context, projected<Context, I, Proj>>;
+  // Every overload returns a task of std::optional<result_t>; an empty range produces std::nullopt.
+  template <typename Context, typename I, typename Proj, typename Bop>
+  using task_t = lf::task<std::optional<result_t<Context, I, Proj, Bop>>, Context>;
+
+ public:
+  // (1) iterator-pair, chunk size n >= 1
+  template <worker_context X,                                 //
+            std::random_access_iterator I,                    //
+            std::sized_sentinel_for<I> S,                     //
+            projectable<X, I> Proj = std::identity,           //
+            indirectly_foldable<X, projected<X, I, Proj>> Bop //
+            >
+  static auto
+  operator()(env<X>, I head, S tail, iter_diff_t<I> n, Bop bop, Proj proj = {}) -> task_t<X, I, Proj, Bop> {
+    // A non-positive chunk size is rejected with fold_chunk_error before the range is inspected.
+    if (n <= 0) {
+      LF_THROW(fold_chunk_error{});
+    }
+    // Nothing to fold: report the empty range as std::nullopt.
+    if (tail == head) {
+      co_return std::nullopt;
+    }
+
+    result_t<X, I, Proj, Bop> result;
+    // Non-empty range: fold_impl writes into `result`; join before it is returned.
+    {
+      auto sc = co_await scope();
+      co_await sc.call(&result, fold_impl{}, head, tail, n, std::move(bop), std::move(proj));
+      co_await sc.join();
+    }
+
+    co_return std::move(result);
+  }
+
+  // (2) iterator-pair, n == 1 specialization (no n parameter)
+  template <worker_context X,                                 //
+            std::random_access_iterator I,                    //
+            std::sized_sentinel_for<I> S,                     //
+            projectable<X, I> Proj = std::identity,           //
+            indirectly_foldable<X, projected<X, I, Proj>> Bop //
+            >
+  static auto operator()(env<X>, I head, S tail, Bop bop, Proj proj = {}) -> task_t<X, I, Proj, Bop> {
+    // Nothing to fold: report the empty range as std::nullopt.
+    if (tail == head) {
+      co_return std::nullopt;
+    }
+
+    result_t<X, I, Proj, Bop> result;
+    // Non-empty range: fold_impl writes into `result`; join before it is returned.
+    {
+      auto sc = co_await scope();
+      co_await sc.call(&result, fold_impl{}, head, tail, std::move(bop), std::move(proj));
+      co_await sc.join();
+    }
+
+    co_return std::move(result);
+  }
+
+  // clang-format off
+
+  // (3) range + n -> dispatches to (1)
+  template <worker_context X,                                                 //
+            sized_random_access_range Range,                                  //
+            projectable<X, iterator_t<Range>> Proj = std::identity,           //
+            indirectly_foldable<X, projected<X, iterator_t<Range>, Proj>> Bop //
+            >
+  static auto
+  operator()(env<X> context, Range &&range, range_diff_t<Range> n, Bop bop, Proj proj = {}) -> task_t<X, iterator_t<Range>, Proj, Bop> {
+    return fold_fn{}(context, std::ranges::begin(range), std::ranges::end(range), n, std::move(bop), std::move(proj));
+  }
+
+  // (4) range, n == 1 -> dispatches to (2)
+  template <worker_context X,                                                 //
+            sized_random_access_range Range,                                  //
+            projectable<X, iterator_t<Range>> Proj = std::identity,           //
+            indirectly_foldable<X, projected<X, iterator_t<Range>, Proj>> Bop //
+            >
+  static auto
+  operator()(env<X> context, Range &&range, Bop bop, Proj proj = {}) -> task_t<X, iterator_t<Range>, Proj, Bop> {
+    return fold_fn{}(context, std::ranges::begin(range), std::ranges::end(range), std::move(bop), std::move(proj));
+  }
+
+  // clang-format on
+};
+
+export inline constexpr fold_fn fold = {};
+
+} // namespace lf
diff --git a/src/algorithm/for_each.cxx b/src/algorithm/for_each.cxx
new file mode 100644
index 000000000..ff1b0ed55
--- /dev/null
+++ b/src/algorithm/for_each.cxx
@@ -0,0 +1,119 @@
+module;
+#include "libfork/__impl/assume.hpp"
+export module libfork.algorithm:for_each;
+
+import std;
+
+import libfork.core;
+
+import :concepts;
+
+namespace lf {
+
+// TODO: relax the constraints?
+//
+// consider a projection if it makes the concept unifiable with fold etc.
+
+struct for_each_impl {
+ private:
+  template <worker_context Context>
+  using task = lf::task<void, Context>;
+
+  template <typename T>
+  using iter_difference_t = std::iter_difference_t<T>;
+
+  template <typename T>
+  using range_difference_t = std::ranges::range_difference_t<T>;
+
+ public:
+  // (1) iterator-pair, chunk size n >= 1: ranges of at most n elements are not split further
+  template <worker_context Context, std::random_access_iterator I, std::sized_sentinel_for<I> S, typename Fn>
+    requires indirectly_regular_unary_invocable<Fn, Context, I>
+  static auto
+  operator()(env<Context> /* env */, I head, S tail, iter_difference_t<I> n, Fn fn) -> task<Context> {
+    // Preconditions are stated with LF_ASSUME: a positive chunk size and a non-negative length.
+    LF_ASSUME(n > 0);
+
+    auto len = tail - head;
+
+    LF_ASSUME(len >= 0);
+    // Base case: at most n elements remain, so fn is applied to each one without splitting.
+    if (len <= n) {
+      if constexpr (async::indirectly_regular_unary_invocable<Fn, Context, I>) {
+        // Prefer the async form of fn: each call goes through the scope, joined once at the end
+        auto sc = co_await scope();
+        for (; head != tail; ++head) {
+          co_await sc.call_drop(fn, *head);
+        }
+        co_await sc.join();
+      } else {
+        for (; head != tail; ++head) {
+          std::invoke(fn, *head);
+        }
+      }
+      co_return;
+    }
+    // Recursive case: halve the range, fork the left half and call the right half, then join.
+    auto mid = head + (len / 2);
+    auto sc = co_await scope();
+    co_await sc.fork(for_each_impl{}, head, mid, n, fn);
+    co_await sc.call(for_each_impl{}, mid, tail, n, std::move(fn));
+    co_await sc.join();
+  }
+
+  // (2) iterator-pair, n == 1 specialization (no n parameter)
+  template <worker_context Context, std::random_access_iterator I, std::sized_sentinel_for<I> S, typename Fn>
+    requires indirectly_regular_unary_invocable<Fn, Context, I>
+  static auto operator()(env<Context> /* env */, I head, S tail, Fn fn) -> task<Context> {
+    // Lengths 0 and 1 are handled directly below; longer ranges are halved as in (1).
+    auto len = tail - head;
+
+    LF_ASSUME(len >= 0);
+
+    switch (len) {
+      case 0:
+        co_return;
+      case 1:
+        if constexpr (async::indirectly_regular_unary_invocable<Fn, Context, I>) {
+          auto sc = co_await scope();
+          co_await sc.call_drop(fn, *head);
+          co_await sc.join();
+        } else {
+          std::invoke(fn, *head);
+        }
+        co_return;
+    }
+
+    // TODO: round mid-point up
+    // Special case for (case 2/3) for async invocables to avoid the redundant task
+    // Benchmarks for ^ to see if it matters
+
+    auto mid = head + (len / 2);
+    auto sc = co_await scope();
+    co_await sc.fork(for_each_impl{}, head, mid, fn);
+    co_await sc.call(for_each_impl{}, mid, tail, std::move(fn));
+    co_await sc.join();
+  }
+
+  // (3) range + n -> dispatches to (1) or (2)
+  template <worker_context Context, sized_random_access_range Range, typename Fn>
+    requires indirectly_regular_unary_invocable<Fn, Context, std::ranges::iterator_t<Range>>
+  static auto
+  operator()(env<Context> context, Range &&range, range_difference_t<Range> n, Fn fn) -> task<Context> {
+    if (n == 1) {
+      return for_each_impl{}(context, std::ranges::begin(range), std::ranges::end(range), std::move(fn));
+    }
+    return for_each_impl{}(context, std::ranges::begin(range), std::ranges::end(range), n, std::move(fn));
+  }
+
+  // (4) range, n == 1 -> dispatches to (2)
+  template <worker_context Context, sized_random_access_range Range, typename Fn>
+    requires indirectly_regular_unary_invocable<Fn, Context, std::ranges::iterator_t<Range>>
+  static auto operator()(env<Context> context, Range &&range, Fn fn) -> task<Context> {
+    return for_each_impl{}(context, std::ranges::begin(range), std::ranges::end(range), std::move(fn));
+  }
+};
+
+export inline constexpr for_each_impl for_each = {};
+
+} // namespace lf
diff --git a/src/batteries/adaptor_stack.cxx b/src/batteries/adaptor_stack.cxx
new file mode 100644
index 000000000..2c333b7b7
--- /dev/null
+++ b/src/batteries/adaptor_stack.cxx
@@ -0,0 +1,117 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/compiler.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.batteries:adaptor_stack;
+
+import std;
+
+import libfork.utils;
+
+namespace lf {
+
+/**
+ * @brief An adaptor_stack wraps a standard allocator to satisfy the worker_stack concept.
+ *
+ * Every push/pop directly allocates/deallocates through the allocator. This is the simplest
+ * possible stack implementation — no caching, no geometric growth — just a thin wrapper.
+ *
+ * For this to conform to `worker_stack` the allocator's void pointer type must be `void *`.
+ */
+export template <allocator_of<std::byte> Allocator = std::allocator<std::byte>>
+class adaptor_stack {
+
+  struct alignas(k_new_align) aligned {};
+
+  static_assert(sizeof(aligned) == k_new_align);
+
+  using align_trait = std::allocator_traits<Allocator>::template rebind_traits<aligned>;
+  using align_alloc = align_trait::allocator_type;
+
+  using alloc_ptr = align_trait::pointer;
+  using void_ptr = align_trait::void_pointer;
+
+  using size_type = align_trait::size_type;
+
+  struct release_t {
+    explicit constexpr release_t(key_t /*unused*/) noexcept {}
+  };
+
+  class checkpoint_t {
+   public:
+    constexpr checkpoint_t() noexcept = default;
+    constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default;
+
+   private:
+    friend adaptor_stack;
+    explicit constexpr checkpoint_t(align_alloc const &alloc) noexcept
+        : m_alloc(alloc) {}
+
+    struct empty {
+      constexpr empty() noexcept = default;
+      constexpr auto operator==(empty const &) const noexcept -> bool = default;
+      explicit constexpr empty(align_alloc const & /*unused*/) noexcept {}
+    };
+
+    std::conditional_t<align_trait::is_always_equal::value, empty, align_alloc> m_alloc;
+  };
+
+ public:
+  constexpr adaptor_stack() noexcept(noexcept(Allocator()))
+      : adaptor_stack(Allocator()) {}
+  explicit constexpr adaptor_stack(Allocator const &alloc) noexcept
+      : m_alloc(alloc) {}
+
+  // TODO: drop constexpr for =default and =delete across the lib
+
+  constexpr adaptor_stack(adaptor_stack const &) = delete;
+  constexpr adaptor_stack(adaptor_stack &&) = delete;
+
+  constexpr auto operator=(adaptor_stack const &) -> adaptor_stack & = delete;
+  constexpr auto operator=(adaptor_stack &&) -> adaptor_stack & = delete;
+
+  /**
+   * @brief Get a checkpoint of the stack.
+   */
+  [[nodiscard]]
+  constexpr auto checkpoint() noexcept -> checkpoint_t {
+    return checkpoint_t{m_alloc};
+  }
+
+  /**
+   * @brief Allocate size bytes and return a pointer to the allocation.
+   */
+  [[nodiscard]]
+  constexpr auto push(std::size_t size) -> void_ptr {
+    LF_ASSUME(size > 0);
+    size_type num_aligned = safe_cast<size_type>(((size - 1) / k_new_align) + 1);
+    return static_cast<void_ptr>(align_trait::allocate(m_alloc, num_aligned));
+  }
+
+  /**
+   * @brief Deallocate the allocation at ptr; size must round to the block count that push() allocated.
+   */
+  constexpr void pop(void_ptr ptr, std::size_t size) noexcept {
+    LF_ASSUME(size > 0);
+    size_type const num_aligned = safe_cast<size_type>(((size - 1) / k_new_align) + 1); // as in push()
+    align_trait::deallocate(m_alloc, static_cast<alloc_ptr>(ptr), num_aligned);
+  }
+
+  [[nodiscard]]
+  constexpr auto prepare_release() const noexcept -> release_t {
+    return release_t{key()};
+  }
+
+  constexpr void release([[maybe_unused]] release_t key) noexcept {}
+
+  constexpr void acquire(checkpoint_t const &ckpt) noexcept {
+    if constexpr (!align_trait::is_always_equal::value) {
+      m_alloc = ckpt.m_alloc;
+    }
+  }
+
+ private:
+  align_alloc m_alloc;
+};
+
+} // namespace lf
diff --git a/src/batteries/adaptors.cxx b/src/batteries/adaptors.cxx
new file mode 100644
index 000000000..243fe043a
--- /dev/null
+++ b/src/batteries/adaptors.cxx
@@ -0,0 +1,71 @@
+export module libfork.batteries:adaptors;
+
+import std;
+
+import libfork.core;
+import libfork.utils;
+
+import :deque;
+
+namespace lf {
+
+export template <allocator_of<unsafe_steal_handle> Allocator = std::allocator<unsafe_steal_handle>>
+class adapt_vector {
+ public:
+  // Constructors: default-construct the allocator, or hand the supplied one to the vector.
+  constexpr adapt_vector() noexcept(noexcept(Allocator())) : adapt_vector(Allocator()) {}
+  explicit constexpr adapt_vector(Allocator const &alloc) noexcept : m_vector(alloc) {}
+
+  // Append `value` at the back of the vector.
+  constexpr void push(unsafe_steal_handle value) { m_vector.push_back(value); }
+
+  // Remove and return the last pushed handle, or a value-initialized handle when the vector is empty.
+  constexpr auto pop() noexcept -> unsafe_steal_handle {
+    if (m_vector.empty()) {
+      return {};
+    }
+    unsafe_steal_handle top = m_vector.back();
+    m_vector.pop_back();
+    return top;
+  }
+
+ private:
+  std::vector<unsafe_steal_handle, Allocator> m_vector;
+};
+
+export template <allocator_of<std::atomic<unsafe_steal_handle>> Allocator =
+                     std::allocator<std::atomic<unsafe_steal_handle>>>
+class adapt_deque {
+ public:
+  using size_type = deque<unsafe_steal_handle, Allocator>::size_type;
+
+ private:
+  static constexpr size_type k_default_capacity = 32 * 1024;
+
+ public:
+  // Default: k_default_capacity slots and a default-constructed allocator.
+  constexpr adapt_deque() : adapt_deque(k_default_capacity, Allocator()) {}
+
+  // Build the underlying deque from the requested capacity and allocator.
+  explicit constexpr adapt_deque(size_type capacity, Allocator const &alloc = Allocator())
+      : m_deque{capacity, alloc} {}
+
+  // Forward `value` to the underlying deque; the size it returns is discarded.
+  constexpr void push(unsafe_steal_handle value) { m_deque.push(value); }
+
+  // Pop from the underlying deque, yielding a value-initialized handle when it has nothing to give.
+  constexpr auto pop() noexcept -> unsafe_steal_handle {
+    return m_deque.pop([] static noexcept -> unsafe_steal_handle { return {}; });
+  }
+
+  // Attempt one steal; any failure (empty deque or lost race) yields a value-initialized handle.
+  constexpr auto steal() noexcept -> unsafe_steal_handle {
+    auto const attempt = m_deque.thief().steal();
+    return attempt ? attempt.val : unsafe_steal_handle{};
+  }
+
+ private:
+  deque<unsafe_steal_handle, Allocator> m_deque;
+};
+
+} // namespace lf
diff --git a/src/batteries/batteries.cxx b/src/batteries/batteries.cxx
new file mode 100644
index 000000000..c58f0e23b
--- /dev/null
+++ b/src/batteries/batteries.cxx
@@ -0,0 +1,8 @@
+export module libfork.batteries;
+
+export import :deque;
+export import :geometric_stack;
+export import :adaptor_stack;
+export import :slab_stack;
+export import :adaptors;
+export import :contexts;
diff --git a/src/batteries/contexts.cxx b/src/batteries/contexts.cxx
new file mode 100644
index 000000000..15319beee
--- /dev/null
+++ b/src/batteries/contexts.cxx
@@ -0,0 +1,115 @@
+module;
+#include "libfork/__impl/exception.hpp"
+export module libfork.batteries:contexts;
+
+import std;
+
+import libfork.core;
+import libfork.utils;
+
+namespace lf {
+
+// =================== Context Policies =================== //
+
+/**
+ * @brief The simplest context policy is just a LIFO stack of type-erased handles.
+ *
+ * Context policies (unlike full contexts) are not aware of the full context
+ * type hence, operate on untyped handles. This is inherently unsafe. To
+ * prevent UB a policy must not give-out the handles it receives. All
+ * operations must be managed through either `derived_poly_context` or
+ * `mono_context`.
+ */
+export template <typename T>
+concept deque_policy = lifo_stack<T, unsafe_steal_handle>;
+
+// TODO: consider the methods/concepts needed for an auto/scheduling worker
+// context that has a `post` method.
+
+/**
+ * @brief An extension of `deque_policy` that supports FIFO stealing of handles.
+ */
+export template <typename T>
+concept stealable_deque_policy = deque_policy<T> && requires (T &policy) {
+  { policy.steal() } -> std::same_as<unsafe_steal_handle>;
+};
+
+// =================== Contexts =================== //
+
+template <typename Base, worker_stack Stack, deque_policy Deque, typename ContextType>
+class context_base : public Base {
+ public:
+  constexpr context_base() = default;
+
+  template <typename... StackArgs, typename... DequeArgs>
+    requires std::constructible_from<Stack, StackArgs...> && std::constructible_from<Deque, DequeArgs...>
+  constexpr context_base(
+      std::piecewise_construct_t,
+      std::tuple<StackArgs...> stack_args,
+      std::tuple<DequeArgs...> deque_args) noexcept(std::is_nothrow_constructible_v<Stack, StackArgs...> &&
+                                                    std::is_nothrow_constructible_v<Deque, DequeArgs...>)
+      : context_base(std::move(stack_args),
+                     std::move(deque_args),
+                     std::index_sequence_for<StackArgs...>{},
+                     std::index_sequence_for<DequeArgs...>{}) {}
+
+  [[nodiscard]]
+  constexpr auto steal() noexcept(noexcept(m_container.steal())) -> steal_handle<ContextType>
+    requires stealable_deque_policy<Deque>
+  {
+    return {key(), get(key(), m_container.steal())};
+  }
+
+ protected:
+  Deque m_container;
+
+ private:
+  template <typename... StackArgs, typename... DequeArgs, std::size_t... Is, std::size_t... Js>
+  constexpr context_base(
+      std::tuple<StackArgs...> stack_args,
+      std::tuple<DequeArgs...> deque_args,
+      std::index_sequence<Is...>,
+      std::index_sequence<Js...>) noexcept(std::is_nothrow_constructible_v<Stack, StackArgs...> &&
+                                           std::is_nothrow_constructible_v<Deque, DequeArgs...>)
+      : Base(std::get<Is>(std::move(stack_args))...),
+        m_container(std::get<Js>(std::move(deque_args))...) {}
+};
+
+/**
+ * @brief A polymorphic worker context composed of a `worker_stack` and a `deque_policy`.
+ */
+export template <worker_stack Stack, deque_policy Deque>
+class derived_poly_context : public context_base<poly_context<Stack>, Stack, Deque, poly_context<Stack>> {
+  using base = context_base<poly_context<Stack>, Stack, Deque, poly_context<Stack>>;
+
+ public:
+  using context_type = poly_context<Stack>;
+
+  using base::base;
+
+  constexpr void push(steal_handle<context_type> handle) final { this->m_container.push(handle); }
+
+  constexpr auto pop() noexcept -> steal_handle<context_type> final {
+    return {key(), get(key(), this->m_container.pop())};
+  }
+};
+
+export template <worker_stack Stack, deque_policy Deque>
+class mono_context : public context_base<base_context<Stack>, Stack, Deque, mono_context<Stack, Deque>> {
+  using base = context_base<base_context<Stack>, Stack, Deque, mono_context>;
+
+ public:
+  using context_type = mono_context;
+
+  using base::base;
+
+  constexpr void push(steal_handle<context_type> handle) noexcept(noexcept(this->m_container.push(handle))) {
+    this->m_container.push(handle);
+  }
+
+  constexpr auto pop() noexcept -> steal_handle<context_type> {
+    return {key(), get(key(), this->m_container.pop())};
+  }
+};
+
+} // namespace lf
diff --git a/src/batteries/deque.cxx b/src/batteries/deque.cxx
new file mode 100644
index 000000000..254d18d67
--- /dev/null
+++ b/src/batteries/deque.cxx
@@ -0,0 +1,462 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.batteries:deque;
+
+import std;
+
+import libfork.core;
+import libfork.utils;
+
+namespace lf {
+
+/**
+ * @brief Test if a type is suitable for use with `lf::deque`.
+ *
+ * This requires it to be `lf::lock_free` and `std::default_initializable`.
+ */
+export template <typename T>
+concept dequeable = lock_free<T> && std::default_initializable<T>;
+
+/**
+ * @brief Thrown when a push operation fails because the deque is full.
+ */
+export struct deque_full_error final : libfork_exception {
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "push failed because deque is full";
+  }
+};
+
+/**
+ * @brief A basic wrapper around a c-style array that provides modulo load/stores.
+ *
+ * This class is designed for internal use only. It provides a c-style API that is
+ * used efficiently by deque for low level atomic operations.
+ *
+ * @tparam T The type of the elements in the array.
+ */
+template <dequeable T, allocator_of<std::atomic<T>> Allocator>
+struct atomic_ring_buf {
+ private:
+  using traits = std::allocator_traits<Allocator>;
+  using pointer = traits::pointer;
+
+ public:
+  using diff_type = traits::difference_type;
+  using size_type = traits::size_type;
+
+  /**
+   * @brief Construct a new ring buffer object.
+   *
+   * @param cap The capacity of the buffer, MUST be a power of 2.
+   * @param alloc The allocator used to allocate the buffer.
+   */
+  constexpr atomic_ring_buf(diff_type cap, Allocator const &alloc)
+      : m_alloc{alloc},
+        m_cap{cap},
+        m_mask{cap - 1} {
+
+    LF_ASSUME(cap > 0 && std::has_single_bit(safe_cast<size_type>(cap)));
+
+    m_buf = traits::allocate(m_alloc, safe_cast<size_type>(cap));
+
+    diff_type i = 0;
+
+    LF_TRY {
+      // Begin the lifetime of each atomic.
+      for (; i < cap; ++i) {
+        traits::construct(m_alloc, std::to_address(m_buf + i));
+      }
+    } LF_CATCH_ALL {
+      clean_up(i);
+      LF_RETHROW;
+    }
+  }
+
+  atomic_ring_buf(atomic_ring_buf const &) = delete;
+  atomic_ring_buf(atomic_ring_buf &&) = delete;
+  auto operator=(atomic_ring_buf const &) -> atomic_ring_buf & = delete;
+  auto operator=(atomic_ring_buf &&) -> atomic_ring_buf & = delete;
+
+  constexpr ~atomic_ring_buf() noexcept { clean_up(m_cap); }
+
+  /**
+   * @brief Get the capacity of the buffer.
+   */
+  [[nodiscard]]
+  constexpr auto capacity() const noexcept -> diff_type {
+    return m_cap;
+  }
+  /**
+   * @brief Store ``val`` at ``index % this->capacity()``.
+   */
+  constexpr auto store(diff_type index, T const &val) noexcept -> void {
+    LF_ASSUME(index >= 0);
+    std::to_address(m_buf + (index & m_mask))->store(val, std::memory_order_relaxed);
+  }
+  /**
+   * @brief Load value at ``index % this->capacity()``.
+   */
+  [[nodiscard]]
+  constexpr auto load(diff_type index) const noexcept -> T {
+    LF_ASSUME(index >= 0);
+    return std::to_address(m_buf + (index & m_mask))->load(std::memory_order_relaxed);
+  }
+
+ private:
+  /**
+   * @brief Destroy the first `n` elements and deallocate the buffer.
+   */
+  constexpr void clean_up(diff_type n) noexcept {
+
+    LF_ASSUME(0 <= n && n <= m_cap);
+
+    for (diff_type i = n - 1; i >= 0; --i) {
+      traits::destroy(m_alloc, std::to_address(m_buf + i));
+    }
+    traits::deallocate(m_alloc, m_buf, safe_cast<size_type>(m_cap));
+  }
+
+  [[no_unique_address]]
+  Allocator m_alloc;
+  pointer m_buf{};
+  diff_type m_cap;
+  diff_type m_mask;
+};
+
+/**
+ * @brief Error codes for ``deque`` 's ``steal()`` operation.
+ */
+export enum class err : std::uint8_t {
+  /**
+   * @brief The ``steal()`` operation succeeded.
+   */
+  none = 0,
+  /**
+   * @brief  Lost the ``steal()`` race hence, the ``steal()`` operation failed.
+   */
+  lost,
+  /**
+   * @brief The deque is empty and hence, the ``steal()`` operation failed.
+   */
+  empty,
+};
+
+/**
+ * @brief The return type of a `lf::deque` `steal()` operation.
+ *
+ * This type is suitable for structured bindings. We return a custom type instead of a
+ * `std::optional` to allow for more information to be returned as to why a steal may fail.
+ */
+export template <typename T>
+struct steal_t {
+  /**
+   * @brief Check if the operation succeeded.
+   */
+  [[nodiscard]]
+  constexpr explicit operator bool() const noexcept {
+    return code == err::none;
+  }
+  /**
+   * @brief Get the value like ``std::optional``.
+   *
+   * Requires ``code == err::none`` .
+   */
+  [[nodiscard]]
+  constexpr auto operator*() const noexcept -> T {
+    LF_ASSUME(code == err::none);
+    return val;
+  }
+  /**
+   * @brief Get a pointer to the value like ``std::optional``.
+   *
+   * Requires ``code == err::none`` .
+   */
+  [[nodiscard]]
+  constexpr auto operator->() const noexcept -> T const * {
+    LF_ASSUME(code == err::none);
+    return std::addressof(val);
+  }
+  /**
+   * @brief The error code of the ``steal()`` operation.
+   */
+  err code;
+  /**
+   * @brief The value stolen from the deque, only valid if ``code == err::none``.
+   */
+  T val;
+};
+
+/**
+ * @brief Stateless callable producing a disengaged ``std::optional<T>``.
+ */
+template <typename T>
+struct return_nullopt {
+  /**
+   * @brief Yields an empty ``std::optional<T>``; never throws.
+   */
+  [[nodiscard]]
+  static constexpr auto operator()() noexcept -> std::optional<T> {
+    return std::nullopt;
+  }
+};
+
+/**
+ * @brief A bounded lock-free single-producer multiple-consumer work-stealing deque.
+ *
+ * Implements the "Chase-Lev" deque described in the papers, `"Dynamic Circular Work-Stealing deque"
+ * <https://doi.org/10.1145/1073970.1073974>`_ and `"Correct and Efficient Work-Stealing for Weak
+ * Memory Models" <https://doi.org/10.1145/2442516.2442524>`_.
+ *
+ * Only the deque owner can perform ``pop()`` and ``push()`` operations where the deque behaves
+ * like a LIFO stack. Others can (only) ``steal()`` data from the deque, they see a FIFO deque.
+ * All threads must have finished using the deque before it is destructed.
+ *
+ * Also see:
+ *
+ * - Rust: https://github.com/crossbeam-rs/crossbeam/blob/master/crossbeam-deque/src/deque.rs
+ * - CDSC: https://dl.acm.org/doi/epdf/10.1145/2544173.2509514
+ *
+ * @tparam T The type of the elements in the deque.
+ */
+export template <dequeable T, allocator_of<std::atomic<T>> Allocator = std::allocator<std::atomic<T>>>
+class deque {
+ public:
+  using diff_type = atomic_ring_buf<T, Allocator>::diff_type;
+  using size_type = atomic_ring_buf<T, Allocator>::size_type;
+
+  using value_type = T;
+  using allocator_type = Allocator;
+
+  deque(deque const &) = delete;
+  deque(deque &&) = delete;
+  auto operator=(deque const &) -> deque & = delete;
+  auto operator=(deque &&) -> deque & = delete;
+
+  /**
+   * @brief A non-owning handle that can be used to steal items from the deque.
+   *
+   * All non-owner interactions with the deque should be made through this handle.
+   */
+  class thief_handle {
+
+    friend class deque;
+
+    explicit thief_handle(deque *queue) noexcept
+        : m_queue{queue} {
+      LF_ASSUME(queue != nullptr);
+    }
+
+   public:
+    /**
+     * @brief Check if the deque is empty.
+     */
+    [[nodiscard]]
+    constexpr auto empty(this thief_handle self) noexcept -> bool {
+      diff_type const top = self.m_queue->m_top.load(acquire);
+      std::atomic_thread_fence(seq_cst);
+      diff_type const bottom = self.m_queue->m_bottom.load(acquire);
+      return top >= bottom;
+    }
+    /**
+     * @brief Get the number of elements in the deque.
+     */
+    [[nodiscard]]
+    constexpr auto size(this thief_handle self) noexcept -> size_type {
+      return safe_cast<size_type>(self.ssize());
+    }
+    /**
+     * @brief Get the number of elements in the deque as a signed integer.
+     */
+    [[nodiscard]]
+    constexpr auto ssize(this thief_handle self) noexcept -> diff_type {
+      diff_type const top = self.m_queue->m_top.load(acquire);
+      std::atomic_thread_fence(seq_cst);
+      diff_type const bottom = self.m_queue->m_bottom.load(acquire);
+      return std::max(bottom - top, diff_type{0});
+    }
+    /**
+     * @brief Get the capacity of the deque.
+     */
+    [[nodiscard]]
+    constexpr auto capacity(this thief_handle self) noexcept -> diff_type {
+      return self.m_queue->capacity();
+    }
+    /**
+     * @brief Steal an item from the deque.
+     *
+     * Any threads can try to steal an item from the deque. This operation can
+     * fail if the deque is empty or if another thread simultaneously stole an
+     * item from the deque.
+     */
+    constexpr auto steal(this thief_handle self) noexcept -> steal_t<T> {
+      //
+      diff_type top = self.m_queue->m_top.load(acquire);
+      std::atomic_thread_fence(seq_cst);
+      diff_type const bottom = self.m_queue->m_bottom.load(acquire);
+
+      if (top < bottom) {
+        // Must load *before* acquiring the slot as slot may be overwritten immediately after
+        // acquiring. This load is NOT required to be atomic even-though it may race with an overwrite
+        // as we only return the value if we win the race below guaranteeing we had no race during our
+        // read. If we lose the race then 'tmp' could be corrupt due to read-during-write race but as T
+        // is trivially destructible this does not matter.
+        T tmp = self.m_queue->m_buf.load(top);
+
+        static_assert(std::is_trivially_destructible_v<T>, "'atomicable' should guarantee this already");
+
+        if (!self.m_queue->m_top.compare_exchange_strong(top, top + 1, seq_cst, relaxed)) {
+          return {.code = err::lost, .val = {}};
+        }
+        return {.code = err::none, .val = tmp};
+      }
+      return {.code = err::empty, .val = {}};
+    }
+
+   private:
+    deque *m_queue;
+  };
+
+  /**
+   * @brief Construct a new empty deque object.
+   *
+   * @param cap The capacity of the deque (will be rounded to the next power of two).
+   * @param alloc Allocator used to allocate the internal buffer.
+   */
+  constexpr explicit deque(size_type cap, Allocator const &alloc = Allocator())
+      : m_buf(round_capacity(cap), alloc) {}
+
+  /**
+   * @brief Check if the deque is empty.
+   */
+  [[nodiscard]]
+  constexpr auto empty() const noexcept -> bool {
+    diff_type const bottom = m_bottom.load(relaxed);
+    diff_type const top = m_top.load(seq_cst);
+    return top >= bottom;
+  }
+  /**
+   * @brief Get the number of elements in the deque.
+   */
+  [[nodiscard]]
+  constexpr auto size() const noexcept -> size_type {
+    return safe_cast<size_type>(ssize());
+  }
+  /**
+   * @brief Get the number of elements in the deque as a signed integer.
+   */
+  [[nodiscard]]
+  constexpr auto ssize() const noexcept -> diff_type {
+    diff_type const bottom = m_bottom.load(relaxed);
+    diff_type const top = m_top.load(seq_cst);
+    return std::max(bottom - top, diff_type{0});
+  }
+  /**
+   * @brief Get the capacity of the deque.
+   */
+  [[nodiscard]]
+  constexpr auto capacity() const noexcept -> diff_type {
+    return m_buf.capacity();
+  }
+  /**
+   * @brief Get a non-owning `thief_handle` that can be used to steal items from the deque.
+   */
+  constexpr auto thief() noexcept -> thief_handle { return thief_handle{this}; }
+
+  /**
+   * @brief Push an item into the deque.
+   *
+   * Only the owner thread can insert an item into the deque. This will throw
+   * an exception if the deque is full. This returns the number of elements in
+   * the deque before the push.
+   *
+   * @param val Value to add to the deque.
+   */
+  constexpr auto push(T val) -> diff_type {
+
+    diff_type const bottom = m_bottom.load(relaxed);
+    diff_type const top = m_top.load(acquire);
+    diff_type const ssize = bottom - top;
+
+    if (m_buf.capacity() < ssize + 1) {
+      LF_THROW(deque_full_error{});
+    }
+
+    // Construct new object, this does not have to be atomic as no one can steal
+    // this item until after we store the new value of bottom, ordering is
+    // maintained by surrounding atomics.
+    m_buf.store(bottom, val);
+
+    std::atomic_thread_fence(release);
+    m_bottom.store(bottom + 1, relaxed);
+
+    // This was the size just before the push, upon return the size could be any
+    // smaller number, down to zero, as stealers could have stolen all the
+    // tasks.
+    return ssize;
+  }
+
+  /**
+   * @brief Pop an item from the deque.
+   *
+   * Only the owner thread can pop out an item from the deque. If the buffer is
+   * empty calls `when_empty` and returns the result. By default, `when_empty`
+   * is a no-op that returns a null `std::optional<T>`.
+   */
+  template <std::invocable Fn = return_nullopt<T>>
+    requires std::convertible_to<T, std::invoke_result_t<Fn>>
+  constexpr auto
+  pop(Fn &&when_empty = {}) noexcept(std::is_nothrow_invocable_v<Fn>) -> std::invoke_result_t<Fn> {
+
+    diff_type const bottom = m_bottom.load(relaxed) - 1; //
+    m_bottom.store(bottom, relaxed);                     // Stealers can no longer steal.
+
+    std::atomic_thread_fence(seq_cst);
+
+    diff_type top = m_top.load(relaxed);
+
+    if (top <= bottom) {
+      // Non-empty deque
+
+      // This load is not required to be atomic as we are the exclusive writer.
+      T val = m_buf.load(bottom);
+
+      if (top == bottom) {
+        // The last item could get stolen, by a stealer that loaded bottom before our write above.
+        if (!m_top.compare_exchange_strong(top, top + 1, seq_cst, relaxed)) {
+          // Failed race, thief got the last item.
+          m_bottom.store(bottom + 1, relaxed);
+          return std::invoke(std::forward<Fn>(when_empty));
+        }
+        m_bottom.store(bottom + 1, relaxed);
+      }
+      return val;
+    }
+    m_bottom.store(bottom + 1, relaxed);
+    return std::invoke(std::forward<Fn>(when_empty));
+  }
+
+ private:
+  alignas(k_cache_line) atomic_ring_buf<T, Allocator> m_buf;
+  alignas(k_cache_line) std::atomic<diff_type> m_top{0};
+  alignas(k_cache_line) std::atomic<diff_type> m_bottom{0};
+
+  // Convenience aliases for the memory orders this class uses. There is deliberately no `consume`
+  // alias: nothing here used it and std::memory_order_consume is deprecated as of C++26.
+  static constexpr std::memory_order relaxed = std::memory_order_relaxed;
+  static constexpr std::memory_order acquire = std::memory_order_acquire;
+  static constexpr std::memory_order release = std::memory_order_release;
+  static constexpr std::memory_order seq_cst = std::memory_order_seq_cst;
+
+  /** @brief Smallest power of two that is not below `cap`, returned as a `diff_type`. */
+  static constexpr auto round_capacity(size_type cap) -> diff_type {
+    constexpr size_type diff_max = safe_cast<size_type>(std::numeric_limits<diff_type>::max());
+    constexpr size_type largest_pow2 = std::bit_floor(diff_max);
+    // The assumption keeps std::bit_ceil(cap) within the range of diff_type.
+    LF_ASSUME(cap > 0 && cap <= largest_pow2);
+    return safe_cast<diff_type>(std::bit_ceil(cap));
+  }
+};
+
+} // namespace lf
diff --git a/src/batteries/geometric_stack.cxx b/src/batteries/geometric_stack.cxx
new file mode 100644
index 000000000..c34648442
--- /dev/null
+++ b/src/batteries/geometric_stack.cxx
@@ -0,0 +1,449 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/compiler.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.batteries:geometric_stack;
+
+import std;
+
+import libfork.utils;
+
+namespace lf {
+
+/**
+ * @brief A geometric_stack is a user-space (geometric-growth) segmented program stack.
+ *
+ * This protects against hot-splitting by keeping a single cached segment.
+ *
+ * For this to conform to `worker_stack` the allocator's void pointer type must be `void *`.
+ */
+export template <allocator_of<std::byte> Allocator = std::allocator<std::byte>>
+class geometric_stack {
+
+  struct ctrl;
+  struct node;
+
+  using ctrl_traits = std::allocator_traits<Allocator>::template rebind_traits<ctrl>;
+  using node_traits = std::allocator_traits<Allocator>::template rebind_traits<node>;
+
+  using ctrl_ptr = ctrl_traits::pointer;
+  using node_ptr = node_traits::pointer;
+
+  using void_ptr = node_traits::void_pointer;
+
+  using size_type = node_traits::size_type;
+  using diff_type = node_traits::difference_type;
+
+  struct release_t {
+    explicit constexpr release_t(key_t) noexcept {}
+  };
+
+  class checkpoint_t {
+   public:
+    constexpr checkpoint_t() noexcept = default;
+    constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default;
+
+   private:
+    friend geometric_stack;
+    explicit constexpr checkpoint_t(ctrl_ptr ptr) noexcept
+        : m_ctrl(ptr) {}
+    ctrl_ptr m_ctrl = nullptr;
+  };
+
+ public:
+  constexpr geometric_stack() noexcept(noexcept(Allocator()))
+      : geometric_stack(Allocator()) {}
+  explicit constexpr geometric_stack(Allocator const &alloc) noexcept
+      : m_ctrl_alloc(alloc) {}
+
+  constexpr geometric_stack(geometric_stack const &other) = delete;
+  constexpr geometric_stack(geometric_stack &&other) = delete;
+
+  constexpr auto operator=(geometric_stack const &other) -> geometric_stack & = delete;
+  constexpr auto operator=(geometric_stack &&other) -> geometric_stack & = delete;
+
+  constexpr ~geometric_stack() noexcept {
+    LF_ASSUME(empty());
+    delete_ctrl(m_ctrl);
+  }
+
+  /**
+   * @brief Report whether every push has been matched by a pop.
+   */
+  [[nodiscard]]
+  constexpr auto empty() const noexcept -> bool {
+
+    // Without a control block there is nothing allocated.
+    if (m_ctrl == nullptr) {
+      return true;
+    }
+
+    LF_ASSUME(m_ctrl->top != nullptr);
+
+    // A chained previous stacklet means the stack is non-empty, otherwise
+    // it is empty iff nothing has been pushed onto the top stacklet.
+    bool const is_only_stacklet = m_ctrl->top->prev == nullptr;
+    return is_only_stacklet && m_sp == m_lo;
+  }
+
+  /**
+   * @brief Get a checkpoint of the stack that can be used to acquire it from another stack allocator.
+   */
+  [[nodiscard]]
+  constexpr auto checkpoint() noexcept -> checkpoint_t {
+    return checkpoint_t{m_ctrl};
+  }
+
+  /**
+   * @brief Allocate `size` bytes (rounded up to a multiple of `sizeof(node)`) and return the base pointer.
+   */
+  [[nodiscard]]
+  constexpr auto push(std::size_t size) -> void_ptr {
+    // Zero-sized pushes are an error.
+    LF_ASSUME(size > 0);
+
+    // Very careful math to avoid superfluous instructions on this (very) hot path.
+    diff_type push_bytes = safe_cast<diff_type>(round_to_multiple<sizeof(node)>(size));
+
+    constexpr diff_type node_size = sizeof(node);
+
+    LF_ASSUME(push_bytes >= node_size);
+    LF_ASSUME(push_bytes % node_size == 0);
+
+    // Optimized to just the subtraction because multiplication cancels the implicit division.
+    diff_type free_bytes = node_size * (m_hi - m_sp);
+
+    if (push_bytes > free_bytes) [[unlikely]] {
+      return push_cached(push_bytes); // Slow path: reuse the cached stacklet or allocate a new one.
+    }
+
+    LF_ASSUME(m_ctrl != nullptr);
+    LF_ASSUME(m_ctrl->top != nullptr);
+
+    // Compiler should optimize this division away when it fuses it with the
+    // implicit multiplication in the pointer arithmetic below.
+    diff_type num_nodes = push_bytes / node_size;
+
+    // Bump the stack pointer and hand out its old value: node_ptr -> void_ptr
+    return static_cast<void_ptr>(std::exchange(m_sp, m_sp + num_nodes));
+  }
+
+  /**
+   * @brief Deallocate the allocation of `n` bytes at `ptr`, which must be the
+   * most recent allocation returned by push that has not yet been popped.
+   */
+  constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t n) noexcept {
+
+    LF_ASSUME(m_ctrl != nullptr);
+    LF_ASSUME(m_ctrl->top != nullptr);
+    LF_ASSUME(!empty());
+    LF_ASSUME(m_sp != nullptr);
+    LF_ASSUME(ptr != nullptr);
+
+    // Inverse of push: void_ptr -> node_ptr
+    auto sp = static_cast<node_ptr>(ptr);
+
+    if (m_sp == m_lo) [[unlikely]] { // Nothing is allocated on the top stacklet, step back one stacklet.
+      return pop_shuffle(sp);
+    }
+
+    m_sp = sp;
+  }
+
+  [[nodiscard]]
+  constexpr auto prepare_release() noexcept -> release_t {
+
+    // Save the stack pointer for acquire(); nothing to save if there is no control block.
+    if (m_ctrl != nullptr) {
+      m_ctrl->sp_cache = m_sp;
+    }
+
+    return release_t{key()};
+  }
+
+  constexpr void release([[maybe_unused]] release_t) noexcept {
+
+    // Don't delete, will be resumed from a checkpoint.
+    m_ctrl = nullptr;
+
+    m_lo = nullptr;
+    m_sp = nullptr;
+    m_hi = nullptr;
+  }
+
+  constexpr void acquire(checkpoint_t ckpt) noexcept {
+
+    LF_ASSUME(empty());
+    LF_ASSUME(ckpt.m_ctrl != m_ctrl);
+
+    if (ckpt.m_ctrl == nullptr) {
+      return; // Null checkpoint, keep our own (empty) stack.
+    }
+
+    delete_ctrl(m_ctrl); // Our stack is empty, free it before adopting the checkpoint's.
+
+    m_ctrl = ckpt.m_ctrl;
+
+    if constexpr (!node_traits::is_always_equal::value) {
+      // Adopt the allocator stored in the acquired control block so later deallocations match.
+      m_ctrl_alloc = typename ctrl_traits::allocator_type{std::as_const(m_ctrl->node_alloc)};
+    }
+
+    LF_ASSUME(m_ctrl->top != nullptr);
+
+    load_local<from::cache>(); // Restores m_lo/m_hi and the stack pointer saved in sp_cache.
+  }
+
+ private:
+  // ============== Types ==============  //
+
+  enum class from : char {
+    top,
+    cache,
+    none,
+  };
+
+  struct alignas(k_new_align) node {
+    node_ptr prev;  // Linked list (past)
+    diff_type size; // Usable-size of the stacklet
+  };
+
+  struct ctrl {
+    [[no_unique_address]]
+    typename node_traits::allocator_type node_alloc;
+
+    node_ptr top = nullptr;      // Most recent stacklet i.e. the top of the stack.
+    node_ptr cache = nullptr;    // Cached (empty) stacklet for hot-split guarding.
+    node_ptr sp_cache = nullptr; // Cached stack pointer for this stacklet.
+  };
+
+  // ============== Members ==============  //
+
+  [[no_unique_address]]
+  typename ctrl_traits::allocator_type m_ctrl_alloc;
+
+  ctrl_ptr m_ctrl = nullptr; // The control block for the stack.
+
+  node_ptr m_lo = nullptr; // The base pointer for the current stacklet.
+  node_ptr m_sp = nullptr; // The stack pointer for the current stacklet.
+  node_ptr m_hi = nullptr; // The one-past-the-end pointer for the current stacklet.
+
+  // ============== Methods ==============  //
+
+  /**
+   * @brief Make local pointers point to the current stacklet in the control block.
+   * `StackPtr` selects the new `m_sp`: the saved `sp_cache`, the stacklet base, or leave it untouched.
+   * Assumes that the control block and top stacklet are non-nullptr.
+   */
+  template <from StackPtr>
+  constexpr auto load_local() noexcept -> void {
+
+    LF_ASSUME(m_ctrl != nullptr);
+    LF_ASSUME(m_ctrl->top != nullptr);
+
+    constexpr diff_type one{1};
+
+    m_lo = m_ctrl->top + one; // Usable space starts after the header node.
+    m_hi = m_lo + m_ctrl->top->size;
+
+    if constexpr (StackPtr == from::cache) {
+      m_sp = m_ctrl->sp_cache;
+    } else if constexpr (StackPtr == from::top) {
+      m_sp = m_lo;
+    } else {
+      static_assert(StackPtr == from::none); // Caller sets m_sp itself.
+    }
+  }
+
+  /**
+   * @brief Allocate and construct a new control block with a single stacklet of `num_nodes` usable nodes.
+   */
+  [[nodiscard]]
+  constexpr auto new_ctrl(this geometric_stack &self, diff_type num_nodes) -> ctrl_ptr {
+
+    ctrl_ptr new_ctrl = ctrl_traits::allocate(self.m_ctrl_alloc, 1);
+
+    LF_TRY {
+      // Propagate the ctrl allocator to the control block's node allocator.
+      ctrl_traits::construct(self.m_ctrl_alloc, std::to_address(new_ctrl), std::as_const(self.m_ctrl_alloc));
+      LF_TRY {
+        new_ctrl->top = new_node(new_ctrl, num_nodes);
+      } LF_CATCH_ALL {
+        // Stacklet allocation failed: undo the construction, the outer handler frees the memory.
+        ctrl_traits::destroy(self.m_ctrl_alloc, std::to_address(new_ctrl));
+        LF_RETHROW;
+      }
+    } LF_CATCH_ALL {
+      // Construction (or the nested handler) threw: free the raw allocation.
+      ctrl_traits::deallocate(self.m_ctrl_alloc, new_ctrl, 1);
+      LF_RETHROW;
+    }
+
+    return new_ctrl;
+  }
+
+  /**
+   * @brief Clean up and delete the control block, its top stacklet and its cached stacklet (no-op if null).
+   */
+  constexpr void delete_ctrl(this geometric_stack &self, ctrl_ptr ctrl) noexcept {
+    if (ctrl != nullptr) {
+      LF_ASSUME(ctrl->top != nullptr);
+      LF_ASSUME(ctrl->top->prev == nullptr);
+
+      // Clean up the stacklets (the cache may be null).
+      delete_node(ctrl, ctrl->top);
+      delete_node(ctrl, ctrl->cache);
+
+      // Finally delete the control block.
+      ctrl_traits::destroy(self.m_ctrl_alloc, std::to_address(ctrl));
+      ctrl_traits::deallocate(self.m_ctrl_alloc, ctrl, 1);
+    }
+  }
+
+  /**
+   * @brief Allocate a stacklet: one header node followed by `num_nodes` usable nodes.
+   *
+   * This function is strongly exception-safe.
+   */
+  [[nodiscard]]
+  static constexpr auto new_node(ctrl_ptr ctrl, diff_type num_nodes) -> node_ptr {
+
+    // A stacklet must have at least one usable node.
+    LF_ASSUME(num_nodes > 0);
+    LF_ASSUME(ctrl != nullptr);
+
+    // Allocation/deallocation requires size_type, +1 for the header node
+    size_type allocate_nodes = 1 + safe_cast<size_type>(num_nodes);
+
+    node_ptr next_node = node_traits::allocate(ctrl->node_alloc, allocate_nodes);
+
+    LF_TRY {
+      // Construct the header only: prev = nullptr, size = num_nodes.
+      node_traits::construct(ctrl->node_alloc, std::to_address(next_node), nullptr, num_nodes);
+    } LF_CATCH_ALL {
+      node_traits::deallocate(ctrl->node_alloc, next_node, allocate_nodes);
+      LF_RETHROW;
+    }
+
+    return next_node;
+  }
+
+  /**
+   * @brief Delete a (possibly null) node and its associated stacklet.
+   */
+  static constexpr auto delete_node(ctrl_ptr ctrl, node_ptr ptr) noexcept -> void {
+    if (ptr != nullptr) {
+      LF_ASSUME(ctrl != nullptr);
+      // Size doesn't include the header node so we +1 here (must be read before destroy).
+      size_type allocated_nodes = 1 + safe_cast<size_type>(ptr->size);
+      node_traits::destroy(ctrl->node_alloc, std::to_address(ptr));
+      node_traits::deallocate(ctrl->node_alloc, ptr, allocated_nodes);
+    }
+  }
+
+  [[nodiscard]]
+  constexpr auto push_cached(diff_type push_bytes) -> void_ptr {
+
+    // Have to be very careful in this function to be strongly exception-safe!
+
+    constexpr diff_type node_size = sizeof(node);
+
+    LF_ASSUME(push_bytes >= node_size);
+    LF_ASSUME(push_bytes % node_size == 0);
+
+    diff_type num_nodes = safe_cast<diff_type>(push_bytes / node_size);
+
+    LF_ASSUME(num_nodes > 0);
+
+    if (m_ctrl == nullptr) {
+      // Initial stacklet wants to be quite large: a page including the header node.
+      constexpr diff_type min_nodes = (k_page_size / sizeof(node)) - 1;
+
+      m_ctrl = new_ctrl(std::max(min_nodes, num_nodes));
+
+      // Local copies of the new top
+      load_local<from::top>();
+      // Do the allocation.
+      return static_cast<void_ptr>(std::exchange(m_sp, m_sp + num_nodes));
+    }
+
+    LF_ASSUME(m_ctrl->top != nullptr);
+
+    if (m_ctrl->cache != nullptr && m_ctrl->cache->size >= num_nodes) {
+
+      // We have space in the cache. No allocations on this path, nothing can throw.
+
+      if (m_sp == m_lo) {
+        // There is nothing allocated on the current stacklet/top but it doesn't
+        // have enough space hence, we need to delete top such that we don't end up
+        // with an empty stacklet in the chain. This would break deletion otherwise.
+        node_ptr empty_top = m_ctrl->top;
+        m_ctrl->top = m_ctrl->top->prev; // top could be null now
+        delete_node(m_ctrl, empty_top);
+      }
+
+      // Shuffle cache to the top.
+      m_ctrl->cache->prev = m_ctrl->top;
+      m_ctrl->top = m_ctrl->cache;
+      m_ctrl->cache = nullptr;
+
+      // Local copies of the new top
+      load_local<from::top>();
+      // Do the allocation.
+      return static_cast<void_ptr>(std::exchange(m_sp, m_sp + num_nodes));
+    }
+
+    // We need to allocate a new stacklet to fit this allocation, we choose to
+    // grow geometrically to try to avoid too many allocations. Fine if this
+    // throws, no state has been modified yet.
+    node_ptr new_top = new_node(m_ctrl, std::max(num_nodes, 2 * m_ctrl->top->size));
+
+    // Nothing can throw after this point
+
+    // We didn't use the cache because it wasn't big enough, we should delete it
+    // now because we had to grow the stack. We couldn't do this until now because
+    // new_node may have thrown.
+    delete_node(m_ctrl, std::exchange(m_ctrl->cache, nullptr));
+
+    if (m_sp == m_lo) {
+      // There is nothing allocated on the current stacklet/top but it doesn't
+      // have enough space hence, we need to delete top such that we don't end up
+      // with an empty stacklet in the chain. This would break deletion otherwise.
+      node_ptr empty_top = m_ctrl->top;
+      m_ctrl->top = m_ctrl->top->prev; // top could be null now
+      delete_node(m_ctrl, empty_top);
+    }
+
+    // Commit the new node as the top of the chain.
+    new_top->prev = m_ctrl->top;
+    m_ctrl->top = new_top;
+
+    // Local copies of the new top
+    load_local<from::top>();
+    // Do the allocation.
+    return static_cast<void_ptr>(std::exchange(m_sp, m_sp + num_nodes));
+  }
+
+  constexpr void pop_shuffle(node_ptr sp) noexcept {
+
+    // Slow path of pop(): retire the top stacklet to the cache and step back to the previous one.
+    LF_ASSUME(!empty());
+    LF_ASSUME(m_ctrl != nullptr);
+    LF_ASSUME(m_ctrl->top != nullptr);       // Violated by a pop from an empty stack
+    LF_ASSUME(m_ctrl->top->prev != nullptr); // As above, there must be a stacklet to return to
+
+    // Move top into the cache slot, freeing whatever was cached before.
+    node_ptr old_cache = m_ctrl->cache;
+    m_ctrl->cache = m_ctrl->top;
+    delete_node(m_ctrl, old_cache);
+
+    // Go back one stacklet
+    m_ctrl->top = m_ctrl->top->prev;
+
+    // Refresh m_lo/m_hi for the new top, then set the stack pointer to the popped allocation.
+    load_local<from::none>();
+    m_sp = sp;
+  }
+};
+
+} // namespace lf
diff --git a/src/batteries/slab_stack.cxx b/src/batteries/slab_stack.cxx
new file mode 100644
index 000000000..526e1f581
--- /dev/null
+++ b/src/batteries/slab_stack.cxx
@@ -0,0 +1,239 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/compiler.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.batteries:slab_stack;
+
+import std;
+
+import libfork.utils;
+
+namespace lf {
+
+/**
+ * @brief A slab_stack is a user-space stack backed by a single fixed-size slab of memory.
+ *
+ * For this to conform to `worker_stack` the allocator's void pointer type must be `void *`.
+ */
+export template <allocator_of<std::byte> Allocator = std::allocator<std::byte>>
+class slab_stack {
+
+  struct node;
+
+  using node_traits = std::allocator_traits<Allocator>::template rebind_traits<node>;
+  using node_alloc_t = node_traits::allocator_type;
+
+  using node_ptr = node_traits::pointer;
+  using void_ptr = node_traits::void_pointer;
+
+  using size_type = node_traits::size_type;
+  using diff_type = node_traits::difference_type;
+
+  struct alignas(k_new_align) node {
+    [[no_unique_address]]
+    node_alloc_t node_alloc; // Propagated to new owners on acquire.
+    node_ptr sp_cache;       // Stack pointer saved across release/acquire.
+    diff_type size;          // Usable node count following this header.
+  };
+
+  static constexpr diff_type k_default_nodes = safe_cast<diff_type>(4 * k_page_size / sizeof(node)) - 1;
+
+  static_assert(k_default_nodes > 0);
+
+  struct release_t {
+    explicit constexpr release_t(key_t) noexcept {}
+  };
+
+  class checkpoint_t {
+   public:
+    constexpr checkpoint_t() noexcept = default;
+    constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default;
+
+   private:
+    friend slab_stack;
+    explicit constexpr checkpoint_t(node_ptr ptr) noexcept
+        : m_ctrl(ptr) {}
+    node_ptr m_ctrl = nullptr;
+  };
+
+ public:
+  constexpr slab_stack()
+      : slab_stack(k_default_nodes, Allocator()) {}
+
+  // TODO: what is appropriate unit for initialisation
+  // TODO: remove default constructor?
+  // TODO: what type to initialize with signed vs unsigned?
+
+  explicit constexpr slab_stack(diff_type num_nodes, Allocator const &alloc = Allocator())
+      : m_alloc(alloc),
+        m_init_size(num_nodes) {
+    init_slab(num_nodes);
+  }
+
+  constexpr slab_stack(slab_stack const &) = delete;
+  constexpr slab_stack(slab_stack &&) = delete;
+
+  constexpr auto operator=(slab_stack const &) -> slab_stack & = delete;
+  constexpr auto operator=(slab_stack &&) -> slab_stack & = delete;
+
+  constexpr ~slab_stack() noexcept {
+    LF_ASSUME(empty());
+    delete_ctrl(m_ctrl);
+  }
+
+  /**
+   * @brief Test if the stack is empty (all pushes have been popped).
+   */
+  [[nodiscard]]
+  constexpr auto empty() const noexcept -> bool {
+    return m_sp == m_lo;
+  }
+
+  /**
+   * @brief Get a checkpoint of the stack for transfer to another stack instance.
+   */
+  [[nodiscard]]
+  constexpr auto checkpoint() noexcept -> checkpoint_t {
+    return checkpoint_t{m_ctrl};
+  }
+
+  /**
+   * @brief Allocate `size` bytes from the slab and return the base pointer; `std::bad_alloc` if it is full.
+   */
+  [[nodiscard]]
+  constexpr auto push(std::size_t size) -> void_ptr {
+    LF_ASSUME(size > 0); // Zero-sized pushes are an error.
+
+    constexpr diff_type node_size = sizeof(node);
+
+    diff_type push_bytes = safe_cast<diff_type>(round_to_multiple<sizeof(node)>(size));
+
+    LF_ASSUME(push_bytes >= node_size);
+    LF_ASSUME(push_bytes % node_size == 0);
+
+    // Optimized to just the subtraction because multiplication cancels the implicit division.
+    diff_type free_bytes = node_size * (m_hi - m_sp);
+
+    if (push_bytes > free_bytes) [[unlikely]] {
+      LF_THROW(std::bad_alloc{}); // No growth here: the request does not fit in the remaining space.
+    }
+
+    diff_type num_nodes = push_bytes / node_size;
+
+    // Bump the stack pointer and hand out its old value: node_ptr -> void_ptr
+    return static_cast<void_ptr>(std::exchange(m_sp, m_sp + num_nodes));
+  }
+
+  /**
+   * @brief Deallocate the most recent allocation of n bytes at ptr.
+   */
+  constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t n) noexcept {
+    LF_ASSUME(!empty());
+    LF_ASSUME(m_sp != nullptr);
+    LF_ASSUME(ptr != nullptr);
+
+    // Inverse of push: void_ptr -> node_ptr
+    m_sp = static_cast<node_ptr>(ptr);
+  }
+
+  /**
+   * @brief Make ready for a call to release().
+   */
+  [[nodiscard]]
+  constexpr auto prepare_release() noexcept -> release_t {
+    // Guard against null ctrl (failed prior allocation in release()).
+    if (m_ctrl != nullptr) {
+      m_ctrl->sp_cache = m_sp;
+    }
+    return release_t{key()};
+  }
+
+  constexpr void release([[maybe_unused]] release_t) noexcept {
+
+    // Hand off the current slab to whoever holds the checkpoint; clear local state.
+    m_ctrl = nullptr;
+
+    m_lo = nullptr;
+    m_sp = nullptr;
+    m_hi = nullptr;
+
+    // Pre-allocate a fresh slab for our next use.
+
+    LF_TRY {
+      init_slab(m_init_size);
+    } LF_CATCH_ALL {
+      // If ^ throws, swallow the exception — push will see no space
+      // i.e. (m_hi - m_sp == 0) and throw instead.
+    }
+  }
+
+  constexpr void acquire(checkpoint_t ckpt) noexcept {
+
+    LF_ASSUME(empty());
+    LF_ASSUME(ckpt.m_ctrl != m_ctrl);
+
+    if (ckpt.m_ctrl == nullptr) {
+      return;
+    }
+
+    // Discard the empty slab (may be null on alloc failure).
+    delete_ctrl(m_ctrl);
+
+    m_ctrl = ckpt.m_ctrl;
+
+    if constexpr (!node_traits::is_always_equal::value) {
+      m_alloc = node_alloc_t{std::as_const(m_ctrl->node_alloc)};
+    }
+
+    load_local();
+  }
+
+ private:
+  [[no_unique_address]]
+  node_alloc_t m_alloc;
+
+  diff_type m_init_size;
+
+  node_ptr m_ctrl = nullptr; // Header node (fused ctrl+first-node of the slab).
+  node_ptr m_lo = nullptr;   // Base of usable space (m_ctrl + 1).
+  node_ptr m_sp = nullptr;   // Stack pointer for the current slab.
+  node_ptr m_hi = nullptr;   // One-past-the-end of usable space.
+
+  // Restore local pointers from the header node, taking sp from the cache.
+  constexpr void load_local() noexcept {
+    LF_ASSUME(m_ctrl != nullptr);
+    m_lo = m_ctrl + 1;
+    m_hi = m_lo + m_ctrl->size;
+    m_sp = m_ctrl->sp_cache;
+  }
+
+  // Allocate and construct a fresh slab with num_nodes usable nodes.
+  constexpr void init_slab(diff_type num_nodes) {
+    LF_ASSUME(num_nodes > 0);
+
+    size_type total = safe_cast<size_type>(1 + num_nodes);
+    m_ctrl = node_traits::allocate(m_alloc, total);
+
+    LF_TRY {
+      node_traits::construct(m_alloc, std::to_address(m_ctrl), m_alloc, nullptr, num_nodes);
+    } LF_CATCH_ALL {
+      node_traits::deallocate(m_alloc, m_ctrl, total);
+      m_ctrl = nullptr;
+      LF_RETHROW;
+    }
+
+    m_lo = m_sp = m_ctrl + 1;
+    m_hi = m_lo + num_nodes;
+  }
+
+  // Destroy and deallocate a slab (no-op if null).
+  constexpr void delete_ctrl(node_ptr ctrl) noexcept {
+    if (ctrl != nullptr) {
+      size_type total = safe_cast<size_type>(1 + ctrl->size);
+      node_traits::destroy(m_alloc, std::to_address(ctrl));
+      node_traits::deallocate(m_alloc, ctrl, total);
+    }
+  }
+};
+
+} // namespace lf
diff --git a/src/core/awaitables.cxx b/src/core/awaitables.cxx
new file mode 100644
index 000000000..6c4333bab
--- /dev/null
+++ b/src/core/awaitables.cxx
@@ -0,0 +1,336 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:awaitables;
+
+import std;
+
+import libfork.utils;
+
+import :concepts_context;
+import :frame;
+import :handles;
+import :task;
+import :thread_locals;
+import :final_suspend;
+import :concepts_awaitable;
+import :execute;
+
+namespace lf {
+
+/**
+ * @brief Call inside a catch block, stash current exception in `frame`.
+ */
+template <typename Checkpoint>
+constexpr void stash_current_exception(frame_type<Checkpoint> *frame) noexcept {
+  // No synchronization is done via exception_bit, hence we can use relaxed atomics
+  // and rely on the usual fork/join synchronization to ensure memory ordering.
+  if (frame->atomic_except().exchange(1, std::memory_order_relaxed) == 0) {
+
+    frame->except.construct(std::current_exception());
+
+    // Should have been called from inside a catch block
+    LF_ASSUME(*frame->except != nullptr);
+  }
+}
+
+/**
+ * @brief Resume a task on this worker's WSQ that has been "effectively stolen".
+ *
+ * If explicit scheduling has occurred then a worker's WSQ may hold tasks that
+ * have been "effectively stolen" from another worker. These can be handled in
+ * any order, but we must treat (mark) them as stolen.
+ *
+ * All of these tasks will eventually reach a join point.
+ *
+ * While running the ancestor several things can happen:
+ *   We hit a join in the ancestor:
+ *       Case (win join): then we take the stack:
+ *         OK to treat tasks on our WSQ as non-stolen.
+ *       Case (lose join):
+ *         Continue to resume other effectively stolen tasks on our WSQ.
+ *   We lose a join in some descendant of the ancestor:
+ *     OK => all tasks on our WSQ must have been stolen by other threads and
+ *     hence, handled as stolen appropriately.
+ *
+ * TODO: benchmark order i.e. could self-steal?
+ */
+template <worker_context Context>
+[[nodiscard]]
+constexpr auto resume_effectively_stolen(Context &context) -> coro<> {
+  steal_handle<Context> last_pushed = context.pop();
+
+  if (!last_pushed) {
+    return std::noop_coroutine(); // Nothing left on our WSQ.
+  }
+  return consume(last_pushed);
+}
+
+// =============== Fork/Call =============== //
+
+/**
+ * @brief In a separate function to allow it to be placed in cold block.
+ */
+template <typename T, typename Context>
+constexpr void
+destroy_child_stash_exception(frame_t<Context> *child, coro<promise_type<T, Context>> parent) noexcept {
+  // Clean-up the child that will never be resumed.
+  child->handle().destroy();
+  // Stash in the parent's frame which will then be resumed.
+  stash_current_exception(&parent.promise().frame);
+}
+
+/**
+ * @brief Awaitable for forking/calling an async function; always suspends and transfers to the child.
+ */
+template <category Cat, worker_context Context>
+struct async_awaitable : std::suspend_always {
+
+  static_assert(Cat == category::call || Cat == category::fork, "Invalid category for awaitable");
+
+  frame_t<Context> *child;
+
+  template <typename T>
+  constexpr auto
+  await_suspend(this async_awaitable self, coro<promise_type<T, Context>> parent) noexcept -> coro<> {
+
+    // TODO: test if having a dedicated is_stopped awaitable is quicker
+
+    if (!self.child) [[unlikely]] {
+      // Noop if an exception was thrown (there is no child), resume the parent immediately.
+      return parent;
+    }
+
+    if (self.child->stop_requested()) [[unlikely]] {
+      // Noop if stopped, must clean-up the child that will never be resumed.
+      return self.child->handle().destroy(), parent;
+    }
+
+    // Propagate parent->child relationships
+    self.child->parent = &parent.promise().frame;
+
+    if constexpr (Cat == category::call) {
+      // Should be the default
+      LF_ASSUME(self.child->kind == category::call);
+    } else {
+      self.child->kind = Cat;
+    }
+
+    if constexpr (Cat == category::fork) {
+      // It is critical to pass self by-value here, after the call to push()
+      // the object `*this` may be destroyed, if passing by ref it would be
+      // use-after-free to then access self in the following line to fetch the
+      // handle.
+      LF_TRY {
+        get_tls_context<Context>().push(steal_handle<Context>{key(), &parent.promise().frame});
+      } LF_CATCH_ALL {
+        return destroy_child_stash_exception(self.child, parent), parent;
+      }
+    }
+
+    return self.child->handle();
+  }
+};
+
+// =============== Join =============== //
+
+template <worker_context Context>
+struct join_awaitable {
+
+  frame_t<Context> *frame;
+
+  constexpr auto await_ready(this join_awaitable self) noexcept -> bool {
+    if (not_null(self.frame)->steals == 0) [[likely]] {
+      if (self.frame->stop_requested()) [[unlikely]] {
+        // Must unconditionally suspend if stopped
+        return false;
+      }
+      // If no steals then we are the only owner of the parent and we are
+      // ready to join. Therefore, no need to reset the control block.
+      return true;
+    }
+    return false;
+  }
+
+  constexpr auto await_suspend(this join_awaitable self, coro<> task) noexcept -> coro<> {
+    // Currently   self.joins  = k_u16_max  - num_joined
+    //
+    // We set           joins  = self->joins - (k_u16_max - num_steals)
+    //                         = num_steals - num_joined
+    //
+    // Hence (old value)   joined = k_u16_max - num_joined
+    //         k_u16_max - joined = num_joined
+
+    // Lemma:
+    //
+    //    If a thread is at a join and steals have occurred then the
+    //    thread can never own the stack of the current frame.
+    //
+    // This is because threads follow the work-first principle, so for the
+    // owner to be running this task it would have to have re-stolen it from a
+    // thief. Which implies it would have run the final suspend of the child
+    // that had its continuation stolen, where it would have had to release
+    // the stack, because the parent was not at the join.
+
+    LF_ASSUME(self.frame);
+
+    std::uint32_t steals = self.frame->steals;
+    std::uint32_t offset = k_u16_max - steals;
+    std::uint32_t joined = self.frame->atomic_joins().fetch_sub(offset, std::memory_order_release);
+
+    // If this was a stop:
+    //
+    // steals = 0, joins = k_u16_max then:
+    //
+    // steals = 0
+    // offset = k_u16_max
+    // joined = k_u16_max, (self.frame->joins is now 0)
+    //
+    // k_u16_max - joined = 0 = steals, hence win the if
+
+    if (steals == k_u16_max - joined) {
+      // We set joins after all children had completed therefore we can resume the task.
+      // Need to acquire to ensure we see all writes by other threads to the result.
+      std::atomic_thread_fence(std::memory_order_acquire);
+
+      if (self.frame->stop_requested()) [[unlikely]] {
+        return self.handle_stop();
+      }
+
+      // We must reset the control block and take the stack. We should never
+      // own the stack at this point because we must have stolen the stack.
+      self.take_stack();
+      self.frame->reset_counters();
+      return task;
+    }
+    // Someone else is responsible for running this task.
+
+    // We cannot touch *this or dereference self as someone may have resumed already!
+    // We cannot currently own this stack (checking would violate above).
+
+    // If no explicit scheduling then we must have an empty WSQ as we stole this task.
+
+    // If explicit scheduling then we may have tasks on our WSQ if we performed a self-steal
+    // in a switch awaitable. In this case we can/must do another self-steal.
+
+    // A throw here triggers std::terminate, which is intended.
+    return resume_effectively_stolen(get_tls_context<Context>());
+  }
+
+  constexpr void await_resume(this join_awaitable self) {
+    // We should have been reset
+    LF_ASSUME(self.frame->steals == 0);
+    LF_ASSUME(self.frame->joins == k_u16_max);
+
+    // Outside parallel regions so can touch non-atomically.
+    //
+    // A task that completes by responding to cancellation will drop any
+    // exceptions however, a task may still throw exceptions even if cancelled.
+    // Here we must rethrow even if cancelled because we can't re-suspend at
+    // this point.
+    if constexpr (LF_COMPILER_EXCEPTIONS) {
+      if (self.frame->exception_bit) [[unlikely]] {
+        self.rethrow_exception();
+      }
+    }
+
+    LF_ASSUME(self.frame->exception_bit == 0);
+  }
+
+  constexpr auto take_stack(this join_awaitable self) noexcept -> void {
+    stack_t<Context> &stack = get_tls_stack<Context>();
+    LF_ASSUME(self.frame->stack_ckpt != stack.checkpoint());
+    stack.acquire(std::as_const(self.frame->stack_ckpt));
+  }
+
+  [[nodiscard]]
+  constexpr auto handle_stop(this join_awaitable self) noexcept -> coro<> {
+    // Only need to take the stack if there were steals
+    if (self.frame->steals > 0) {
+      self.take_stack();
+    }
+
+    // We always need to reset the counters as await_suspend modified joins.
+    self.frame->reset_counters();
+
+    // Drop any exceptions in the now-stopped task
+    if constexpr (LF_COMPILER_EXCEPTIONS) {
+      if (self.frame->exception_bit) [[unlikely]] {
+        std::ignore = extract_exception(self.frame);
+      }
+    }
+
+    return final_suspend_leading<Context>(self.frame);
+  }
+
+  [[noreturn]]
+  constexpr void rethrow_exception(this join_awaitable self) {
+    std::rethrow_exception(extract_exception(self.frame));
+  }
+};
+
+// =============== Context Switch =============== //
+
+template <worker_context Context, awaitable<Context> T>
+struct switch_awaitable final {
+
+  static_assert(plain_object<T>, "Expecting remove cv-ref");
+  static_assert(custom_awaitable_methods<T, Context>, "Expecting methods");
+
+  [[no_unique_address]]
+  T value;
+
+  constexpr auto await_ready() LF_HOF(value.await_ready())
+
+  constexpr auto await_resume() LF_HOF(std::forward<T>(value).await_resume())
+
+  template <typename R>
+  constexpr auto
+  await_suspend(coro<promise_type<R, Context>> parent) noexcept(nothrow_await_suspend<T, Context>) -> coro<> {
+
+    Context &context = get_tls_context<Context>();
+    frame_t<Context> &parent_frame = parent.promise().frame;
+
+    // This thread currently owns the "resumable" handle of this coroutine
+    // however, it may not own the stack. By the same logic as the join Lemma:
+    //
+    // steals = 0, implies we own the stack => we should give it up.
+    //  otherwise, we do not own the stack  => our stack should be empty.
+
+    bool owns_stack = parent_frame.steals == 0;
+
+    // Must prepare before calling await_suspend, don't need to pre-release
+    // because await_suspend can't continue parent inline (because execute()
+    // would throw).
+    auto release_key = context.stack().prepare_release();
+
+    // Schedule this coroutine for execution, cannot touch underlying after this.
+    // If this throws, the parent is resumed and the exception is re-thrown, which is OK.
+    value.await_suspend(sched_handle<Context>{key(), &parent.promise().frame}, context);
+
+    // Nothing can/should throw from this point on.
+
+    if (owns_stack) {
+      context.stack().release(std::move(release_key));
+    }
+
+    // Eventually dest will fail to pop() the ancestor task that we 'could'
+    // pop() here and then treat it as a task that was stolen from it.
+
+    // We terminate on throw as we can't resume the parent, and we can't
+    // re-throw the exception at this point.
+    return [&] noexcept -> coro<> {
+      return resume_effectively_stolen(context);
+    }();
+  }
+};
+
+/**
+ * @brief Utility to deduce the awaitable type and perfect forward to switch_awaitable.
+ */
+template <typename Context, typename T>
+constexpr auto switch_awaitable_for(T &&x)
+    LF_HOF(switch_awaitable<Context, std::remove_cvref_t<T>>{LF_FWD(x)})
+
+} // namespace lf
diff --git a/src/core/concepts/awaitable.cxx b/src/core/concepts/awaitable.cxx
new file mode 100644
index 000000000..e95a6b876
--- /dev/null
+++ b/src/core/concepts/awaitable.cxx
@@ -0,0 +1,85 @@
+module;
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:concepts_awaitable;
+
+import std;
+
+import :handles;
+import :concepts_context;
+
+namespace lf {
+
+template <typename T>
+concept member_co_awaitable = requires (T &&t) { std::forward<T>(t).operator co_await(); };
+
+template <typename T>
+concept operator_co_awaitable = requires (T &&t) { operator co_await(std::forward<T>(t)); };
+
+template <typename T>
+[[nodiscard]]
+constexpr auto do_acquire_awaitable(T &&t) LF_HOF(LF_FWD(t))
+
+template <member_co_awaitable T>
+[[nodiscard]]
+constexpr auto do_acquire_awaitable(T &&t) LF_HOF(LF_FWD(t).operator co_await())
+
+template <operator_co_awaitable T>
+[[nodiscard]]
+constexpr auto do_acquire_awaitable(T &&t)
+    LF_HOF(operator co_await(LF_FWD(t)))
+
+/**
+ * @brief Specify that an awaitable can be unambiguously acquired from `T` by free/member operator co_await.
+ *
+ * If neither operator is present `T` is assumed to be a plain awaitable.
+ */
+template <typename T>
+concept awaitable_acquirable = requires (T &&x) { do_acquire_awaitable(std::forward<T>(x)); };
+
+/**
+ * @brief Extracts the awaitable from `T` by invoking the appropriate operator co_await, or returning `T`
+ * itself if neither operator is present.
+ */
+export template <awaitable_acquirable T>
+constexpr auto acquire_awaitable(T &&t)
+    LF_HOF(do_acquire_awaitable(LF_FWD(t)))
+
+/**
+ * @brief Specifies that a cv-ref stripped type is constructible from `T`.
+ */
+template <typename T>
+concept storable = std::constructible_from<std::remove_cvref_t<T>, T &&>;
+
+/**
+ * @brief Requires `await_ready()` (bool-like), `await_suspend(handle, ctx)` (void) and `await_resume()`.
+ */
+template <typename T, typename Context>
+concept custom_awaitable_methods = requires (T x, Context &ctx, sched_handle<Context> handle) {
+  { x.await_ready() } -> std::convertible_to<bool>;
+  { x.await_suspend(handle, ctx) } -> std::same_as<void>;
+  { x.await_resume() };
+};
+
+template <typename T, typename Context>
+concept nothrow_await_suspend = requires (T x, Context &ctx, sched_handle<Context> handle) {
+  { x.await_suspend(handle, ctx) } noexcept;
+};
+
+/**
+ * @brief Checks for methods + storable
+ */
+template <typename T, typename Context>
+concept custom_awaitable = storable<T> && custom_awaitable_methods<std::remove_cvref_t<T>, Context>;
+
+/**
+ * @brief  Specifies the requirements for a context-switching awaitable type.
+ *
+ * Note: await_suspend may not complete inline i.e. the current thread remains
+ * bound to the context.
+ */
+export template <typename T, typename Context>
+concept awaitable = worker_context<Context> && requires (T &&x) {
+  { acquire_awaitable(std::forward<T>(x)) } -> custom_awaitable<Context>;
+};
+
+} // namespace lf
diff --git a/src/core/concepts/context.cxx b/src/core/concepts/context.cxx
new file mode 100644
index 000000000..359da3ae0
--- /dev/null
+++ b/src/core/concepts/context.cxx
@@ -0,0 +1,49 @@
+export module libfork.core:concepts_context;
+
+import std;
+
+import libfork.utils;
+
+import :concepts_stack;
+import :handles;
+
+namespace lf {
+
+template <typename T>
+concept ref_to_worker_stack = std::is_lvalue_reference_v<T> && worker_stack<std::remove_reference_t<T>>;
+
+/**
+ * @brief Specifies that a type acts as a LIFO stack over U.
+ */
+export template <typename T, typename U>
+concept lifo_stack = plain_object<T> && requires (T context, U val) {
+  { context.push(val) } -> std::same_as<void>;
+  { context.pop() } noexcept -> std::same_as<U>;
+};
+
+/**
+ * @brief Defines the API for a libfork compatible worker context.
+ *
+ * This requires that `T` is an object type and supports the following operations:
+ *
+ * - Push/pop a steal handle onto the context in a LIFO manner.
+ * - Have a `worker_stack` that can be accessed via `stack()`.
+ */
+export template <typename T>
+concept worker_context = lifo_stack<T, steal_handle<T>> && requires (T context) {
+  { context.stack() } noexcept -> ref_to_worker_stack;
+};
+
+/**
+ * @brief Fetch the stack type of a worker context `T`.
+ */
+export template <worker_context T>
+using stack_t = std::remove_reference_t<decltype(std::declval<T &>().stack())>;
+
+/**
+ * @brief Fetch the checkpoint type of a worker context `T`.
+ */
+template <worker_context T>
+using checkpoint_t = decltype(std::declval<stack_t<T> &>().checkpoint());
+
+} // namespace lf
diff --git a/src/core/concepts/indirect.cxx b/src/core/concepts/indirect.cxx
new file mode 100644
index 000000000..90e6d2e48
--- /dev/null
+++ b/src/core/concepts/indirect.cxx
@@ -0,0 +1,131 @@
+export module libfork.core:concepts_indirect;
+
+import std;
+
+import :concepts_invocable;
+import :concepts_context;
+
+namespace lf {
+
+// A type can derive from this to opt-into indirect-value-t customization
+struct indirect_value_customization {};
+
+template <typename I>
+struct indirect_value {
+  using type = std::iter_value_t<I> &;
+};
+
+// strip cv-ref qualifiers
+template <typename T>
+  requires (!std::same_as<T, std::remove_cvref_t<T>>)
+struct indirect_value<T> : indirect_value<std::remove_cvref_t<T>> {};
+
+// Specialization for types that customize
+template <std::derived_from<indirect_value_customization> T>
+  requires std::same_as<T, std::remove_cvref_t<T>>
+struct indirect_value<T> {
+  using type = T::indirect_value_type;
+};
+
+template <typename I>
+using indirect_value_t = indirect_value<I>::type;
+
+// ========= Core concepts =========
+
+namespace sync {
+
+/**
+ * @brief A version of `std::indirectly_unary_invocable` that supports
+ * libfork's projection type.
+ */
+export template <typename Fn, typename I>
+concept indirectly_unary_invocable =                         //
+    std::indirectly_readable<I> &&                           //
+    std::copy_constructible<Fn> &&                           //
+    std::invocable<Fn &, indirect_value_t<I>> &&             //
+    std::invocable<Fn &, std::iter_reference_t<I>> &&        //
+    std::common_reference_with<                              //
+        std::invoke_result_t<Fn &, indirect_value_t<I>>,     //
+        std::invoke_result_t<Fn &, std::iter_reference_t<I>> //
+        >;                                                   //
+
+/**
+ * @brief A version of `std::indirectly_regular_unary_invocable` that supports
+ * libfork's projection type.
+ */
+export template <typename Fn, typename I>
+concept indirectly_regular_unary_invocable =                  //
+    std::indirectly_readable<I> &&                            //
+    std::copy_constructible<Fn> &&                            //
+    std::regular_invocable<Fn &, indirect_value_t<I>> &&      //
+    std::regular_invocable<Fn &, std::iter_reference_t<I>> && //
+    std::common_reference_with<                               //
+        std::invoke_result_t<Fn &, indirect_value_t<I>>,      //
+        std::invoke_result_t<Fn &, std::iter_reference_t<I>>  //
+        >;                                                    //
+
+} // namespace sync
+
+namespace async {
+
+/**
+ * @brief A variant of `std::indirectly_unary_invocable` that supports
+ * libfork's projection type and requires an async invocable.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirectly_unary_invocable =                            //
+    worker_context<Context> &&                                  //
+    std::indirectly_readable<I> &&                              //
+    std::copy_constructible<Fn> &&                              //
+    async_invocable<Fn &, Context, indirect_value_t<I>> &&      //
+    async_invocable<Fn &, Context, std::iter_reference_t<I>> && //
+    std::common_reference_with<                                 //
+        async_result_t<Fn &, Context, indirect_value_t<I>>,     //
+        async_result_t<Fn &, Context, std::iter_reference_t<I>> //
+        >;                                                      //
+
+/**
+ * @brief A variant of `std::indirectly_regular_unary_invocable` that supports
+ * libfork's projection type and requires an async invocable.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirectly_regular_unary_invocable =                            //
+    worker_context<Context> &&                                          //
+    std::indirectly_readable<I> &&                                      //
+    std::copy_constructible<Fn> &&                                      //
+    async_regular_invocable<Fn &, Context, indirect_value_t<I>> &&      //
+    async_regular_invocable<Fn &, Context, std::iter_reference_t<I>> && //
+    std::common_reference_with<                                         //
+        async_result_t<Fn &, Context, indirect_value_t<I>>,             //
+        async_result_t<Fn &, Context, std::iter_reference_t<I>>         //
+        >;                                                              //
+
+} // namespace async
+
+/**
+ * @brief A variant of `std::indirectly_unary_invocable` that supports either
+ * sync or async invocables.
+ *
+ * In general if a function is both sync and async invocable it is expected
+ * that the async version will be preferred.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirectly_unary_invocable =
+    async::indirectly_unary_invocable<Fn, Context, I> || sync::indirectly_unary_invocable<Fn, I>;
+
+// clang-format off
+
+/**
+ * @brief A variant of `std::indirectly_regular_unary_invocable` that supports
+ * either sync or async invocables.
+ *
+ * In general if a function is both sync and async invocable it is expected
+ * that the async version will be preferred.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirectly_regular_unary_invocable = 
+    async::indirectly_regular_unary_invocable<Fn, Context, I> || sync::indirectly_regular_unary_invocable<Fn, I>;
+
+// clang-format on
+
+} // namespace lf
diff --git a/src/core/concepts/invocable.cxx b/src/core/concepts/invocable.cxx
new file mode 100644
index 000000000..424d7b385
--- /dev/null
+++ b/src/core/concepts/invocable.cxx
@@ -0,0 +1,73 @@
+module;
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:concepts_invocable;
+
+import std;
+
+import libfork.utils;
+
+import :task;
+import :concepts_context;
+
+namespace lf {
+
+template <typename Context>
+struct ctx_invoke_t {
+  // Explicitly constrained so overload resolution prefers this env-injecting overload when it is viable.
+  template <typename... Args, typename Fn>
+    requires std::invocable<Fn, env<Context>, Args...>
+  static constexpr auto operator()(Fn &&fn, Args &&...args)
+      LF_HOF(std::invoke(std::forward<Fn>(fn), env<Context>{key()}, std::forward<Args>(args)...))
+
+  template <typename... Args, typename Fn>
+  static constexpr auto operator()(Fn &&fn, Args &&...args)
+      LF_HOF(std::invoke(std::forward<Fn>(fn), std::forward<Args>(args)...))
+};
+
+template <typename R, typename Context>
+concept task_from = specialization_of<R, task> && std::same_as<Context, typename R::context_type>;
+
+/**
+ * @brief Test if a callable `Fn` when invoked with `Args...` returns an `lf::task`.
+ */
+export template <typename Fn, typename Context, typename... Args>
+concept async_invocable =
+    worker_context<Context> && requires (ctx_invoke_t<Context> gn, Fn &&fn, Args &&...args) {
+      { std::invoke(gn, std::forward<Fn>(fn), std::forward<Args>(args)...) } -> task_from<Context>;
+    };
+
+/**
+ * @brief Same semantic requirements as `std::regular_invocable`.
+ */
+export template <typename Fn, typename Context, typename... Args>
+concept async_regular_invocable = async_invocable<Fn, Context, Args...>;
+
+/**
+ * @brief Subsumes `async_invocable` and checks that the invocation is `noexcept`.
+ */
+export template <typename Fn, typename Context, typename... Args>
+concept async_nothrow_invocable =
+    async_invocable<Fn, Context, Args...> && std::is_nothrow_invocable_v<ctx_invoke_t<Context>, Fn, Args...>;
+
+/**
+ * @brief The result type of invoking an async function `Fn` with `Args...`.
+ */
+export template <typename Fn, typename Context, typename... Args>
+  requires async_invocable<Fn, Context, Args...>
+using async_result_t = std::invoke_result_t<ctx_invoke_t<Context>, Fn, Args...>::value_type;
+
+/**
+ * @brief Subsumes `async_invocable` and checks the result type is `R`.
+ */
+export template <typename Fn, typename R, typename Context, typename... Args>
+concept async_invocable_to =
+    async_invocable<Fn, Context, Args...> && std::same_as<R, async_result_t<Fn, Context, Args...>>;
+
+/**
+ * @brief Subsumes `async_nothrow_invocable` and `async_invocable_to`.
+ */
+export template <typename Fn, typename R, typename Context, typename... Args>
+concept async_nothrow_invocable_to =
+    async_nothrow_invocable<Fn, Context, Args...> && async_invocable_to<Fn, R, Context, Args...>;
+
+} // namespace lf
diff --git a/src/core/concepts/scheduler.cxx b/src/core/concepts/scheduler.cxx
new file mode 100644
index 000000000..2818d6143
--- /dev/null
+++ b/src/core/concepts/scheduler.cxx
@@ -0,0 +1,29 @@
+export module libfork.core:concepts_scheduler;
+
+import std;
+
+import :handles;
+
+namespace lf {
+
+export template <typename T>
+concept has_context_typedef = requires { typename std::remove_cvref_t<T>::context_type; };
+
+export template <has_context_typedef T>
+using context_t = typename std::remove_cvref_t<T>::context_type;
+
+/**
+ * @brief An object capable of scheduling a libfork task for execution.
+ *
+ * These are typed to a context, the `post` method must:
+ *
+ * - Satisfy the strong exception guarantee.
+ * - Guarantee eventual execution of the task associated with `handle`.
+ */
+export template <typename Sch>
+concept scheduler =
+    has_context_typedef<Sch> && requires (Sch &&scheduler, sched_handle<context_t<Sch>> handle) {
+      { std::forward<Sch>(scheduler).post(handle) } -> std::same_as<void>;
+    };
+
+} // namespace lf
diff --git a/src/core/concepts/semigroup.cxx b/src/core/concepts/semigroup.cxx
new file mode 100644
index 000000000..6b6067aae
--- /dev/null
+++ b/src/core/concepts/semigroup.cxx
@@ -0,0 +1,154 @@
+export module libfork.core:concepts_semigroup;
+
+import std;
+
+import :concepts_invocable;
+import :concepts_indirect;
+
+namespace lf {
+
+// === Semigroup
+
+namespace sync {
+
+template <typename R, typename Fn, typename... Args>
+concept invocable_to = std::invocable<Fn, Args...> && std::same_as<std::invoke_result_t<Fn, Args...>, R>;
+
+template <typename R, typename Fn, typename T>
+concept semigroup_r =                //
+    std::constructible_from<R, T> && //
+    invocable_to<R, Fn, T, T> &&     //
+    invocable_to<R, Fn, T, R> &&     //
+    invocable_to<R, Fn, R, T> &&     //
+    invocable_to<R, Fn, R, R>;       //
+
+template <typename R, typename Fn, typename I>
+concept indirect_semigroup_r =                                              //
+    semigroup_r<R, Fn &, indirect_value_t<I>> &&                            //
+    semigroup_r<R, Fn &, std::iter_reference_t<I>> &&                       //
+    invocable_to<R, Fn &, indirect_value_t<I>, std::iter_reference_t<I>> && //
+    invocable_to<R, Fn &, std::iter_reference_t<I>, indirect_value_t<I>>;   //
+
+/**
+ * @brief A semigroup is a set `S` and an associative binary operation `·`, such that `S` is closed under `·`.
+ *
+ * Associativity means that for all `a, b, c` in `S`, `(a · b) · c = a · (b · c)`.
+ *
+ * Example: `(Z, +)` is a semigroup, since we can add any two integers and the result is also an integer.
+ *
+ * Example: `(Z, /)` is not a semigroup, since `2/3` is not an integer.
+ *
+ * Example: `(Z, -)` is not a semigroup, since `(1 - 1) - 1 != 1 - (1 - 1)`.
+ *
+ * Let `t`, `u` and `f` be objects of types `T`, `U` and `Fn` respectively.
+ * Then the following expression must be valid:
+ *
+ * ```
+ * f(u, t)
+ * ```
+ *
+ * And return the same type `R` for all combinations of `T` and `U` being `R`,
+ * `indirect_value_t<I>` and `std::iter_reference_t<I>`.
+ */
+export template <typename Fn, typename I>
+concept indirect_semigroup =                                                  //
+    std::indirectly_readable<I> &&                                            //
+    std::copy_constructible<Fn> &&                                            //
+    std::regular_invocable<Fn &, indirect_value_t<I>, indirect_value_t<I>> && //
+    indirect_semigroup_r<                                                     //
+        std::invoke_result_t<Fn &, indirect_value_t<I>, indirect_value_t<I>>, //
+        Fn,                                                                   //
+        I                                                                     //
+        >;                                                                    //
+
+} // namespace sync
+
+namespace async {
+
+template <typename R, typename Fn, typename Context, typename T>
+concept semigroup_r =                           //
+    std::constructible_from<R, T> &&            //
+    async_invocable_to<Fn, R, Context, T, T> && //
+    async_invocable_to<Fn, R, Context, T, R> && //
+    async_invocable_to<Fn, R, Context, R, T> && //
+    async_invocable_to<Fn, R, Context, R, R>;   //
+
+template <typename R, typename Fn, typename Context, typename I>
+concept indirect_semigroup_r =                                                             //
+    semigroup_r<R, Fn &, Context, indirect_value_t<I>> &&                                  //
+    semigroup_r<R, Fn &, Context, std::iter_reference_t<I>> &&                             //
+    async_invocable_to<Fn &, R, Context, indirect_value_t<I>, std::iter_reference_t<I>> && //
+    async_invocable_to<Fn &, R, Context, std::iter_reference_t<I>, indirect_value_t<I>>;   //
+
+/**
+ * @brief A semigroup is a set `S` and an associative binary operation `·`, such that `S` is closed under `·`.
+ *
+ * Associativity means that for all `a, b, c` in `S`, `(a · b) · c = a · (b · c)`.
+ *
+ * Example: `(Z, +)` is a semigroup, since we can add any two integers and the result is also an integer.
+ *
+ * Example: `(Z, /)` is not a semigroup, since `2/3` is not an integer.
+ *
+ * Example: `(Z, -)` is not a semigroup, since `(1 - 1) - 1 != 1 - (1 - 1)`.
+ *
+ * Let `t`, `u` and `f` be objects of types `T`, `U` and `Fn` respectively.
+ * Then the following expression must be valid:
+ *
+ * ```
+ * R ret;
+ * co_await scope.call(std::addressof(ret), f, u, t)
+ * ```
+ *
+ * And return the same type `R` for all combinations of `T` and `U` being `R`,
+ * `indirect_value_t<I>` and `std::iter_reference_t<I>`.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirect_semigroup =                                                     //
+    std::indirectly_readable<I> &&                                               //
+    worker_context<Context> &&                                                   //
+    std::copy_constructible<Fn> &&                                               //
+    async_invocable<Fn &, Context, indirect_value_t<I>, indirect_value_t<I>> &&  //
+    indirect_semigroup_r<                                                        //
+        async_result_t<Fn &, Context, indirect_value_t<I>, indirect_value_t<I>>, //
+        Fn,                                                                      //
+        Context,                                                                 //
+        I                                                                        //
+        >;                                                                       //
+
+} // namespace async
+
+/**
+ * @brief Either a synchronous or asynchronous semigroup.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirect_semigroup = async::indirect_semigroup<Fn, Context, I> || sync::indirect_semigroup<Fn, I>;
+
+/**
+ * @brief A semantic requirement that the semigroup operation is commutative.
+ *
+ * Commutativity requires `a · b = b · a` for all `a`, `b` in the set `S`.
+ */
+export template <typename Fn, typename Context, typename I>
+concept indirect_commutative_semigroup = indirect_semigroup<Fn, Context, I>;
+
+template <typename Fn, typename Context, typename I>
+struct indirect_semigroup_result {
+  using type = std::invoke_result_t<Fn &, indirect_value_t<I>, indirect_value_t<I>>;
+};
+
+template <typename Fn, typename Context, typename I>
+  requires async::indirect_semigroup<Fn, Context, I>
+struct indirect_semigroup_result<Fn, Context, I> {
+  using type = async_result_t<Fn &, Context, indirect_value_t<I>, indirect_value_t<I>>;
+};
+
+/**
+ * @brief Get the result type of an indirect semigroup operation.
+ *
+ * This is the type of the result of applying the semigroup operation to two elements of the set.
+ */
+export template <typename Fn, typename Context, typename I>
+  requires indirect_semigroup<Fn, Context, I>
+using indirect_semigroup_t = indirect_semigroup_result<Fn, Context, I>::type;
+
+} // namespace lf
diff --git a/src/core/concepts/stack.cxx b/src/core/concepts/stack.cxx
new file mode 100644
index 000000000..a85efab37
--- /dev/null
+++ b/src/core/concepts/stack.cxx
@@ -0,0 +1,52 @@
+export module libfork.core:concepts_stack;
+
+import std;
+
+import libfork.utils;
+
+namespace lf {
+
+template <typename T>
+  requires std::is_object_v<T>
+consteval auto constify(T &&x) noexcept -> std::add_const_t<T> &;
+
+/**
+ * @brief Defines the API for a libfork compatible stack.
+ *
+ * - After construction `this` is empty and push is valid.
+ * - Pop is valid provided the FILO order is respected.
+ * - Push produces pointers aligned to __STDCPP_DEFAULT_NEW_ALIGNMENT__.
+ * - Destruction is expected to only occur when the stack is empty.
+ * - Result of `.checkpoint()` is expected to:
+ *     - Be "cheap to copy".
+ *     - Have a null state (default constructed) that only compares equal to itself.
+ *     - Is allowed to return null if push has never been called.
+ *     - Compare equal if and only if they are both null or they allocate from the same stack.
+ *     - Have no preconditions about when it's called.
+ * - Prepare release puts the stack into a state in which another thread can acquire it.
+ * - Release detaches the current stack and leaves `this` empty.
+ *     - This may be called concurrently with acquire.
+ * - Acquire attaches to the stack that the checkpoint came from:
+ *     - It is only called when the stack is empty.
+ *     - It is only called with a checkpoint not equal to the current checkpoint.
+ *     - It is called after prepare release (and no other functions in between).
+ *
+ * Fast-path operations: empty, push, pop, checkpoint
+ * Slow-path operations: release, acquire
+ */
+export template <typename T>
+concept worker_stack = plain_object<T> && requires (T stack, std::size_t n, void *ptr) {
+  { stack.push(n) } -> std::same_as<void *>;
+  { stack.pop(ptr, n) } noexcept -> std::same_as<void>;
+  { stack.checkpoint() } noexcept -> std::regular;
+  { stack.prepare_release() } noexcept -> std::movable;
+  { stack.release(stack.prepare_release()) } noexcept -> std::same_as<void>;
+  { stack.acquire(constify(stack.checkpoint())) } noexcept -> std::same_as<void>;
+};
+
+// TODO: Allocator aware stack
+
+// export template <typename T>
+// concept aa_worker_stack = worker_stack<T> && true;
+
+} // namespace lf
diff --git a/src/core/core.cxx b/src/core/core.cxx
new file mode 100644
index 000000000..1638781c2
--- /dev/null
+++ b/src/core/core.cxx
@@ -0,0 +1,26 @@
+export module libfork.core;
+
+export import :concepts_invocable;
+export import :concepts_scheduler;
+export import :concepts_context;
+export import :concepts_stack;
+export import :concepts_awaitable;
+export import :concepts_indirect;
+export import :concepts_semigroup;
+export import :frame;
+export import :task;
+export import :thread_locals;
+export import :poly_context;
+export import :ops;
+export import :handles;
+export import :promise;
+export import :schedule;
+export import :root;
+export import :execute;
+export import :receiver;
+export import :stop;
+export import :exception;
+export import :final_suspend;
+export import :awaitables;
+export import :projected;
+export import :lift;
diff --git a/src/core/exception.cxx b/src/core/exception.cxx
new file mode 100644
index 000000000..102447051
--- /dev/null
+++ b/src/core/exception.cxx
@@ -0,0 +1,12 @@
+export module libfork.core:exception;
+
+import std;
+
+namespace lf {
+
+/**
+ * @brief Base class for all libfork exceptions.
+ */
+export struct libfork_exception : std::exception {};
+
+} // namespace lf
diff --git a/src/core/execute.cxx b/src/core/execute.cxx
new file mode 100644
index 000000000..12bfe6027
--- /dev/null
+++ b/src/core/execute.cxx
@@ -0,0 +1,120 @@
+
+module;
+#include "libfork/__impl/assume.hpp"
+export module libfork.core:execute;
+
+import std;
+
+import :frame;
+import :thread_locals;
+import :concepts_context;
+import :handles;
+import :exception;
+
+namespace lf {
+
+export struct execute_error final : libfork_exception {
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "execute called from within a worker thread!";
+  }
+};
+
+/**
+ * @brief Bind this thread to a context and execute the scheduled tasks on that context/thread.
+ *
+ * This should not be called from a thread already bound to a context, once this call returns
+ * the thread is unbound from the context.
+ *
+ * The handle must not be null.
+ */
+export template <worker_context Context>
+constexpr void execute(Context &context, sched_handle<Context> handle) {
+
+  LF_ASSUME(handle);
+
+  if (thread_local_context<Context> != nullptr) {
+    LF_THROW(execute_error{});
+  }
+
+  thread_local_context<Context> = std::addressof(context);
+
+  defer _ = [] static noexcept -> void {
+    thread_local_context<Context> = nullptr;
+  };
+
+  auto *frame = static_cast<frame_type<checkpoint_t<Context>> *>(get(key(), handle));
+
+  // We should only take the stack if it was the stack owner that we are
+  // resuming from, same logic as in switch_awaitable_suspend
+  if (frame->steals == 0) {
+
+    auto const &ckpt = frame->stack_ckpt;
+
+    if (ckpt != get_tls_stack<Context>().checkpoint()) {
+      context.stack().acquire(ckpt);
+    }
+  }
+
+  frame->handle().resume();
+}
+
+export struct steal_overflow_error final : libfork_exception {
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "a single task has been stolen 65,535 times";
+  }
+};
+
+/**
+ * @brief Consume a steal handle, marks it as stolen and returns the handle of the stolen task.
+ *
+ * The current thread must resume the handle.
+ *
+ * May throw `steal_overflow_error` if the task has been stolen enough times to
+ * overflow the steal counter.
+ */
+template <worker_context Context>
+constexpr auto consume(steal_handle<Context> handle) -> std::coroutine_handle<> {
+
+  LF_ASSUME(handle);
+
+  auto *frame = static_cast<frame_type<checkpoint_t<Context>> *>(get(key(), handle));
+
+  if (frame->steals + 1 == k_u16_max) {
+    // Can't allow equal to k_u16_max because that is the sentinel
+    LF_THROW(steal_overflow_error{});
+  }
+
+  frame->steals += 1;
+
+  return frame->handle();
+}
+
+/**
+ * @brief Bind this thread to a context, consume a steal handle and resume the stolen task on this thread.
+ *
+ * This should not be called from a thread already bound to a context (throws `execute_error`), once this
+ * call returns the thread is unbound from the context. May throw `steal_overflow_error` (see `consume`).
+ *
+ * The handle must not be null.
+ */
+export template <worker_context Context>
+constexpr void execute(Context &context, steal_handle<Context> handle) {
+
+  LF_ASSUME(handle);
+
+  if (thread_local_context<Context> != nullptr) {
+    LF_THROW(execute_error{});
+  }
+
+  thread_local_context<Context> = std::addressof(context);
+
+  defer _ = [] static noexcept -> void {
+    thread_local_context<Context> = nullptr;
+  };
+
+  consume(handle).resume();
+}
+
+} // namespace lf
diff --git a/src/core/final_suspend.cxx b/src/core/final_suspend.cxx
new file mode 100644
index 000000000..b10f54238
--- /dev/null
+++ b/src/core/final_suspend.cxx
@@ -0,0 +1,243 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/compiler.hpp"
+export module libfork.core:final_suspend;
+
+import std;
+
+import libfork.utils;
+
+import :concepts_context;
+import :frame;
+import :handles;
+import :thread_locals;
+
+namespace lf {
+
+template <typename T = void>
+using coro = std::coroutine_handle<T>; // Shorthand; `coro<>` is the type-erased handle
+
+template <worker_context Context>
+using frame_t = frame_type<checkpoint_t<Context>>; // Frame type keyed on the context's checkpoint type
+
+// =============== Extract exception =============== //
+
+/**
+ * @brief Move the stored exception out of `frame`, then clear its exception bit and destroy the slot.
+ */
+template <typename Checkpoint>
+[[nodiscard]]
+constexpr auto extract_exception(frame_type<Checkpoint> *frame) noexcept -> std::exception_ptr {
+
+  LF_ASSUME(frame->exception_bit); // Should only be called if an exception was thrown.
+
+  // Local copy (moved out of the frame's storage)
+  std::exception_ptr except = std::move(*frame->except);
+
+  // Should have been set by stash_current_exception
+  LF_ASSUME(except != nullptr);
+
+  // Clean-up exception state
+  frame->exception_bit = 0;
+  frame->except.destroy();
+
+  return except; // NRVO
+}
+
+// =============== Final =============== //
+
+/**
+ * @brief The full final suspend logic.
+ *
+ * The final suspend logic is fully expressed in this function. In brief:
+ *
+ * - Try to resume parent if a call.
+ * - Try to resume parent if a fork with no stealing.
+ * - Try to resume a stolen forked task if last to complete.
+ *
+ * This function also handles cancellation (of the parent) by iteratively
+ * climbing up the parent chain.
+ *
+ * The same logic is repeated, split in two, as `final_suspend_leading` and
+ * `final_suspend_trailing` so the hot-path code inlines more easily into the final suspend.
+ */
+template <worker_context Context>
+[[nodiscard]]
+constexpr auto final_suspend_full(Context &context, frame_t<Context> *frame) noexcept -> coro<> {
+  for (;;) {
+    // Validate final state
+    LF_ASSUME(frame);
+    LF_ASSUME(frame->kind != category::root);
+    LF_ASSUME(frame->steals == 0);
+    LF_ASSUME(frame->joins == k_u16_max);
+    LF_ASSUME(frame->exception_bit == 0);
+
+    // Local copies (before we destroy frame)
+    category const kind = frame->kind;
+
+    frame_t<Context> *parent = not_null(frame->parent);
+
+    // Before resuming the next (or exiting) we should clean-up the current frame.
+    // Can't use frame from this point onwards
+    frame->handle().destroy();
+
+    if (kind == category::call) {
+      return parent->handle();
+    }
+
+    // Given we are not a call we must be a fork hence, our
+    // parent can't be a root as they can only call.
+    LF_ASSUME(kind == category::fork);
+    LF_ASSUME(parent->kind != category::root);
+
+    if (steal_handle<Context> last_pushed = context.pop()) {
+      // No-one stole continuation, we are the exclusive owner of parent -> just keep ripping!
+      LF_ASSUME(last_pushed == steal_handle<Context>{key(), parent});
+      // This is not a join point so no state (i.e. counters) is guaranteed.
+      return parent->handle();
+    }
+
+    // An owner is a worker who:
+    //
+    // - Created the task.
+    // - OR had the task submitted to them.
+    // - OR won the task at a join.
+    //
+    // An owner of a task owns the stack the task is on.
+    //
+    // As the worker who completed the child task this thread owns the stack the child task was on.
+    //
+    // Either:
+    //
+    // 1. The parent is on the same stack as the child.
+    // 2. OR the parent is on a different stack to the child.
+    //
+    // Case (1) implies: we owned the parent; forked the child task; then the parent was stolen.
+    // Case (2) implies: we stole the parent task; then forked the child; then the parent was stolen.
+    //
+    // Case (2) implies that our stack is empty.
+
+    // As soon as we do the `fetch_sub` below the parent task is no longer safe
+    // to access as it may be resumed and then destroyed by another thread. Hence
+    // we must make copies on-the-stack of any data we may need if we lose the
+    // join race.
+    bool const owner = parent->stack_ckpt == context.stack().checkpoint();
+
+    // As soon as we do the fetch_sub (if we lose) someone may acquire
+    // the stack so we must prepare it for release now.
+    auto release_key = context.stack().prepare_release();
+
+    // Register with parent we have completed this child task.
+    if (parent->atomic_joins().fetch_sub(1, std::memory_order_release) == 1) {
+      // Parent has reached join and we are the last child task to complete. We
+      // are the exclusive owner of the parent and therefore, we must continue
+      // parent. As we won the race, acquire all writes before resuming.
+      std::atomic_thread_fence(std::memory_order_acquire);
+
+      if (!owner) {
+        // In case of scenario (2) we must acquire the parent's stack.
+        context.stack().acquire(std::as_const(parent->stack_ckpt));
+      }
+
+      // Must reset parent's control block before resuming parent.
+      parent->reset_counters();
+
+      if (parent->stop_requested()) [[unlikely]] {
+        // Don't resume if stopped; discard any stashed exception and finish the parent instead
+        if constexpr (LF_COMPILER_EXCEPTIONS) {
+          if (parent->exception_bit) [[unlikely]] {
+            std::ignore = extract_exception(parent);
+          }
+        }
+        frame = parent;
+        continue;
+      }
+
+      return parent->handle();
+    }
+
+    if (owner) {
+      // We were unable to resume the parent and we were its owner; as the resuming
+      // thread will take ownership of the parent's stack we must give it up.
+      context.stack().release(std::move(release_key));
+    }
+
+    // We did not win the join-race, we cannot dereference the parent pointer now
+    // as the frame may now be freed by the winner. Parent has not reached join
+    // or we are not the last child to complete. We are now out of jobs, we must
+    // yield to the executor.
+
+    // Else, case (2), our stack has no allocations on it, it may be used later.
+    return std::noop_coroutine();
+  }
+}
+
+template <worker_context Context> // Join-race half of final suspend (final_suspend_leading calls it when pop() is empty)
+[[nodiscard]]
+constexpr auto final_suspend_trailing(Context &context, frame_t<Context> *parent) noexcept -> coro<> {
+
+  bool const owner = parent->stack_ckpt == context.stack().checkpoint(); // Copied before the race below
+
+  auto release_key = context.stack().prepare_release();
+
+  if (parent->atomic_joins().fetch_sub(1, std::memory_order_release) == 1) { // Won: last child to complete
+
+    std::atomic_thread_fence(std::memory_order_acquire);
+
+    if (!owner) {
+      context.stack().acquire(std::as_const(parent->stack_ckpt));
+    }
+
+    parent->reset_counters();
+
+    if (parent->stop_requested()) [[unlikely]] {
+      if constexpr (LF_COMPILER_EXCEPTIONS) {
+        if (parent->exception_bit) [[unlikely]] {
+          std::ignore = extract_exception(parent);
+        }
+      }
+      return final_suspend_full<Context>(context, parent); // Parent is stopped: finish it, don't resume it
+    }
+
+    return parent->handle();
+  }
+
+  if (owner) {
+    context.stack().release(std::move(release_key));
+  }
+
+  return std::noop_coroutine(); // Lost the race: parent must not be touched any more
+}
+
+template <worker_context Context> // Hot-path half of final suspend; defers to final_suspend_trailing otherwise
+[[nodiscard]]
+constexpr auto final_suspend_leading(frame_t<Context> *frame) noexcept -> coro<> {
+
+  LF_ASSUME(frame);
+  LF_ASSUME(frame->steals == 0);
+  LF_ASSUME(frame->joins == k_u16_max);
+  LF_ASSUME(frame->exception_bit == 0);
+
+  category const kind = frame->kind; // Local copies: frame is destroyed below
+
+  frame_t<Context> *parent = not_null(frame->parent);
+
+  frame->handle().destroy();
+
+  if (kind == category::call) {
+    return parent->handle();
+  }
+
+  LF_ASSUME(kind == category::fork);
+
+  Context &context = get_tls_context<Context>();
+
+  if (steal_handle<Context> last_pushed = context.pop()) { // Got a handle back: it must be the parent's
+    LF_ASSUME(last_pushed == steal_handle<Context>{key(), parent});
+    return parent->handle();
+  }
+
+  return final_suspend_trailing<Context>(context, parent);
+}
+
+} // namespace lf
diff --git a/src/core/frame.cxx b/src/core/frame.cxx
new file mode 100644
index 000000000..3a12696e2
--- /dev/null
+++ b/src/core/frame.cxx
@@ -0,0 +1,80 @@
+module;
+#include "libfork/__impl/compiler.hpp"
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:frame;
+
+import std;
+
+import libfork.utils;
+
+import :stop;
+
+namespace lf {
+
+enum class category : std::uint8_t {
+  call = 0, // Also the value frame_type::kind is default-initialized to
+  fork,
+  root,
+};
+
+struct frame_base {}; // Empty, non-template base of every frame_type<Checkpoint>
+
+// TODO: make everything (deque etc) allocator aware...
+
+template <typename Checkpoint>
+struct frame_type : frame_base {
+
+  // == Member variables == //
+
+  // TODO: add checked accessors for all the things (including except etc)
+
+  // Only constructed if an exception is thrown (see exception_bit), otherwise uninitialized
+  uninitialized<std::exception_ptr> except;
+
+  frame_type *parent;                 // Not set by the constructor
+  stop_source::stop_token stop_token; // Not set by the constructor
+
+  [[no_unique_address]]
+  Checkpoint stack_ckpt;
+
+  ATOMIC_ALIGN(std::uint32_t) joins = 0;        // Atomic is 32 bits for speed
+  std::uint16_t steals = 0;                     // In debug do overflow checking
+  category kind = static_cast<category>(0);     // Call/Fork/Root
+  ATOMIC_ALIGN(std::uint8_t) exception_bit = 0; // Atomically set
+
+  // == Member functions == //
+
+  // `joins` is deliberately assigned in the body (post construction) and not in the
+  // initializer list: this allows the compiler to emit a single instruction for the zero
+  // init then an instruction for the joins init, instead of three instructions.
+  explicit constexpr frame_type(Checkpoint &&ckpt) noexcept(std::is_nothrow_move_constructible_v<Checkpoint>)
+      : stack_ckpt(std::move(ckpt)) {
+    joins = k_u16_max;
+  }
+
+  [[nodiscard]]
+  constexpr auto stop_requested() const noexcept -> bool {
+    // TODO: Should exception trigger stop?
+    return stop_token.stop_requested();
+  }
+
+  [[nodiscard]]
+  constexpr auto handle() LF_HOF(std::coroutine_handle<frame_type>::from_promise(*this))
+
+  [[nodiscard]]
+  constexpr auto atomic_joins() noexcept -> std::atomic_ref<std::uint32_t> {
+    return std::atomic_ref{joins};
+  }
+
+  [[nodiscard]]
+  constexpr auto atomic_except() noexcept -> std::atomic_ref<std::uint8_t> {
+    return std::atomic_ref{exception_bit};
+  }
+
+  constexpr void reset_counters() noexcept { // Back to the values a freshly constructed frame has
+    joins = k_u16_max;
+    steals = 0;
+  }
+};
+
+} // namespace lf
diff --git a/src/core/handles.cxx b/src/core/handles.cxx
new file mode 100644
index 000000000..5f9ce4ddb
--- /dev/null
+++ b/src/core/handles.cxx
@@ -0,0 +1,72 @@
+export module libfork.core:handles;
+
+import libfork.utils;
+
+import :frame;
+
+namespace lf {
+
+// =================== Untyped handles =================== //
+
+class handle { // Thin wrapper around a (possibly null) frame_base pointer
+ public:
+  constexpr handle() = default;
+  constexpr handle(key_t, frame_base *ptr) noexcept
+      : m_ptr{ptr} {}
+  constexpr auto operator==(handle const &) const noexcept -> bool = default;
+  constexpr explicit operator bool() const noexcept { return m_ptr != nullptr; }
+
+ private:
+  [[nodiscard]] // Hidden friend: needs a key_t tag argument to be called
+  constexpr friend auto get(key_t, handle h) noexcept -> frame_base * {
+    return h.m_ptr;
+  }
+
+  frame_base *m_ptr = nullptr;
+};
+
+/**
+ * @brief An untyped steal-handle.
+ *
+ * For use by context policies that need to store handles in an untyped manner.
+ */
+export struct unsafe_steal_handle : handle {
+  using handle::handle;
+};
+
+/**
+ * @brief An untyped schedule-handle.
+ *
+ * For use by context policies that need to store handles in an untyped manner.
+ */
+export struct unsafe_sched_handle : handle {
+  using handle::handle;
+};
+
+// =================== Tagged handles =================== //
+
+/**
+ * @brief A handle to a task that can be stolen and resumed with `execute`.
+ *
+ * The coroutine behind this task is always suspended at a fork point.
+ *
+ * @tparam T the Context of the handle.
+ */
+export template <typename T>
+struct steal_handle : unsafe_steal_handle {
+  using unsafe_steal_handle::unsafe_steal_handle;
+};
+
+/**
+ * @brief A handle to a task that can be resumed with `execute`.
+ *
+ * The coroutine behind this task is either not-yet-started or suspended at a context-switch.
+ *
+ * @tparam T the Context of the handle.
+ */
+export template <typename T>
+struct sched_handle : unsafe_sched_handle {
+  using unsafe_sched_handle::unsafe_sched_handle;
+};
+
+} // namespace lf
diff --git a/src/core/lift.cxx b/src/core/lift.cxx
new file mode 100644
index 000000000..a5e627a82
--- /dev/null
+++ b/src/core/lift.cxx
@@ -0,0 +1,90 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:lift;
+
+import libfork.utils;
+
+import :task;
+import :ops;
+import :final_suspend;
+import :awaitables;
+
+namespace lf {
+
+template <typename T> // Lvalue references are kept as references; anything else is stored as a cv-ref-stripped value
+using lift_store_t = std::conditional_t<std::is_lvalue_reference_v<T>, T, std::remove_cvref_t<T>>;
+
+struct lift_impl {
+ private:
+  template <typename Fn, typename Context, typename... Args>
+  using task_t = task<std::invoke_result_t<Fn, Args...>, Context>;
+
+  template <typename Fn, typename Context, typename... Args> // Parameters are stored per lift_store_t
+  static auto impl(lift_store_t<Fn> fn, lift_store_t<Args>... args) -> task_t<Fn, Context, Args...> {
+    co_return std::invoke(static_cast<Fn>(fn), static_cast<Args>(args)...); // Casts restore value categories
+  }
+
+ public:
+  template <typename Fn, typename Context, typename... Args>
+    requires std::invocable<Fn &&, Args &&...>
+  static auto operator()(env<Context>, Fn &&fn, Args &&...args) -> task_t<Fn &&, Context, Args &&...> {
+    return impl<Fn &&, Context, Args &&...>(LF_FWD(fn), LF_FWD(args)...);
+  }
+};
+
+// TODO: merge fn in ops to args
+
+/**
+ * @brief Lifts a synchronous function into an asynchronous task.
+ *
+ * Forked lifted tasks capture rvalues by value.
+ * Called lifted tasks have an optimized path that avoids creating a new task (`lifted_awaitable`).
+ *
+ * Both invocations respect cancellation and push exceptions to the parent scope.
+ */
+export inline constexpr lift_impl lift{};
+
+/**
+ * @brief Runs a called (non-forked) lifted function inline in `await_resume`, without ever suspending.
+ */
+template <worker_context Context, bool StopToken, typename R, typename Fn, typename... Args>
+struct lifted_awaitable : std::suspend_never {
+
+  [[no_unique_address]] // Type is qualified: a bare `pkg` would change meaning once the member is declared
+  lf::pkg<category::call, StopToken, Context, R, Fn, Args...> pkg;
+
+  frame_t<Context> *parent;
+
+  constexpr void await_resume() noexcept {
+
+    // Noop if stop has been requested (own token if the pkg carries one, else the parent's).
+    if constexpr (StopToken) {
+      if (pkg.stop_token.stop_requested()) {
+        return;
+      }
+    } else {
+      if (parent->stop_requested()) {
+        return;
+      }
+    }
+
+    LF_TRY {
+      if constexpr (std::is_void_v<R>) {
+        std::move(pkg.args).apply([](lift_impl, auto &&fn, auto &&...args) -> void {
+          std::invoke(LF_FWD(fn), LF_FWD(args)...);
+        });
+      } else {
+        std::move(pkg.args).apply([addr = pkg.return_addr](lift_impl, auto &&fn, auto &&...args) -> void {
+          LF_ASSUME(addr);
+          *addr = std::invoke(LF_FWD(fn), LF_FWD(args)...);
+        });
+      }
+    } LF_CATCH(...) {
+      stash_current_exception(parent); // Exceptions are handed to the parent frame, not propagated
+    }
+  }
+};
+
+} // namespace lf
diff --git a/src/core/ops.cxx b/src/core/ops.cxx
new file mode 100644
index 000000000..13e4c9d55
--- /dev/null
+++ b/src/core/ops.cxx
@@ -0,0 +1,208 @@
+module;
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:ops;
+
+import std;
+
+import libfork.utils;
+
+import :concepts_invocable;
+import :frame;
+import :stop;
+
+namespace lf {
+
+// Placeholder types for absent optional fields.
+struct no_stop_t {}; // Type of pkg::stop_token when StopToken is false
+struct no_ret_t {};  // Type of pkg::return_addr when R is void
+
+// =============== Value-or-reference storage policy =============== //
+
+// For rvalue-reference arguments that are trivially copyable, fit in two pointer-sized
+// words and are not over-aligned, store by value inside pkg instead of keeping a reference.
+// This lets [[no_unique_address]] collapse empty functors to zero bytes and
+// allows the compiler to treat the stored values as local data (no aliasing).
+template <typename T>
+concept small_trivially_copyable = !std::is_reference_v<T>                     //
+                                   && std::is_trivially_copyable_v<T>          //
+                                   && sizeof(T) <= 2 * sizeof(void *)          //
+                                   && alignof(T) <= alignof(std::max_align_t); //
+
+// Only collapses rvalue refs; lvalue refs are kept as-is to preserve reference semantics.
+template <typename T>
+using store_as_t =
+    std::conditional_t<std::is_rvalue_reference_v<T> && small_trivially_copyable<std::remove_cvref_t<T>>,
+                       std::remove_cvref_t<T>,
+                       T>;
+
+// clang-format off
+
+template <category Cat, bool StopToken, typename Context, typename R, typename Fn, typename... Args>
+struct [[nodiscard("You should immediately co_await this!")]] pkg { // Aggregate built by the scope fork/call functions
+  [[no_unique_address]] std::conditional_t<StopToken, stop_source::stop_token, no_stop_t> stop_token;
+  [[no_unique_address]] std::conditional_t<std::is_void_v<R>, no_ret_t, R *> return_addr;
+  [[no_unique_address]] tuple<Fn, Args...> args; // Callable first, then its arguments
+};
+
+// clang-format on
+
+// =============== Join =============== //
+
+struct join_type {}; // Tag returned by scope_base::join()
+
+/**
+ * @brief Base class shared by scope_ops and child_scope_ops.
+ *
+ * Provides a member `join()` so that `co_await sc.join()` works on any scope type.
+ */
+struct scope_base {
+  [[nodiscard("You should immediately co_await this!")]]
+  static constexpr auto join() noexcept -> join_type {
+    return {};
+  }
+};
+
+// =============== Scope ops (no embedded stop source) =============== //
+
+template <typename Context>
+struct scope_ops : scope_base {
+ private: // pkg aliases: StopToken = false, callable/arguments stored per store_as_t
+  template <typename R, typename Fn, typename... Args>
+  using call_pkg = pkg<category::call, false, Context, R, store_as_t<Fn &&>, store_as_t<Args &&>...>;
+
+  template <typename R, typename Fn, typename... Args>
+  using fork_pkg = pkg<category::fork, false, Context, R, store_as_t<Fn &&>, store_as_t<Args &&>...>;
+
+ public:
+  // Default constructible
+  scope_ops() noexcept = default;
+
+  // Immovable
+  scope_ops(const scope_ops &) = delete;
+  scope_ops(scope_ops &&) = delete;
+  auto operator=(const scope_ops &) -> scope_ops & = delete;
+  auto operator=(scope_ops &&) -> scope_ops & = delete;
+
+  // === Fork (only builds the pkg; nothing runs here) === //
+
+  template <typename R, typename... Args, async_invocable_to<R, Context, Args...> Fn>
+  static constexpr auto fork(R *ret, Fn &&fn, Args &&...args) noexcept -> fork_pkg<R, Fn, Args...> {
+    return {.return_addr = ret, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable<Context, Args...> Fn>
+  static constexpr auto fork_drop(Fn &&fn, Args &&...args) noexcept -> fork_pkg<void, Fn, Args...> {
+    return {.return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable_to<void, Context, Args...> Fn>
+  static constexpr auto fork(Fn &&fn, Args &&...args) noexcept -> fork_pkg<void, Fn, Args...> {
+    return {.return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+
+  // === Call (only builds the pkg; nothing runs here) === //
+
+  template <typename R, typename... Args, async_invocable_to<R, Context, Args...> Fn>
+  static constexpr auto call(R *ret, Fn &&fn, Args &&...args) noexcept -> call_pkg<R, Fn, Args...> {
+    return {.return_addr = ret, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable<Context, Args...> Fn>
+  static constexpr auto call_drop(Fn &&fn, Args &&...args) noexcept -> call_pkg<void, Fn, Args...> {
+    return {.return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable_to<void, Context, Args...> Fn>
+  static constexpr auto call(Fn &&fn, Args &&...args) noexcept -> call_pkg<void, Fn, Args...> {
+    return {.return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+};
+
+// ==== Scope awaitable ==== //
+
+template <worker_context Context>
+struct scope_awaitable : std::suspend_never { // Never suspends; awaiting it yields a fresh scope_ops<Context>
+  static constexpr auto await_resume() noexcept -> scope_ops<Context> { return {}; }
+};
+
+struct scope_type {}; // Empty tag returned by scope()
+
+export [[nodiscard("You should immediately co_await this!")]]
+constexpr auto scope() noexcept -> scope_type {
+  return {};
+}
+
+// =============== Child scope ops (with embedded stop source) =============== //
+
+/**
+ * @brief A scope that is itself a stop_source; every pkg it builds carries a token from it.
+ */
+template <typename Context>
+struct child_scope_ops : scope_base, stop_source {
+ private: // pkg aliases: StopToken = true, callable/arguments stored per store_as_t
+  template <typename R, typename Fn, typename... Args>
+  using call_pkg = pkg<category::call, true, Context, R, store_as_t<Fn &&>, store_as_t<Args &&>...>;
+
+  template <typename R, typename Fn, typename... Args>
+  using fork_pkg = pkg<category::fork, true, Context, R, store_as_t<Fn &&>, store_as_t<Args &&>...>;
+
+ public:
+  /**
+   * @brief Construct the scope, chaining its stop source onto the parent's token.
+   */
+  explicit constexpr child_scope_ops(stop_source::stop_token parent) noexcept
+      : stop_source(parent) {}
+
+  // Immovable (stop_source base is immovable)
+  child_scope_ops(const child_scope_ops &) = delete;
+  child_scope_ops(child_scope_ops &&) = delete;
+  auto operator=(const child_scope_ops &) -> child_scope_ops & = delete;
+  auto operator=(child_scope_ops &&) -> child_scope_ops & = delete;
+
+  // === Fork (the pkg carries this scope's token() as its stop token) === //
+
+  template <typename R, typename... Args, async_invocable_to<R, Context, Args...> Fn>
+  constexpr auto fork(R *ret, Fn &&fn, Args &&...args) noexcept -> fork_pkg<R, Fn, Args...> {
+    return {.stop_token = token(), .return_addr = ret, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable<Context, Args...> Fn>
+  constexpr auto fork_drop(Fn &&fn, Args &&...args) noexcept -> fork_pkg<void, Fn, Args...> {
+    return {.stop_token = token(), .return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable_to<void, Context, Args...> Fn>
+  constexpr auto fork(Fn &&fn, Args &&...args) noexcept -> fork_pkg<void, Fn, Args...> {
+    return {.stop_token = token(), .return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+
+  // === Call (the pkg carries this scope's token() as its stop token) === //
+
+  template <typename R, typename... Args, async_invocable_to<R, Context, Args...> Fn>
+  constexpr auto call(R *ret, Fn &&fn, Args &&...args) noexcept -> call_pkg<R, Fn, Args...> {
+    return {.stop_token = token(), .return_addr = ret, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable<Context, Args...> Fn>
+  constexpr auto call_drop(Fn &&fn, Args &&...args) noexcept -> call_pkg<void, Fn, Args...> {
+    return {.stop_token = token(), .return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+  template <typename... Args, async_invocable_to<void, Context, Args...> Fn>
+  constexpr auto call(Fn &&fn, Args &&...args) noexcept -> call_pkg<void, Fn, Args...> {
+    return {.stop_token = token(), .return_addr = {}, .args = {LF_FWD(fn), LF_FWD(args)...}};
+  }
+};
+
+// =============== child_scope_awaitable =============== //
+
+template <worker_context Context>
+struct child_scope_awaitable : std::suspend_never {
+
+  stop_source::stop_token parent_stop_token; // Passed to the child_scope_ops constructor in await_resume
+
+  constexpr auto await_resume(this child_scope_awaitable self) noexcept -> child_scope_ops<Context> {
+    return child_scope_ops<Context>{self.parent_stop_token};
+  }
+};
+
+struct child_scope_type {}; // Empty tag returned by child_scope()
+
+export [[nodiscard("You should immediately co_await this!")]]
+constexpr auto child_scope() noexcept -> child_scope_type {
+  return {};
+}
+
+} // namespace lf
diff --git a/src/core/poly_context.cxx b/src/core/poly_context.cxx
new file mode 100644
index 000000000..f651d11b0
--- /dev/null
+++ b/src/core/poly_context.cxx
@@ -0,0 +1,58 @@
+module;
+#include "libfork/__impl/exception.hpp"
+export module libfork.core:poly_context;
+
+import std;
+
+import :concepts_stack;
+import :handles;
+import :exception;
+
+namespace lf {
+
+export template <worker_stack Stack>
+class base_context { // Holds the Stack by value; constructors are protected, so it is only usable as a base
+ public:
+  auto stack() noexcept -> Stack & { return m_stack; }
+
+ protected:
+  constexpr base_context() = default;
+
+  template <typename... Args> // Forwards to Stack's constructor; explicit only for the one-argument form
+    requires std::constructible_from<Stack, Args...>
+  explicit(sizeof...(Args) ==
+           1) constexpr base_context(Args &&...args) noexcept(std::is_nothrow_constructible_v<Stack, Args...>)
+      : m_stack(std::forward<Args>(args)...) {}
+
+ private:
+  Stack m_stack;
+};
+
+export struct post_error final : libfork_exception { // Raised by the default poly_context::post()
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "derived context does not support posting tasks.";
+  }
+};
+
+/**
+ * @brief A worker context polymorphic in push/pop/post.
+ *
+ * This is the canonical/blessed base class in libfork for polymorphic use
+ * cases. Although possible, libfork does not recommend contexts polymorphic
+ * in the `.stack` member.
+ */
+export template <worker_stack Stack>
+class poly_context : public base_context<Stack> {
+ public:
+  using base_context<Stack>::base_context;
+
+  virtual void push(steal_handle<poly_context>) = 0;
+  virtual auto pop() noexcept -> steal_handle<poly_context> = 0;
+
+  virtual void post([[maybe_unused]] sched_handle<poly_context> handle) { LF_THROW(post_error{}); }
+
+  virtual ~poly_context() noexcept = default;
+};
+
+} // namespace lf
diff --git a/src/core/projected.cxx b/src/core/projected.cxx
new file mode 100644
index 000000000..db2781a48
--- /dev/null
+++ b/src/core/projected.cxx
@@ -0,0 +1,87 @@
+export module libfork.core:projected;
+
+import std;
+
+import :concepts_invocable;
+import :concepts_context;
+import :concepts_indirect;
+
+namespace lf {
+
+template <typename I>
+struct conditional_difference_type : indirect_value_customization {}; // Fallback: no difference_type member
+
+template <std::weakly_incrementable I>
+struct conditional_difference_type<I> : indirect_value_customization {
+  using difference_type = std::iter_difference_t<I>; // Only provided when I is weakly incrementable
+};
+
+// C++26 ADL firewalled implementation.
+template <typename I, typename Fn, typename Context>
+struct projected_impl;
+
+// sync: primary template, used when the async specialization's constraint is not satisfied
+template <typename I, typename Fn, typename>
+struct projected_impl {
+  struct type : conditional_difference_type<I> {
+   private:
+    friend struct indirect_value<type>;
+    using indirect_value_type = std::invoke_result_t<Fn &, indirect_value_t<I>>;
+
+    using reference_type = std::invoke_result_t<Fn &, std::iter_reference_t<I>>;
+
+   public:
+    using value_type = std::remove_cvref_t<reference_type>;
+    auto operator*() const -> reference_type; // Declared only; no definition is given here
+  };
+};
+
+// async: same shape as the primary template, but result types come from async_result_t
+template <typename I, typename Fn, typename Context>
+  requires async::indirectly_unary_invocable<Fn, Context, I>
+struct projected_impl<I, Fn, Context> {
+  struct type : conditional_difference_type<I> {
+   private:
+    friend struct indirect_value<type>;
+    using indirect_value_type = async_result_t<Fn &, Context, indirect_value_t<I>>;
+
+    using reference_type = async_result_t<Fn &, Context, std::iter_reference_t<I>>;
+
+   public:
+    using value_type = std::remove_cvref_t<reference_type>;
+    auto operator*() const -> reference_type; // Declared only; no definition is given here
+  };
+};
+
+template <typename Fn, typename Context, typename... T> // For every T: async-invocable, result default-initializable
+concept async_defaultable =
+    ((async_invocable<Fn, Context, T> && std::default_initializable<async_result_t<Fn, Context, T>>) && ...);
+
+template <typename Fn, typename Context, typename I> // async_defaultable over I's reference and indirect-value types
+concept async_defaultable_impl =
+    async_defaultable<Fn, Context, std::iter_reference_t<I>, indirect_value_t<I>>;
+
+template <typename Fn, typename Context, typename I> // Vacuously true when Fn is not an async regular invocable over I
+concept indirect_async_defaultable =
+    !async::indirectly_regular_unary_invocable<Fn, Context, I> || async_defaultable_impl<Fn, Context, I>;
+
+/**
+ * @brief Test if `I` can be projected through `Fn` in the context of `Context`
+ *
+ * This requires the standard indirectly regular invocable and, in addition,
+ * async projections must return a default initializable type.
+ *
+ * Projectable && async::indirectly_unary_invocable implies an async projection
+ * with default initializable async result type.
+ */
+export template <typename Fn, typename Context, typename I>
+concept projectable =
+    indirectly_regular_unary_invocable<Fn, Context, I> && indirect_async_defaultable<Fn, Context, I>;
+
+/**
+ * @brief A version of `std::projected` that supports both regular invocables and async invocables.
+ */
+export template <worker_context Context, std::indirectly_readable I, projectable<Context, I> Fn>
+using projected = projected_impl<I, Fn, Context>::type;
+
+} // namespace lf
diff --git a/src/core/promise.cxx b/src/core/promise.cxx
new file mode 100644
index 000000000..921d232a7
--- /dev/null
+++ b/src/core/promise.cxx
@@ -0,0 +1,248 @@
+module;
+#include <version>
+
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+#include "libfork/__impl/utils.hpp"
+export module libfork.core:promise;
+
+import std;
+
+import libfork.utils;
+
+import :concepts_awaitable;
+import :concepts_context;
+import :concepts_invocable;
+import :frame;
+import :stop;
+import :task;
+import :thread_locals;
+import :ops;
+import :handles;
+import :final_suspend;
+import :awaitables;
+import :lift;
+
+// TODO: vet constexpr usage in the library
+
+namespace lf {
+
+// =============== Final awaitable =============== //
+
+struct final_awaitable : std::suspend_always { // Awaiter returned by mixin_frame::final_suspend().
+  template <returnable T, worker_context Context>
+  constexpr static auto await_suspend(coro<promise_type<T, Context>> handle) noexcept -> coro<> {
+    return final_suspend_leading<Context>(&handle.promise().frame); // resume whichever handle it returns
+  }
+};
+
+// =============== Frame mixin =============== //
+
+template <worker_context Context>
+struct mixin_frame {
+  // Promise base shared by promise_type<T, Context>; members reach the derived promise via `this auto &self`.
+  // === For internal use === //
+
+  using enum category;
+
+  template <typename Self>
+    requires (!std::is_const_v<Self>)
+  [[nodiscard]]
+  constexpr auto handle(this Self &self)
+      LF_HOF(coro<Self>::from_promise(self))
+
+  // === Called by the compiler === //
+
+  // --- Allocation
+
+  static auto operator new(std::size_t sz) noexcept(noexcept(get_tls_stack<Context>().push(sz))) -> void * {
+    void *ptr = get_tls_stack<Context>().push(sz);
+    LF_ASSUME(is_sufficiently_aligned<k_new_align>(ptr));
+    return std::assume_aligned<k_new_align>(ptr);
+  }
+
+  static auto operator delete(void *p, std::size_t sz) noexcept -> void {
+    get_tls_stack<Context>().pop(p, sz);
+  }
+
+  // --- Await transformations
+
+  // Fork/call
+  template <category Cat, bool StopToken, typename R, typename Fn, typename... Args>
+  constexpr auto
+  await_transform(this auto &self, pkg<Cat, StopToken, Context, R, Fn, Args...> &&pkg) noexcept {
+    LF_TRY {
+      return self.await_transform_pkg(std::move(pkg));
+    } LF_CATCH_ALL {
+      stash_current_exception(&self.frame);
+    }
+    return async_awaitable<Cat, Context>{.child = nullptr};
+  }
+
+  // Specialization for lifted functions
+  template <bool StopToken, typename R, typename Fn, typename... Args>
+    requires std::same_as<std::remove_cvref_t<Fn>, lift_impl>
+  constexpr auto
+  await_transform(this auto &self, pkg<category::call, StopToken, Context, R, Fn, Args...> &&pkg) noexcept {
+    return lifted_awaitable<Context, StopToken, R, Fn, Args...>{
+        .pkg = std::move(pkg),
+        .parent = &self.frame,
+    };
+  }
+
+  // Custom awaitable
+  template <awaitable<Context> T>
+  static constexpr auto await_transform(T &&x)
+      LF_HOF(switch_awaitable_for<Context>(acquire_awaitable(LF_FWD(x))))
+
+  // Join
+  constexpr auto await_transform(this auto &self, join_type) noexcept -> join_awaitable<Context> {
+    return {.frame = &self.frame};
+  }
+
+  // Scope getter (propagate stop token)
+  static constexpr auto await_transform(scope_type) noexcept -> scope_awaitable<Context> { return {}; }
+
+  // Scope getter (new attached stop token)
+  constexpr auto
+  await_transform(this auto const &self, child_scope_type) noexcept -> child_scope_awaitable<Context> {
+    return {.parent_stop_token = self.frame.stop_token};
+  }
+
+  // --- Other
+
+  constexpr static auto initial_suspend() noexcept -> std::suspend_always { return {}; }
+
+  constexpr static auto final_suspend() noexcept -> final_awaitable { return {}; }
+
+  constexpr void unhandled_exception(this auto &self) noexcept {
+    // Stash the exception in the parent which will rethrow at the join.
+    stash_current_exception(self.frame.parent);
+  }
+
+ private:
+  template <category Cat, bool StopToken, typename R, typename Fn, typename... Args>
+  constexpr auto
+  await_transform_pkg(this auto const &self, pkg<Cat, StopToken, Context, R, Fn, Args...> &&pkg) noexcept(
+      async_nothrow_invocable<Fn, Context, Args...>) -> async_awaitable<Cat, Context> {
+    // Apply ctx_invoke_t<Context> to the packaged arguments to get the child promise, then wire it up.
+    using U = async_result_t<Fn, Context, Args...>;
+
+    promise_type<U, Context> *child_promise = get(key(), std::move(pkg.args).apply(ctx_invoke_t<Context>{}));
+
+    LF_ASSUME(child_promise);
+
+    // void can signal drop return.
+    static_assert(std::same_as<R, U> || std::is_void_v<R>);
+
+    if constexpr (!std::is_void_v<R>) {
+      child_promise->return_address = not_null(pkg.return_addr);
+    } else if constexpr (!std::is_void_v<U>) {
+      // Set child's return address to null to inhibit the return
+      child_promise->return_address = nullptr;
+    }
+
+    if constexpr (StopToken) {
+      // TODO: need some kind of API to launch an unstoppable task?
+      LF_ASSUME(pkg.stop_token.stop_possible());
+      child_promise->frame.stop_token = pkg.stop_token;
+    } else {
+      child_promise->frame.stop_token = self.frame.stop_token;
+    }
+
+    return {.child = &child_promise->frame};
+  }
+};
+
+// =============== Promise (void) =============== //
+
+template <worker_context Context>
+struct promise_type<void, Context> : mixin_frame<Context> {
+  // Specialization for coroutines returning task<void, Context>: no return address is stored.
+  // Putting init here allows:
+  //  1. Frame not to need to know about the checkpoint type
+  //  2. Compiler merge double read of thread local here and in allocator
+  frame_t<Context> frame{get_tls_stack<Context>().checkpoint()};
+
+  constexpr auto get_return_object() noexcept -> task<void, Context> { return {key(), this}; }
+
+  constexpr static void return_void() noexcept {}
+};
+
+// =============== Promise (non-void) =============== //
+
+template <returnable T, worker_context Context>
+struct promise_type : mixin_frame<Context> {
+  // Promise for coroutines returning task<T, Context> with a non-void T.
+  // Putting init here allows:
+  //  1. Frame not to need to know about the checkpoint type
+  //  2. Compiler merge double read of thread local here and in allocator
+  frame_t<Context> frame{get_tls_stack<Context>().checkpoint()};
+  T *return_address; // no initializer: assigned by the launcher (await_transform_pkg / root_pkg); null drops the result
+
+  constexpr auto get_return_object() noexcept -> task<T, Context> { return {key(), this}; }
+
+  template <typename U = T>
+    requires std::assignable_from<T &, U &&>
+  constexpr void return_value(U &&value) noexcept(std::is_nothrow_assignable_v<T &, U &&>) {
+    if (return_address) { // a null address means the caller asked for the value to be discarded
+      *return_address = LF_FWD(value);
+    }
+  }
+};
+
+} // namespace lf
+
+// =============== std specialization =============== //
+
+template <typename R, lf::worker_context Context, typename... Args>
+struct std::coroutine_traits<lf::task<R, Context>, Args...> { // coroutines returning lf::task use lf::promise_type
+  using promise_type = ::lf::promise_type<R, Context>;
+};
+
+template <typename R, typename Self, lf::worker_context Context, typename... Args>
+struct std::coroutine_traits<lf::task<R, Context>, Self, Args...> { // same mapping; presumably for a leading object parameter — confirm
+  using promise_type = ::lf::promise_type<R, Context>;
+};
+
+// =============== Layout invariants =============== //
+
+// clang-format off
+
+namespace {
+// Declaration-only stand-in types, used solely to instantiate the templates in the static_asserts below.
+struct unit_checkpoint {
+  auto operator==(unit_checkpoint const &) const -> bool = default;
+};
+
+struct unit_stack {
+  static auto push(std::size_t) -> void *;
+  static auto pop(void *, std::size_t) noexcept -> void;
+  static auto checkpoint() noexcept -> unit_checkpoint;
+  static auto prepare_release() noexcept -> int;
+  static auto release(int) noexcept -> void;
+  static auto acquire(unit_checkpoint) noexcept -> void;
+};
+
+struct unit_context {
+  void push(lf::steal_handle<unit_context>);
+  auto pop() noexcept -> lf::steal_handle<unit_context>;
+  auto stack() noexcept -> unit_stack &;
+};
+
+static_assert(lf::worker_context<unit_context>);
+
+using frame_t = lf::frame_type<unit_checkpoint>;
+
+static_assert(std::is_standard_layout_v<frame_t>);
+static_assert(alignof(lf::promise_type<void, unit_context>) == alignof(frame_t));
+static_assert(alignof(lf::promise_type<int, unit_context>) == alignof(frame_t));
+static_assert(std::is_standard_layout_v<lf::promise_type<void, unit_context>>);
+static_assert(std::is_standard_layout_v<lf::promise_type<int, unit_context>>);
+
+#ifdef __cpp_lib_is_pointer_interconvertible
+static_assert(std::is_pointer_interconvertible_with_class(&lf::promise_type<void, unit_context>::frame));
+static_assert(std::is_pointer_interconvertible_with_class(&lf::promise_type<int, unit_context>::frame));
+#endif
+
+} // namespace
diff --git a/src/core/receiver.cxx b/src/core/receiver.cxx
new file mode 100644
index 000000000..bfa0f42c7
--- /dev/null
+++ b/src/core/receiver.cxx
@@ -0,0 +1,214 @@
+
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.core:receiver;
+
+import std;
+
+import :stop;
+import :exception;
+
+namespace lf {
+
+export struct broken_receiver_error final : libfork_exception { // thrown by receiver members when it holds no state
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "receiver is in invalid state";
+  }
+};
+
+export struct operation_cancelled_error final : libfork_exception { // thrown by receiver::get() after a stop request
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "operation was cancelled";
+  }
+};
+
+/**
+ * @brief Shared state between a scheduled task and its receiver handle.
+ */
+template <typename T, bool Stoppable = false>
+struct hidden_receiver_state {
+  // Distinct empty placeholder types for the two conditional members below.
+  struct empty_1 {};
+  struct empty_2 {};
+
+  alignas(k_new_align) std::array<std::byte, 1024> buffer{}; // storage handed out by root_task's operator new
+
+  [[no_unique_address]]
+  std::conditional_t<std::is_void_v<T>, empty_1, T> return_value{};
+
+  std::exception_ptr exception; // written by root_pkg, rethrown by receiver::get()
+  std::atomic_flag ready;       // set by root_pkg once the task has finished
+
+  [[no_unique_address]]
+  std::conditional_t<Stoppable, stop_source, empty_2> stop;
+
+  constexpr hidden_receiver_state() = default;
+
+  template <typename... Args>
+    requires std::constructible_from<T, Args...>
+  constexpr explicit(sizeof...(Args) == 1)
+      hidden_receiver_state(Args &&...args) noexcept(std::is_nothrow_constructible_v<T, Args...>)
+      : return_value(std::forward<Args>(args)...) {}
+};
+
+/// Convenience alias for a shared_ptr to the receiver state — used throughout the core partitions.
+template <typename T, bool Stoppable = false>
+using state_handle = std::shared_ptr<hidden_receiver_state<T, Stoppable>>;
+
+/**
+ * @brief Lightweight move-only handle owning a pre-allocated root task state.
+ *
+ * Construction allocates a `hidden_receiver_state<T, Stoppable>` which embeds a
+ * 1 KiB aligned buffer; the root coroutine frame is placement-constructed
+ * into that buffer by `schedule`.
+ *
+ * Constructors mirror `make_shared` / `allocate_shared`:
+ *
+ *   recv_state<T> s;                               // value-init return value
+ *   recv_state<T> s{v1, v2};                       // in-place init: T(v1, v2)
+ *   recv_state<T> s{allocator_arg, alloc};         // value-init, custom allocator
+ *   recv_state<T> s{allocator_arg, alloc, v1, v2}; // in-place init + custom allocator
+ */
+export template <typename T, bool Stoppable = false>
+class recv_state {
+  using state_type = hidden_receiver_state<T, Stoppable>;
+
+ public:
+  /// Default: value-initialise via `std::make_shared`.
+  constexpr recv_state()
+      : m_ptr(std::make_shared<state_type>()) {}
+
+  /// Value-init from args: forwards `args` to `hidden_receiver_state`'s constructor
+  /// (in-place construction of the return value) via `std::make_shared`.
+  template <typename... Args>
+    requires std::constructible_from<state_type, Args...>
+  constexpr explicit(sizeof...(Args) == 1) recv_state(Args &&...args)
+      : m_ptr(std::make_shared<state_type>(std::forward<Args>(args)...)) {}
+
+  /// Allocator-aware, default return value: allocate via `std::allocate_shared`.
+  template <simple_allocator Alloc>
+  constexpr recv_state(std::allocator_arg_t, Alloc const &alloc)
+      : m_ptr(std::allocate_shared<state_type>(alloc)) {}
+
+  /// Allocator-aware with value-init args.
+  template <simple_allocator Alloc, typename... Args>
+    requires std::constructible_from<state_type, Args...>
+  constexpr recv_state(std::allocator_arg_t, Alloc const &alloc, Args &&...args)
+      : m_ptr(std::allocate_shared<state_type>(alloc, std::forward<Args>(args)...)) {}
+
+  // Move-only.
+  constexpr recv_state(recv_state &&) noexcept = default;
+  constexpr auto operator=(recv_state &&) noexcept -> recv_state & = default;
+  constexpr recv_state(recv_state const &) = delete;
+  constexpr auto operator=(recv_state const &) -> recv_state & = delete;
+
+ private:
+  [[nodiscard]] // Key-gated extraction: moves the shared_ptr out, leaving `self` empty.
+  friend constexpr auto get(key_t, recv_state &&self) noexcept -> state_handle<T, Stoppable> {
+    return std::move(self.m_ptr);
+  }
+
+  state_handle<T, Stoppable> m_ptr;
+};
+
+export template <typename T, bool Stoppable = false>
+class receiver {
+  // Consumer-side, move-only handle to the shared state of a scheduled task (built by schedule()).
+  using state_type = hidden_receiver_state<T, Stoppable>;
+
+ public:
+  constexpr receiver(key_t, state_handle<T, Stoppable> state) noexcept
+      : m_state(std::move(state)) {}
+
+  // Move only
+  constexpr receiver(receiver &&) noexcept = default;
+  constexpr receiver(const receiver &) = delete;
+  constexpr auto operator=(receiver &&) noexcept -> receiver & = default;
+  constexpr auto operator=(const receiver &) -> receiver & = delete;
+
+  /**
+   * @brief Test if connected to a receiver state.
+   */
+  [[nodiscard]]
+  constexpr auto valid() const noexcept -> bool {
+    return m_state != nullptr;
+  }
+
+  /**
+   * @brief Test if the associated task has completed (either successfully or with an exception/cancellation).
+   */
+  [[nodiscard]]
+  constexpr auto ready() const -> bool {
+    if (!valid()) {
+      LF_THROW(broken_receiver_error{});
+    }
+    return m_state->ready.test();
+  }
+
+  /**
+   * @brief Wait for the associated task to complete (either successfully or with an exception/cancellation).
+   *
+   * May be called multiple times.
+   */
+  constexpr void wait() const {
+    if (!valid()) {
+      LF_THROW(broken_receiver_error{});
+    }
+    m_state->ready.wait(false);
+  }
+
+  /**
+   * @brief Get a reference to the stop_source for this task, allowing the caller to request cancellation.
+   *
+   * Only available when Stoppable=true. The return type is qualified because this member hides the type name.
+   */
+  [[nodiscard]]
+  constexpr auto stop_source() -> lf::stop_source &
+    requires Stoppable
+  {
+    if (!valid()) {
+      LF_THROW(broken_receiver_error{});
+    }
+    return m_state->stop;
+  }
+
+  /**
+   * @brief Wait for the associated task to complete and return its result, or rethrow.
+   *
+   * If the receiver was cancelled this will throw an exception.
+   *
+   * This may only be called once; the state is consumed and the receiver becomes invalid.
+   */
+  [[nodiscard]]
+  constexpr auto get() && -> T {
+    // wait() throws broken_receiver_error if there is no state, so `m_state` is non-null below.
+    wait();
+
+    // State will be cleaned up on unwind
+    std::shared_ptr state = std::exchange(m_state, nullptr);
+
+    LF_ASSUME(state != nullptr);
+
+    if (state->exception) {
+      std::rethrow_exception(state->exception);
+    }
+
+    if constexpr (Stoppable) {
+      if (state->stop.stop_requested()) {
+        LF_THROW(operation_cancelled_error{});
+      }
+    }
+
+    if constexpr (!std::is_void_v<T>) {
+      return std::move(state->return_value);
+    }
+  }
+
+ private:
+  state_handle<T, Stoppable> m_state;
+};
+
+} // namespace lf
diff --git a/src/core/root.cxx b/src/core/root.cxx
new file mode 100644
index 000000000..12c9b955b
--- /dev/null
+++ b/src/core/root.cxx
@@ -0,0 +1,204 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.core:root;
+
+import std;
+
+import :concepts_context;
+import :concepts_invocable;
+import :frame;
+import :promise;
+import :receiver;
+import :thread_locals;
+import :task;
+import :exception;
+
+namespace lf {
+
+/**
+ * @brief Thrown by root_task's `operator new` if the root coroutine frame is too large for the embedded buffer.
+ */
+export struct root_alloc_error final : libfork_exception {
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "root coroutine frame exceeds hidden_receiver_state buffer size";
+  }
+};
+
+struct get_frame_t {}; // Tag: `co_await get_frame_t{}` in a root_task yields a pointer to the root's own frame.
+
+template <typename Checkpoint>
+struct root_task {
+  struct promise_type {
+    // Promise of the root coroutine; operator new below places the frame in the receiver state's buffer.
+    frame_type<Checkpoint> frame{Checkpoint{}};
+
+    /// Owns a ref to the hidden_receiver_state hosting this frame's buffer.
+    std::shared_ptr<void> keep_alive;
+
+    template <typename R, bool Stoppable, typename... Args>
+    constexpr explicit promise_type(state_handle<R, Stoppable> const &recv, Args const &...) noexcept
+        : keep_alive(recv) {}
+
+    template <typename R, bool Stoppable, typename... Args>
+    static auto
+    operator new(std::size_t size, state_handle<R, Stoppable> const &recv, Args const &...) -> void * {
+
+      LF_ASSUME(recv != nullptr);
+
+      if (size > recv->buffer.size()) {
+        LF_THROW(root_alloc_error{});
+      }
+
+      return recv->buffer.data();
+    }
+
+    /// No-op: the buffer is owned by the hidden_receiver_state, not the frame.
+    static auto operator delete(void * /*ptr*/, std::size_t /*size*/) noexcept -> void {}
+
+    struct frame_awaitable : std::suspend_never {
+      frame_type<Checkpoint> *frame;
+      [[nodiscard]]
+      constexpr auto await_resume() const noexcept -> frame_type<Checkpoint> * {
+        return frame;
+      }
+    };
+
+    constexpr auto await_transform([[maybe_unused]] get_frame_t tag) noexcept -> frame_awaitable {
+      return {.frame = &frame};
+    }
+
+    struct call_awaitable : std::suspend_always {
+      frame_type<Checkpoint> *child;
+      constexpr auto await_suspend([[maybe_unused]] coro<promise_type> root) const noexcept -> coro<> {
+        return child->handle();
+      }
+    };
+
+    constexpr auto await_transform(frame_type<Checkpoint> *child) noexcept -> call_awaitable {
+      return {.child = child};
+    }
+
+    constexpr auto get_return_object() noexcept -> root_task { return {.promise = this}; }
+
+    constexpr static auto initial_suspend() noexcept -> std::suspend_always { return {}; }
+
+    /**
+     * @brief Custom final_suspend.
+     *
+     * The root coroutine frame lives inside the hidden_receiver_state's embedded
+     * buffer, so the hidden_receiver_state must outlive the frame teardown.
+     *
+     *   1. `std::exchange` the keep-alive shared_ptr into a local on the
+     *      host stack, leaving the promise member null.
+     *   2. `handle.destroy()` — runs parameter + promise destructors (including
+     *      the now-null `keep_alive`) and our no-op `operator delete`.
+     *      No frame-memory access occurs after `destroy()` returns.
+     *   3. On return, the stack-local `shared_ptr<void>` dies; if its ref
+     *      was the last, it destroys the hidden_receiver_state cleanly — we are
+     *      no longer executing inside the buffer.
+     */
+    struct final_awaiter : std::suspend_always {
+      void await_suspend(std::coroutine_handle<promise_type> handle) const noexcept {
+        std::shared_ptr<void> local = std::exchange(handle.promise().keep_alive, nullptr);
+        LF_ASSUME(local != nullptr);
+        handle.destroy();
+        // `local` released here — possibly freeing hidden_receiver_state on return.
+      }
+    };
+
+    constexpr static auto final_suspend() noexcept -> final_awaiter { return {}; }
+
+    constexpr static void return_void() noexcept {}
+
+    [[noreturn]]
+    constexpr void unhandled_exception() noexcept {
+      // Any exceptions escaping the root task are a bug.
+      LF_UNREACHABLE();
+    }
+  };
+
+  promise_type *promise;
+};
+
+template <worker_context Context, typename R, bool Stoppable, typename Fn, typename... Args>
+  requires async_invocable_to<Fn, R, Context, Args...>
+[[nodiscard]]
+auto //
+root_pkg(state_handle<R, Stoppable> recv, Fn fn, Args... args) -> root_task<checkpoint_t<Context>> {
+  // Root coroutine body: runs `fn(args...)` as a call-category child and reports the outcome through `recv`.
+  // This should be resumed on a valid context.
+  LF_ASSUME(thread_local_context<Context> != nullptr);
+
+  using checkpoint = checkpoint_t<Context>;
+
+  // Pointer to this root_task's own frame.
+  frame_type<checkpoint> *root = not_null(co_await get_frame_t{});
+
+  // Manual "call" invocation of the user-supplied task.
+
+  using result_type = async_result_t<Fn, Context, Args...>;
+  using promise_type = promise_type<result_type, Context>;
+
+  promise_type *child = nullptr;
+
+  if (root->stop_requested()) {
+    // The root task was cancelled before it even started, we can skip
+    // straight to cleanup.
+    goto cleanup;
+  }
+
+  LF_TRY {
+    // Potentially throwing
+    child = get(key(), ctx_invoke_t<Context>{}(std::move(fn), std::move(args)...));
+  } LF_CATCH_ALL {
+    recv->exception = std::current_exception();
+    goto cleanup;
+  }
+
+  LF_ASSUME(child != nullptr);
+
+  // Propagate parent/stop info to child
+  child->frame.parent = root;
+  child->frame.stop_token = root->stop_token;
+
+  LF_ASSUME(child->frame.kind == category::call);
+
+  if constexpr (!std::is_void_v<result_type>) {
+    child->return_address = std::addressof(recv->return_value);
+  }
+
+  // Begin normal execution of the child task, it will clean itself
+  // up (i.e. .destroy()) at the final suspend
+  co_await &child->frame;
+
+  // Now we have been resumed the child is done, it could have completed via:
+  //
+  // - Normal return
+  // - Exception
+  // - Cancellation (in which case it would have dropped any exceptions)
+  //
+  // For symmetry with a normal task we unconditionally propagate exceptions here,
+  // effectively this is an `await_resume`.
+
+  if constexpr (LF_COMPILER_EXCEPTIONS) {
+    if (root->exception_bit) {
+      // The child threw an exception, propagate it to the receiver.
+      recv->exception = extract_exception(root);
+    }
+  }
+
+cleanup:
+  // Publish completion and wake every waiter: receiver::wait() is const, so several threads may be blocked.
+  recv->ready.test_and_set();
+  recv->ready.notify_all();
+
+  LF_ASSUME(root->steals == 0);
+  LF_ASSUME(root->joins == k_u16_max);
+  LF_ASSUME(root->exception_bit == 0);
+
+  co_return;
+}
+
+} // namespace lf
diff --git a/src/core/schedule.cxx b/src/core/schedule.cxx
new file mode 100644
index 000000000..0a328063f
--- /dev/null
+++ b/src/core/schedule.cxx
@@ -0,0 +1,114 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/compiler.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.core:schedule;
+
+import std;
+
+import :concepts_invocable;
+import :concepts_scheduler;
+import :frame;
+import :stop;
+import :thread_locals;
+import :promise;
+import :root;
+import :handles;
+import :receiver;
+import :exception;
+
+namespace lf {
+
+export struct schedule_error final : libfork_exception { // thrown by schedule() if this thread has a worker context
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "schedule called from within a worker thread!";
+  }
+};
+
+template <typename T> // Satisfied when T converts to std::decay_t<T>, i.e. a decayed copy can be formed.
+concept decay_copyable = std::convertible_to<T, std::decay_t<T>>;
+
+/**
+ * @brief Schedule a function using a caller-provided `recv_state`.
+ *
+ * This will create a root task that stores decayed copies of `Fn` and
+ * `Args...` in its frame, then post it to the scheduler. The root task must
+ * then be resumed by a worker which will perform the invocation of `Fn`.
+ *
+ * The return address/exception and possibly stop token of the root task are
+ * bound to the provided `recv_state` and can be observed by the caller via the
+ * returned `receiver`.
+ *
+ * Strongly exception safe. Throws `schedule_error` if this thread has a worker context; rethrows `post` failures.
+ */
+export template <scheduler Sch, typename R, bool Stoppable, decay_copyable Fn, decay_copyable... Args>
+  requires async_invocable_to<std::decay_t<Fn>, R, context_t<Sch>, std::decay_t<Args>...>
+[[nodiscard("Fire and forget is an anti-pattern")]]
+constexpr auto
+schedule(Sch &&sch, recv_state<R, Stoppable> state, Fn &&fn, Args &&...args) -> receiver<R, Stoppable> {
+
+  using context_type = context_t<Sch>;
+
+  if (thread_local_context<context_type> != nullptr) {
+    LF_THROW(schedule_error{});
+  }
+
+  state_handle<R, Stoppable> state_ptr = get(key(), std::move(state));
+
+  LF_ASSUME(state_ptr != nullptr);
+
+  // root_pkg's operator new may throw root_alloc_error if the frame is
+  // too large; if so, `state_ptr` goes out of scope and destroys the state.
+  root_task task = root_pkg<context_type>(state_ptr, std::forward<Fn>(fn), std::forward<Args>(args)...);
+
+  LF_ASSUME(task.promise != nullptr);
+
+  task.promise->frame.kind = category::root;
+  task.promise->frame.parent = nullptr;
+
+  if constexpr (Stoppable) {
+    task.promise->frame.stop_token = state_ptr->stop.token();
+  } else {
+    task.promise->frame.stop_token = stop_source::stop_token{}; // non-cancellable root
+  }
+
+  LF_TRY {
+    std::forward<Sch>(sch).post(sched_handle<context_type>{key(), &task.promise->frame});
+    // If ^ didn't throw then the root_task will destroy itself at the final suspend.
+  } LF_CATCH_ALL {
+    // Otherwise, if it did throw, we must clean up
+    task.promise->frame.handle().destroy();
+    LF_RETHROW;
+  }
+
+  return {key(), std::move(state_ptr)};
+}
+
+template <typename T> // Result types a default-constructed state can hold: void, or default-initializable and movable.
+concept schedulable_return = std::is_void_v<T> || (std::default_initializable<T> && std::movable<T>);
+
+template <typename Fn, typename Context, typename... Args> // Fn is async-invocable and yields a schedulable_return.
+concept default_schedulable =
+    async_invocable<Fn, Context, Args...> && schedulable_return<async_result_t<Fn, Context, Args...>>;
+
+template <typename Fn, typename Context, typename... Args> // Async result of decayed Fn called with decayed Args.
+using async_decay_result_t = async_result_t<std::decay_t<Fn>, Context, std::decay_t<Args>...>;
+
+/**
+ * @brief Convenience overload that supplies a fresh, non-cancellable `recv_state` itself.
+ *
+ * All allocations go through the default allocator (`make_shared`).
+ */
+export template <scheduler Sch, decay_copyable Fn, decay_copyable... Args>
+  requires default_schedulable<std::decay_t<Fn>, context_t<Sch>, std::decay_t<Args>...>
+[[nodiscard("Fire and forget is an anti-pattern")]]
+constexpr auto
+schedule(Sch &&sch, Fn &&fn, Args &&...args) -> receiver<async_decay_result_t<Fn, context_t<Sch>, Args...>> {
+  using value_t = async_decay_result_t<Fn, context_t<Sch>, Args...>;
+  // The default-constructed state is handed straight to the primary overload.
+  return schedule(std::forward<Sch>(sch), recv_state<value_t, false>{}, //
+                  std::forward<Fn>(fn), std::forward<Args>(args)...);
+}
+
+} // namespace lf
diff --git a/src/core/stop.cxx b/src/core/stop.cxx
new file mode 100644
index 000000000..149b0adde
--- /dev/null
+++ b/src/core/stop.cxx
@@ -0,0 +1,126 @@
+export module libfork.core:stop;
+
+import std;
+
+import libfork.utils;
+
+namespace lf {
+
+/**
+ * @brief Similar to a linked-list of std::stop_source but with an embedded stop_state.
+ */
+export class stop_source {
+ public:
+  /**
+   * @brief Lightweight public handle to a stop_source chain.
+   *
+   * A stop_token is a non-owning pointer-sized wrapper around a stop_source.
+   */
+  class stop_token {
+   public:
+    /**
+     * @brief Construct a null (unstoppable) token.
+     */
+    constexpr stop_token() noexcept = default;
+
+    /**
+     * @brief Returns true if a stop source is associated (stopping is possible).
+     */
+    [[nodiscard]]
+    constexpr auto stop_possible() const noexcept -> bool {
+      return m_src != nullptr;
+    }
+
+    /**
+     * @brief Returns true if any stop source in the ancestor chain has been stopped.
+     *
+     * A null token always returns false.
+     *
+     * Complexity: O(chain depth). Every task that creates a child_scope adds one
+     * node to the chain, so deeply-nested task hierarchies pay proportionally more
+     * per stop check.
+     */
+    [[nodiscard]]
+    constexpr auto stop_requested() const noexcept -> bool {
+      return deep_stop_requested(m_src);
+    }
+
+   private:
+    friend class stop_source;
+
+    explicit constexpr stop_token(stop_source const *src) noexcept
+        : m_src(src) {}
+
+    stop_source const *m_src = nullptr;
+  };
+
+  /**
+   * @brief Construct a root stop source with no parent.
+   */
+  constexpr stop_source() noexcept = default;
+
+  /**
+   * @brief Construct a stop source chained onto the given parent token.
+   */
+  constexpr explicit stop_source(stop_token parent) noexcept
+      : m_parent(parent.m_src) {}
+
+  // Immovable
+  constexpr stop_source(const stop_source &) noexcept = delete;
+  constexpr stop_source(stop_source &&) noexcept = delete;
+  constexpr auto operator=(const stop_source &) noexcept -> stop_source & = delete;
+  constexpr auto operator=(stop_source &&) noexcept -> stop_source & = delete;
+
+  /**
+   * @brief Get a handle to this stop source.
+   */
+  [[nodiscard]]
+  constexpr auto token() const noexcept -> stop_token {
+    return stop_token{this};
+  }
+
+  /**
+   * @brief Returns true if any stop source in the ancestor chain has been stopped.
+   *
+   * Complexity: O(chain depth). Every task that creates a child_scope adds one
+   * node to the chain, so deeply-nested task hierarchies pay proportionally more
+   * per stop check.
+   */
+  [[nodiscard]]
+  constexpr auto stop_requested() const noexcept -> bool {
+    return deep_stop_requested(this);
+  }
+
+  /**
+   * @brief Request that this stop source (and all its children) stop.
+   */
+  constexpr auto request_stop() noexcept -> void { m_stop.store(1, std::memory_order_release); }
+
+  /**
+   * @brief Same as `request_stop`, but returns true if this is the first time stop has been requested.
+   */
+  [[nodiscard("You can use request_stop() if you don't need the return value")]]
+  constexpr auto race_request_stop() noexcept -> bool {
+    return m_stop.exchange(1, std::memory_order_release) == 0;
+  }
+
+ private:
+  /**
+   * @brief Test if any stop request has been made in the current chain.
+   *
+   * Safe to call with a null pointer, in which case it returns false.
+   */
+  [[nodiscard]]
+  friend constexpr auto deep_stop_requested(stop_source const *src) noexcept -> bool {
+    for (stop_source const *ptr = src; ptr != nullptr; ptr = ptr->m_parent) {
+      if (ptr->m_stop.load(std::memory_order_acquire) == 1) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  stop_source const *m_parent = nullptr; // next source up the chain; null for a root source
+  std::atomic<std::uint32_t> m_stop = 0; // 0 until request_stop()/race_request_stop() stores 1
+};
+} // namespace lf
diff --git a/src/core/task.cxx b/src/core/task.cxx
new file mode 100644
index 000000000..17a45eebd
--- /dev/null
+++ b/src/core/task.cxx
@@ -0,0 +1,46 @@
+export module libfork.core:task;
+
+import std;
+
+import libfork.utils;
+
+import :concepts_context;
+
+namespace lf {
+
+/**
+ * @brief A type returnable from libfork's async functions/coroutines.
+ *
+ * This requires that `T` is `void` or a `plain_object` type that is `std::movable`.
+ */
+export template <typename T>
+concept returnable = std::is_void_v<T> || (plain_object<T> && std::movable<T>);
+
+export template <worker_context> // Empty type tagged by a context; NOTE(review): intended use not shown here — confirm.
+struct env {
+  explicit constexpr env(key_t) noexcept {} // explicit, so an env is only built where a key_t is available
+};
+
+// Forward-declare promise_type so task can reference it as a pointer.
+template <returnable T, worker_context Context>
+struct promise_type;
+
+/**
+ * @brief The return type for libfork's async functions/coroutines; wraps a pointer to the coroutine's promise.
+ */
+export template <returnable T, worker_context Context>
+class task {
+ public:
+  using value_type = T;
+  using context_type = Context;
+
+  constexpr task(key_t, promise_type<T, Context> *promise) noexcept
+      : m_promise(promise) {}
+
+ private:
+  friend constexpr auto get(key_t, task t) noexcept -> promise_type<T, Context> * { return t.m_promise; }
+
+  promise_type<T, Context> *m_promise; // never destroyed by this class; retrieved through the key-gated get()
+};
+
+} // namespace lf
diff --git a/src/core/thread_locals.cxx b/src/core/thread_locals.cxx
new file mode 100644
index 000000000..67a278378
--- /dev/null
+++ b/src/core/thread_locals.cxx
@@ -0,0 +1,33 @@
+export module libfork.core:thread_locals;
+
+import libfork.utils;
+
+import :concepts_context;
+
+namespace lf {
+
+/**
+ * @brief Thread-local pointer to the current worker context; starts as null on every thread.
+ */
+template <worker_context Context>
+constinit inline thread_local Context *thread_local_context = nullptr;
+
+// TODO: implications of thread local on constexpr
+
+/**
+ * @brief A getter for the current worker context; the pointer is passed through `not_null` (debug null check).
+ */
+template <worker_context Context>
+constexpr auto get_tls_context() noexcept -> Context & {
+  return *not_null(thread_local_context<Context>);
+}
+
+/// @brief Fetch the stack owned by this thread's current worker context.
+/// The context itself is obtained through get_tls_context(), which checks for null in debug.
+template <worker_context Context>
+constexpr auto get_tls_stack() noexcept -> stack_t<Context> & {
+  Context &ctx = get_tls_context<Context>();
+  return ctx.stack();
+}
+
+} // namespace lf
diff --git a/src/exception.cpp b/src/exception.cpp
new file mode 100644
index 000000000..a05e60c0e
--- /dev/null
+++ b/src/exception.cpp
@@ -0,0 +1,20 @@
+#include <cstdio>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+namespace lf::impl {
+
+/**
+ * @brief Print a diagnostic to `stderr` and then call `std::terminate`.
+ *
+ * The diagnostic has the form `<thread-id> <file>:<line>: <message>`.
+ *
+ * @param message Description of the failure.
+ * @param file Source file name reported in the diagnostic.
+ * @param line Source line reported in the diagnostic.
+ */
+[[noreturn]]
+void terminate_with(char const *message, char const *file, int line) noexcept {
+  // Printing may throw; this function is noexcept and about to terminate anyway,
+  // so any exception from the print is swallowed.
+  LF_TRY {
+    std::println(stderr, "{} {}:{}: {}", std::this_thread::get_id(), file, line, message);
+  } LF_CATCH_ALL {
+    // Drop exceptions during termination
+  }
+  // TODO: can we get a stack trace here?
+  std::terminate();
+}
+
+} // namespace lf::impl
diff --git a/src/libfork.cxx b/src/libfork.cxx
new file mode 100644
index 000000000..bbc053d21
--- /dev/null
+++ b/src/libfork.cxx
@@ -0,0 +1,6 @@
+export module libfork;
+
+export import libfork.core;
+export import libfork.batteries;
+export import libfork.schedulers;
+export import libfork.algorithm;
diff --git a/src/schedulers/busy.cxx b/src/schedulers/busy.cxx
new file mode 100644
index 000000000..f7bc92955
--- /dev/null
+++ b/src/schedulers/busy.cxx
@@ -0,0 +1,142 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/compiler.hpp"
+export module libfork.schedulers:basic_busy_pool;
+
+import std;
+
+import libfork.utils;
+import libfork.core;
+import libfork.batteries;
+
+namespace lf {
+
+/**
+ * @brief Exception thrown by `basic_busy_pool`'s constructor when asked for zero workers.
+ */
+struct invalid_workers_error : std::exception {
+  [[nodiscard]]
+  constexpr auto what() const noexcept -> const char * override {
+    return "A thread pool must have at least one worker.";
+  }
+};
+
+/**
+ * @brief Selects the per-worker context used by `basic_busy_pool`.
+ *
+ * `mono` selects `mono_context`, `poly` selects `derived_poly_context`.
+ */
+export enum class pool_kind { mono, poly };
+
+/**
+ * @brief A thread pool whose workers spin looking for work (they never sleep).
+ *
+ * Each worker thread is paired with one context. A worker repeatedly:
+ *
+ * 1. Pops a task from the shared, mutex-protected queue of posted tasks and executes it, or
+ * 2. Makes a bounded number of attempts to steal from other, randomly chosen, workers.
+ *
+ * Threads are `std::jthread`s; destroying the pool requests stop and joins them.
+ *
+ * @tparam Kind Selects `mono_context` or `derived_poly_context` as the per-worker context.
+ * @tparam Stack Stack type forwarded to the context.
+ * @tparam Deque Deque policy forwarded to the context.
+ * @tparam Alloc Allocator type; currently accepted but unused (see the constructor TODO).
+ */
+export template <pool_kind Kind,
+                 worker_stack Stack,
+                 stealable_deque_policy Deque = adapt_deque<>,
+                 simple_allocator Alloc = std::allocator<std::byte>>
+class basic_busy_pool {
+
+  using context = std::conditional_t<     //
+      Kind == pool_kind::poly,            //
+      derived_poly_context<Stack, Deque>, //
+      mono_context<Stack, Deque>          //
+      >;
+
+ public:
+  using context_type = context::context_type;
+
+  // TODO: sleep when zero work
+
+  /**
+   * @brief Construct a pool with `n` contexts and start `n` worker threads.
+   *
+   * @param n Number of workers, must be at least one.
+   * @param alloc Currently ignored.
+   *
+   * Throws (via `LF_THROW`) `invalid_workers_error` if `n` is zero. If starting a
+   * thread fails, the threads already started are joined and the exception is rethrown.
+   */
+  explicit basic_busy_pool(std::size_t n = std::thread::hardware_concurrency(), Alloc const &alloc = Alloc())
+      : m_contexts(n) {
+
+    // TODO: propagate alloc to m_contexts, m_posted, etc.
+    (void)alloc;
+
+    if (n < 1) {
+      LF_THROW(invalid_workers_error{});
+    }
+
+    LF_TRY {
+      for (std::size_t id = 0; id < n; ++id) {
+        m_threads.emplace_back([this, id](std::stop_token stop) -> void {
+          worker(std::move(stop), id);
+        });
+      }
+    } LF_CATCH_ALL {
+      // Force joins before members (which threads reference) are destroyed.
+      join_all();
+      LF_RETHROW;
+    }
+  }
+
+  basic_busy_pool(basic_busy_pool const &) = delete;
+  basic_busy_pool(basic_busy_pool &&) = delete;
+
+  auto operator=(basic_busy_pool const &) -> basic_busy_pool & = delete;
+  auto operator=(basic_busy_pool &&) -> basic_busy_pool & = delete;
+
+  /**
+   * @brief Stops and joins all worker threads.
+   */
+  ~basic_busy_pool() { join_all(); }
+
+  /**
+   * @brief Enqueue a handle for execution by one of the workers.
+   */
+  void post(sched_handle<context_type> handle) {
+    // TODO: use a lock-free queue here
+    auto lock = std::unique_lock(m_mutex);
+    m_posted.push_back(handle);
+  }
+
+ private:
+  /**
+   * @brief Main loop of the worker with index `id`, runs until `stop` is requested.
+   */
+  void worker(std::stop_token stop, std::size_t id) {
+
+    LF_ASSUME(id < m_contexts.size());
+
+    context &ctx = m_contexts[id];
+
+    std::size_t const n = m_contexts.size();
+
+    std::default_random_engine rng(safe_cast<unsigned>(id + 1));
+
+    // Victims are drawn from the n - 1 other workers. With a single worker there
+    // are none (the distribution is then unused), so clamp to stop `n - 2` wrapping.
+    std::uniform_int_distribution<std::size_t> dist(0, n > 1 ? n - 2 : 0);
+
+    constexpr int k_steal_attempts = 1024;
+
+    while (!stop.stop_requested()) {
+
+      // The lock is released explicitly before executing, or at the end of the
+      // if-statement when the queue was empty.
+      if (auto lock = std::unique_lock(m_mutex); !m_posted.empty()) {
+        sched_handle task = m_posted.back();
+        m_posted.pop_back();
+        lock.unlock();
+        execute(static_cast<context_type &>(ctx), task);
+        continue;
+      }
+
+      if (n > 1) {
+        for (int i = 0; i < k_steal_attempts; ++i) {
+
+          std::size_t victim = dist(rng);
+
+          // Map [0, n - 2] onto [0, n - 1] \ {id} by skipping over our own index.
+          if (victim >= id) {
+            victim += 1;
+          }
+
+          LF_ASSUME(victim < n);
+          LF_ASSUME(victim != id);
+
+          if (auto result = m_contexts[victim].steal()) {
+            execute(static_cast<context_type &>(ctx), result);
+            // Return to the outer loop so the stop token and the posted queue
+            // are re-checked before stealing again.
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  void join_all() {
+    m_threads.clear(); // jthread calls stop and joins in destructor
+  }
+
+  std::vector<context> m_contexts;
+  std::vector<std::jthread> m_threads;
+  std::mutex m_mutex;
+  std::vector<sched_handle<context_type>> m_posted;
+};
+
+/**
+ * @brief A `basic_busy_pool` fixed to `pool_kind::mono`.
+ */
+export template <worker_stack Stack,
+                 stealable_deque_policy Deque = adapt_deque<>,
+                 simple_allocator Alloc = std::allocator<std::byte>>
+using mono_busy_pool = basic_busy_pool<pool_kind::mono, Stack, Deque, Alloc>;
+
+/**
+ * @brief A `basic_busy_pool` fixed to `pool_kind::poly`.
+ */
+export template <worker_stack Stack,
+                 stealable_deque_policy Deque = adapt_deque<>,
+                 simple_allocator Alloc = std::allocator<std::byte>>
+using poly_busy_pool = basic_busy_pool<pool_kind::poly, Stack, Deque, Alloc>;
+
+} // namespace lf
diff --git a/src/schedulers/inline.cxx b/src/schedulers/inline.cxx
new file mode 100644
index 000000000..c0d58bd5b
--- /dev/null
+++ b/src/schedulers/inline.cxx
@@ -0,0 +1,51 @@
+export module libfork.schedulers:inline_scheduler;
+
+import std;
+
+import libfork.core;
+
+import libfork.batteries;
+
+namespace lf {
+
+// TODO: think about initialization:
+// - do we need default initializable on stack/context?
+// - with allocators
+
+// TODO: Can we store the context directly in TLS?
+
+/**
+ * @brief `Base` models `worker_context` and `Derived` is derived from `Base`.
+ */
+template <typename Derived, typename Base>
+concept derived_context_from = worker_context<Base> && std::derived_from<Derived, Base>;
+
+/**
+ * @brief A type that names a context via its context typedef and derives from that context.
+ *
+ * Checked as `derived_context_from<Context, context_t<Context>>`.
+ */
+export template <typename Context>
+concept derived_worker_context =
+    has_context_typedef<Context> && derived_context_from<Context, context_t<Context>>;
+
+/**
+ * @brief A scheduler that owns a single context and executes posted handles directly.
+ *
+ * `post()` does not queue anything: it calls `execute` on the owned context
+ * (viewed as its `context_type`) with the handle it was given.
+ *
+ * @tparam Context The concrete context type stored by the scheduler.
+ */
+export template <derived_worker_context Context>
+class inline_scheduler {
+  Context m_context;
+
+ public:
+  using context_type = Context::context_type;
+
+  inline_scheduler() = default;
+
+  /**
+   * @brief Construct the owned context in place from `args`.
+   *
+   * Explicit only for the single-argument form.
+   */
+  template <typename... Args>
+    requires std::constructible_from<Context, Args...>
+  explicit(sizeof...(Args) == 1)
+      inline_scheduler(Args &&...args) noexcept(std::is_nothrow_constructible_v<Context, Args...>)
+      : m_context(std::forward<Args>(args)...) {}
+
+  /**
+   * @brief Execute `handle` on the owned context.
+   */
+  void post(lf::sched_handle<context_type> handle) {
+    context_type &base = m_context;
+    execute(base, handle);
+  }
+};
+
+/**
+ * @brief An `inline_scheduler` over `mono_context<Stack, Deque>`.
+ */
+export template <worker_stack Stack, deque_policy Deque>
+using mono_inline_scheduler = inline_scheduler<mono_context<Stack, Deque>>;
+
+/**
+ * @brief An `inline_scheduler` over `derived_poly_context<Stack, Deque>`.
+ */
+export template <worker_stack Stack, deque_policy Deque>
+using poly_inline_scheduler = inline_scheduler<derived_poly_context<Stack, Deque>>;
+
+} // namespace lf
diff --git a/src/schedulers/schedulers.cxx b/src/schedulers/schedulers.cxx
new file mode 100644
index 000000000..0ab5c0a27
--- /dev/null
+++ b/src/schedulers/schedulers.cxx
@@ -0,0 +1,4 @@
+export module libfork.schedulers;
+
+export import :inline_scheduler;
+export import :basic_busy_pool;
diff --git a/src/utils/concepts.cxx b/src/utils/concepts.cxx
new file mode 100644
index 000000000..ad1d7a11b
--- /dev/null
+++ b/src/utils/concepts.cxx
@@ -0,0 +1,65 @@
+export module libfork.utils:concepts;
+
+import std;
+
+namespace lf {
+
+// =========== Atomic related concepts =========== //
+
+/**
+ * @brief An object type that is not a reference.
+ *
+ * Functions, references and `void` are rejected; cv-qualified object types are accepted.
+ */
+export template <typename T>
+concept plain_object = std::is_object_v<T> && std::same_as<T, std::remove_reference_t<T>>;
+
+/**
+ * @brief Verify a type is suitable for use with `std::atomic`
+ *
+ * This requires a `plain_object` that is `TriviallyCopyable` and is copy/move
+ * constructible as well as copy/move assignable.
+ */
+export template <typename T>
+concept atomicable = plain_object<T> &&                 //
+                     std::is_trivially_copyable_v<T> && //
+                     std::is_copy_constructible_v<T> && //
+                     std::is_move_constructible_v<T> && //
+                     std::is_copy_assignable_v<T> &&    //
+                     std::is_move_assignable_v<T>;      //
+
+/**
+ * @brief A concept that verifies a type is lock-free when used with `std::atomic`.
+ *
+ * Uses the compile-time constant `std::atomic<T>::is_always_lock_free`; `atomicable<T>`
+ * is checked first so `std::atomic<T>` is only named for suitable types.
+ */
+export template <typename T>
+concept lock_free = atomicable<T> && std::atomic<T>::is_always_lock_free;
+
+// ========== Specialization ========== //
+
+// Primary template: `T` is not a specialization of `Template`.
+template <typename T, template <typename...> typename Template>
+struct is_specialization_of : std::false_type {};
+
+// Partial specialization: matches when `T` is exactly `Template<Args...>`.
+template <template <typename...> typename Template, typename... Args>
+struct is_specialization_of<Template<Args...>, Template> : std::true_type {};
+
+/**
+ * @brief Test if `T` is a specialization of the template `Template`.
+ *
+ * cv-qualifiers and references are stripped from `T` before testing. Only templates
+ * whose parameters are all types can be passed as `Template`.
+ */
+export template <typename T, template <typename...> typename Template>
+concept specialization_of = is_specialization_of<std::remove_cvref_t<T>, Template>::value;
+
+// ==== Allocators
+
+/**
+ * @brief Lifted from the c++26 exposition only requirement.
+ *
+ * `T` must be copy constructible and equality comparable, `allocate(n)` must return
+ * something that dereferences to `T::value_type &`, and `deallocate(ptr, n)` must be
+ * well-formed for a pointer obtained from `allocate`.
+ */
+export template <class T>
+concept simple_allocator =
+    std::copy_constructible<T> && std::equality_comparable<T> && requires (T alloc, std::size_t n) {
+      { *alloc.allocate(n) } -> std::same_as<typename T::value_type &>;
+      { alloc.deallocate(alloc.allocate(n), n) };
+    };
+
+/**
+ * @brief Semantically `T` must be a std:: Allocator whose `value_type` is `U`.
+ *
+ * Doesn't specify the full API just 'simple-allocator' exposition only concept.
+ */
+export template <class T, typename U>
+concept allocator_of = simple_allocator<T> && std::same_as<typename T::value_type, U>;
+
+} // namespace lf
diff --git a/src/utils/constants.cxx b/src/utils/constants.cxx
new file mode 100644
index 000000000..e5f303761
--- /dev/null
+++ b/src/utils/constants.cxx
@@ -0,0 +1,24 @@
+export module libfork.utils:constants;
+
+import std;
+
+namespace lf {
+
+// Sizes in bytes (binary multiples: 1024 and 1024 * 1024).
+export constexpr std::size_t k_kilobyte = 1024;
+export constexpr std::size_t k_megabyte = 1024 * k_kilobyte;
+
+// Largest value of a 16-bit unsigned integer, held in a 32-bit type.
+export constexpr std::uint32_t k_u16_max = std::numeric_limits<std::uint16_t>::max();
+
+// Alignment guaranteed by the default (non-aligned) `operator new`.
+export constexpr std::size_t k_new_align = __STDCPP_DEFAULT_NEW_ALIGNMENT__;
+export constexpr std::size_t k_page_size = 4096; // 4 KiB on most systems.
+
+// GCC warns (-Winterference-size) when the interference size is used; silence it
+// for this one declaration only.
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Winterference-size"
+#endif // GCC only (not clang / Intel)
+export constexpr std::size_t k_cache_line = std::hardware_destructive_interference_size;
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+  #pragma GCC diagnostic pop
+#endif // GCC only (not clang / Intel)
+
+} // namespace lf
diff --git a/src/utils/defer.cxx b/src/utils/defer.cxx
new file mode 100644
index 000000000..b906035b3
--- /dev/null
+++ b/src/utils/defer.cxx
@@ -0,0 +1,55 @@
+module;
+#include "libfork/__impl/compiler.hpp"
+export module libfork.utils:defer;
+
+import std;
+
+namespace lf {
+
+/**
+ * @brief A scope guard in the spirit of Golang's defer.
+ *
+ * The wrapped callable is invoked when the guard is destroyed.
+ *
+ * \rst
+ *
+ * Use like:
+ *
+ * .. code::
+ *
+ *    auto * ptr = c_api_init();
+ *
+ *    defer _ = [&ptr] noexcept {
+ *      c_api_clean_up(ptr);
+ *    };
+ *
+ *    // Code that may throw
+ *
+ * \endrst
+ */
+export template <class Fn>
+  requires std::is_nothrow_invocable_v<Fn> && std::is_object_v<Fn>
+class [[nodiscard("Defer will execute unless bound to a name!")]] defer {
+  [[no_unique_address]]
+  Fn m_fn;
+
+ public:
+  /**
+   * @brief Store the callable that will run at destruction.
+   *
+   * @param fn Nullary invocable, moved into the guard.
+   */
+  constexpr defer(Fn &&fn) noexcept(std::is_nothrow_constructible_v<Fn, Fn &&>)
+      : m_fn(std::move(fn)) {}
+
+  /**
+   * @brief Invokes the stored callable (as an rvalue).
+   */
+  LF_FORCE_INLINE constexpr ~defer() noexcept { std::invoke(std::move(m_fn)); }
+
+  // A guard is tied to its scope: neither copyable nor movable.
+  defer(defer const &) = delete;
+  defer(defer &&) = delete;
+  auto operator=(defer const &) -> defer & = delete;
+  auto operator=(defer &&) -> defer & = delete;
+};
+
+} // namespace lf
diff --git a/src/utils/tuple.cxx b/src/utils/tuple.cxx
new file mode 100644
index 000000000..977abb144
--- /dev/null
+++ b/src/utils/tuple.cxx
@@ -0,0 +1,101 @@
+module;
+#include "libfork/__impl/utils.hpp"
+export module libfork.utils:tuple;
+
+import std;
+
+namespace lf {
+
+// TODO: Replace with reflection tuple?
+
+//========== Copy Qualifiers =============//
+
+/**
+ * Implementation of `copy_cvref_t`: applies the qualifiers of `T` to `U` one step at a
+ * time (const, then volatile, then the reference kind).
+ */
+template <typename T, typename U>
+struct copy_cvref {
+ private:
+  using unref = std::remove_reference_t<T>;
+
+  static constexpr bool k_const = std::is_const_v<unref>;
+  static constexpr bool k_volatile = std::is_volatile_v<unref>;
+  static constexpr bool k_lref = std::is_lvalue_reference_v<T>;
+  static constexpr bool k_rref = std::is_rvalue_reference_v<T>;
+
+  using with_c = std::conditional_t<k_const, std::add_const_t<U>, U>;
+  using with_cv = std::conditional_t<k_volatile, std::add_volatile_t<with_c>, with_c>;
+  using with_lref = std::conditional_t<k_lref, std::add_lvalue_reference_t<with_cv>, with_cv>;
+  using with_ref = std::conditional_t<k_rref, std::add_rvalue_reference_t<with_lref>, with_lref>;
+
+ public:
+  using type = with_ref;
+};
+
+/**
+ * Copy the const/volatile/reference qualifiers from `From` to `To`.
+ */
+export template <typename From, typename To>
+using copy_cvref_t = copy_cvref<From, To>::type;
+
+// Holds one tuple element. The index `I` makes each base distinct, so a tuple can
+// inherit from several leaves that share the same element type.
+template <int I, typename T>
+struct tuple_leaf {
+  [[no_unique_address]]
+  T elem;
+};
+
+// In GCC 15 name mangling is not implemented for function signatures
+// with 'deducing this' yet, so we fall back to the old implementation.
+//
+// `INDEX_HACK(I, Pack)` names the I-th type of `Pack`: through the recursive
+// `index` metafunction on GCC (versions <= 16), through pack indexing elsewhere.
+#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ <= 16)
+
+// Recursive type-at-index: peel one type off the front until the index reaches zero.
+template <std::size_t I, typename... Ts>
+struct index;
+
+template <typename T, typename... Ts>
+struct index<0, T, Ts...> : std::type_identity<T> {};
+
+template <std::size_t I, typename T, typename... Ts>
+struct index<I, T, Ts...> : index<I - 1, Ts...> {};
+
+  #define INDEX_HACK(I, Pack) typename index<I, Pack>::type
+
+#else
+  #define INDEX_HACK(I, Pack) Pack[I]
+#endif
+
+//========== Tuple =============//
+
+// Implementation base of `tuple`: inherits one `tuple_leaf<I, T>` per element.
+template <typename, typename...>
+struct tuple_impl;
+
+template <std::size_t... Is, typename... Ts>
+struct tuple_impl<std::index_sequence<Is...>, Ts...> : tuple_leaf<Is, Ts>... {
+  /**
+   * @brief Access element `I`, carrying over the cv/ref qualifiers of `self`.
+   *
+   * Only the return type in the signature goes through `INDEX_HACK`; the body
+   * uses pack indexing directly.
+   */
+  template <std::size_t I, typename Self>
+  [[nodiscard]]
+  constexpr auto get(this Self &&self) noexcept -> copy_cvref_t<Self &&, INDEX_HACK(I, Ts...)> {
+    return static_cast<copy_cvref_t<Self &&, Ts...[I]>>(LF_FWD(self).template tuple_leaf<I, Ts...[I]>::elem);
+  }
+
+  /**
+   * @brief Invoke `fn` with every element, each obtained via `get<Is>()` on the forwarded `self`.
+   *
+   * NOTE(review): `LF_HOF` presumably supplies the noexcept-specifier, return type
+   * and body from the expression - confirm in `libfork/__impl/utils.hpp`.
+   */
+  [[nodiscard]]
+  constexpr auto apply(this auto &&self, auto &&fn)
+      LF_HOF(std::invoke(LF_FWD(fn), LF_FWD(self).template get<Is>()...))
+};
+
+/**
+ * @brief A minimal non-recursive tuple-as-aggregate implementation.
+ *
+ * This is a very stripped back tuple that only:
+ *
+ * - Provides `.get<I>()` member function.
+ * - Provides an `apply(fn)` member function.
+ * - Supports structured bindings.
+ *
+ * This has the advantage of significantly faster compilation times
+ * compared to the standard library's `std::tuple`. In addition it is an
+ * aggregate type hence, is trivially copyable/constructible/destructible
+ * conditional on the types it contains. It even works as an NTTP.
+ */
+export template <typename... Ts>
+struct tuple final : tuple_impl<std::index_sequence_for<Ts...>, Ts...> {};
+
+// Deduction guide: `Ts &&` is a forwarding reference here, so lvalue arguments
+// deduce reference element types (`T &`) while rvalue arguments deduce `T`.
+template <typename... Ts>
+tuple(Ts &&...) -> tuple<Ts...>;
+
+} // namespace lf
+
+// Tuple-protocol specializations: together with the `get<I>()` member these
+// enable structured bindings for `lf::tuple`.
+template <typename... Ts>
+struct std::tuple_size<lf::tuple<Ts...>> : std::integral_constant<std::size_t, sizeof...(Ts)> {};
+
+template <std::size_t I, typename... Ts>
+struct std::tuple_element<I, lf::tuple<Ts...>> : std::type_identity<Ts... [I]> {};
diff --git a/src/utils/uninitialized.cxx b/src/utils/uninitialized.cxx
new file mode 100644
index 000000000..36888acb3
--- /dev/null
+++ b/src/utils/uninitialized.cxx
@@ -0,0 +1,43 @@
+
+export module libfork.utils:uninitialized;
+
+import std;
+
+import :concepts;
+
+namespace lf {
+
+#if defined(__cpp_trivial_union) && __cpp_trivial_union >= 202306L
+  #pragma message("TODO: __cpp_trivial_union is available — remove union workaround")
+#endif
+
+/**
+ * @brief Suitably sized and aligned raw storage for a `T` with a manually managed lifetime.
+ *
+ * No `T` is created on construction and none is destroyed on destruction (the
+ * destructor is defaulted). The user must pair `construct()` with `destroy()` and
+ * only access the value in between.
+ */
+export template <plain_object T>
+class uninitialized {
+ public:
+  constexpr uninitialized() = default;
+
+  constexpr uninitialized(uninitialized const &) = delete;
+  constexpr uninitialized(uninitialized &&) = delete;
+
+  constexpr auto operator=(uninitialized const &) -> uninitialized & = delete;
+  constexpr auto operator=(uninitialized &&) -> uninitialized & = delete;
+
+  constexpr ~uninitialized() = default;
+
+  /**
+   * @brief Pointer to the stored object.
+   *
+   * `auto{buffer}` decays the array to a `std::byte *`, which is reinterpreted as
+   * a `T *` and laundered to refer to the object living in the buffer.
+   */
+  auto operator->() noexcept -> T * { return std::launder(std::bit_cast<T *>(auto{buffer})); }
+
+  /**
+   * @brief Reference to the stored object.
+   */
+  auto operator*() noexcept -> T & { return *this->operator->(); }
+
+  /**
+   * @brief Construct a `T` in the buffer from `args` using placement new.
+   *
+   * @return A reference to the newly constructed object.
+   */
+  template <class... Args>
+    requires std::constructible_from<T, Args...>
+  auto construct(Args &&...args) noexcept(std::is_nothrow_constructible_v<T, Args...>) -> T & {
+    return *::new (static_cast<void *>(buffer)) T(std::forward<Args>(args)...);
+  }
+
+  /**
+   * @brief Run the destructor of the stored object.
+   */
+  void destroy() noexcept { (**this).~T(); }
+
+ private:
+  alignas(T) std::byte buffer[sizeof(T)];
+};
+
+} // namespace lf
diff --git a/src/utils/utility.cxx b/src/utils/utility.cxx
new file mode 100644
index 000000000..31ff3fd6d
--- /dev/null
+++ b/src/utils/utility.cxx
@@ -0,0 +1,88 @@
+module;
+#include "libfork/__impl/assume.hpp"
+#include "libfork/__impl/exception.hpp"
+export module libfork.utils:utility;
+
+import std;
+
+namespace lf {
+
+/**
+ * @brief Safe integral cast, will terminate if the cast would overflow in debug.
+ *
+ * A bound is only checked (with `LF_ASSUME`) when `To` cannot represent the
+ * corresponding extreme of `From`; otherwise the check is compiled out.
+ */
+export template <std::integral To, std::integral From>
+[[nodiscard]]
+constexpr auto safe_cast(From val) noexcept -> To {
+
+  using to_limits = std::numeric_limits<To>;
+  using from_limits = std::numeric_limits<From>;
+
+  // Lower bound: relevant only if `From` can hold values below `To`'s minimum.
+  if constexpr (std::cmp_less(from_limits::min(), to_limits::min())) {
+    LF_ASSUME(static_cast<From>(to_limits::min()) <= val);
+  }
+
+  // Upper bound: relevant only if `From` can hold values above `To`'s maximum.
+  if constexpr (std::cmp_greater(from_limits::max(), to_limits::max())) {
+    LF_ASSUME(static_cast<From>(to_limits::max()) >= val);
+  }
+
+  return static_cast<To>(val);
+}
+
+/**
+ * @brief Assume a pointer is not null, otherwise terminates the program in debug mode.
+ *
+ * With `NDEBUG` defined a null pointer reaches `LF_UNREACHABLE()`; otherwise
+ * `impl::terminate_with` is called with the caller's file and line.
+ *
+ * @param ptr The pointer to check.
+ * @param loc Defaulted to the call site; used for the diagnostic.
+ * @return `ptr`, unchanged.
+ */
+export template <typename T>
+[[nodiscard]]
+constexpr auto
+not_null(T *ptr, std::source_location const loc = std::source_location::current()) noexcept -> T * {
+  if (!ptr) {
+#ifdef NDEBUG
+    LF_UNREACHABLE();
+#else
+    impl::terminate_with("Null pointer dereferenced!", loc.file_name(), safe_cast<int>(loc.line()));
+#endif
+  }
+  return ptr;
+}
+
+export class key_t;
+
+export constexpr auto key() noexcept -> key_t;
+
+/**
+ * @brief A pass-key: the default constructor is private, so the only way to get one is
+ * via the friend function `key()`.
+ *
+ * NOTE(review): `key()` is declared `export` in this partition. The original wording
+ * ("un-exported") presumably means the function is not reachable through the
+ * umbrella `libfork` module - confirm.
+ */
+export class key_t {
+ public:
+  friend constexpr auto key() noexcept -> key_t { return {}; }
+
+ private:
+  constexpr key_t() = default;
+};
+
+/**
+ * @brief Check whether the address held by `ptr` is a multiple of `Align`.
+ *
+ * Works with fancy pointers (via `std::to_address`) and only inspects the
+ * address, so no object needs to exist there.
+ *
+ * @tparam Align The required alignment, must be a power of two.
+ */
+export template <std::size_t Align, typename Ptr>
+  requires (std::has_single_bit(Align))
+[[nodiscard]]
+auto is_sufficiently_aligned(Ptr const &ptr) noexcept -> bool {
+  constexpr std::size_t low_bits = Align - 1;
+  auto const address = std::bit_cast<std::uintptr_t>(std::to_address(ptr));
+  return (address & low_bits) == 0;
+}
+
+/**
+ * @brief Round `size` up to the next multiple of `Align`.
+ *
+ * @tparam Align The multiple to round to, must be a power of two.
+ * @return The smallest multiple of `Align` that is not less than `size`.
+ */
+export template <std::size_t Align>
+  requires (std::has_single_bit(Align))
+[[nodiscard]]
+constexpr auto round_to_multiple(std::size_t size) noexcept -> std::size_t {
+  constexpr std::size_t low_bits = Align - 1;
+  return (size + low_bits) & ~low_bits;
+}
+
+} // namespace lf
diff --git a/src/utils/utils.cxx b/src/utils/utils.cxx
new file mode 100644
index 000000000..bb9096f08
--- /dev/null
+++ b/src/utils/utils.cxx
@@ -0,0 +1,8 @@
+export module libfork.utils;
+
+export import :utility;
+export import :constants;
+export import :tuple;
+export import :concepts;
+export import :defer;
+export import :uninitialized;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 000000000..250f01026
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Build script for the libfork test executable (Catch2 based).
+cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR)
+
+project(libfork_test LANGUAGES CXX)
+
+# ---- Dependencies ----
+
+find_package(Catch2 REQUIRED)
+
+# --------- Tests ---------
+
+# Every .cpp under src/ is a test source; CONFIGURE_DEPENDS re-checks the glob
+# at build time so newly added files are picked up.
+file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")
+
+add_executable(libfork_test ${TEST_SOURCES})
+target_link_libraries(libfork_test PRIVATE libfork::libfork Catch2::Catch2WithMain)
+target_compile_features(libfork_test PRIVATE cxx_std_26)
+
+# When Catch2 was built from source its CMake helpers live in <source>/extras;
+# make them findable by include(Catch) below.
+if(DEFINED Catch2_SOURCE_DIR)
+  list(APPEND CMAKE_MODULE_PATH ${Catch2_SOURCE_DIR}/extras)
+endif()
+
+include(Catch)
+
+# Register each Catch2 TEST_CASE as a separate CTest test.
+catch_discover_tests(libfork_test)
diff --git a/test/src/adaptors.cpp b/test/src/adaptors.cpp
new file mode 100644
index 000000000..435404443
--- /dev/null
+++ b/test/src/adaptors.cpp
@@ -0,0 +1,77 @@
+#include <catch2/catch_test_macros.hpp>
+
+import std;
+
+import libfork;
+
+using namespace lf;
+
+TEST_CASE("Constructible", "[adaptors]") {
+  // adapt_vector's default ctor only stores the allocator; no allocation occurs,
+  // so it should be noexcept when the allocator's default ctor is noexcept.
+  STATIC_REQUIRE(std::is_nothrow_default_constructible_v<adapt_vector<>>);
+
+  // adapt_deque's default ctor delegates to a capacity-taking ctor that
+  // allocates an internal buffer, so it must NOT be noexcept — bad_alloc must
+  // propagate instead of invoking std::terminate.
+  STATIC_REQUIRE(!std::is_nothrow_default_constructible_v<adapt_deque<>>);
+}
+
+TEST_CASE("adapt_deque: default constructor allocates", "[adaptors]") {
+  adapt_deque<> d;
+  // Freshly constructed: steal returns an empty handle.
+  REQUIRE_FALSE(d.steal());
+  REQUIRE_FALSE(d.pop());
+}
+
+TEST_CASE("adapt_vector: default constructor does not allocate", "[adaptors]") {
+  adapt_vector<> v;
+  REQUIRE_FALSE(v.pop());
+}
+
+namespace {
+
+using test_stack = geometric_stack<>;
+using test_deque = adapt_deque<>;
+
+// Compile/construct smoke test: an empty first tuple and a single std::size_t in
+// the second. Presumably the first tuple feeds the stack and the second the
+// deque - confirm against the context constructor.
+TEST_CASE("derived_poly_context: piecewise construction compiles", "[contexts]") {
+  derived_poly_context<test_stack, test_deque> ctx{
+      std::piecewise_construct,
+      std::tuple<>{},
+      std::tuple<std::size_t>{1024},
+  };
+
+  std::ignore = ctx;
+}
+
+// As the "compiles" case, but the first tuple now carries an allocator, which
+// (per the test name) is forwarded to the stack.
+TEST_CASE("derived_poly_context: piecewise construction forwards stack args", "[contexts]") {
+  derived_poly_context<test_stack, test_deque> ctx{
+      std::piecewise_construct,
+      std::tuple<std::allocator<std::byte>>{},
+      std::tuple<std::size_t>{1024},
+  };
+
+  std::ignore = ctx;
+}
+
+// Compile/construct smoke test for mono_context: empty first tuple, a single
+// std::size_t in the second.
+TEST_CASE("mono_context: piecewise construction compiles", "[contexts]") {
+  mono_context<test_stack, test_deque> ctx{
+      std::piecewise_construct,
+      std::tuple<>{},
+      std::tuple<std::size_t>{1024},
+  };
+
+  std::ignore = ctx;
+}
+
+// As the mono_context "compiles" case, but the first tuple carries an allocator,
+// which (per the test name) is forwarded to the stack.
+TEST_CASE("mono_context: piecewise construction forwards stack args", "[contexts]") {
+  mono_context<test_stack, test_deque> ctx{
+      std::piecewise_construct,
+      std::tuple<std::allocator<std::byte>>{},
+      std::tuple<std::size_t>{1024},
+  };
+
+  std::ignore = ctx;
+}
+
+} // namespace
diff --git a/test/src/cancel.cpp b/test/src/cancel.cpp
new file mode 100644
index 000000000..12260c62b
--- /dev/null
+++ b/test/src/cancel.cpp
@@ -0,0 +1,773 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+
+// Exhaustive tests for all stop-token paths in promise.cxx.
+//
+// Stop check-points in promise.cxx:
+//
+//   A. awaitable::await_suspend (StopToken=true):
+//        child->stop_requested() → child not spawned (fork/call via child_scope_ops)
+//
+//   B. awaitable::await_suspend (StopToken=false):
+//        parent.promise().frame.stop_requested() → child not spawned (fork/call via scope_ops)
+//
+//   C. final_suspend_full / final_suspend_trailing:
+//        parent->stop_requested() after winning join race → exception dropped,
+//        iterative ancestor cleanup (exercises concurrent/stolen path)
+//
+//   D. join_awaitable::await_ready:
+//        stop_requested() forces suspension even when steals==0
+//
+//   E. join_awaitable::await_suspend:
+//        stop_requested() after winning join race → handle_stop()
+//
+//   F. handle_stop (exception_bit set on stopped frame):
+//        exception dropped, not propagated to caller
+//
+//   G. Nested child_scope chain propagation:
+//        inner child_scope inherits parent's stop token; stopping the outer
+//        source propagates through the chain to the inner scope.
+//
+//   H. Stoppable receiver / pre-cancelled root:
+//        recv_state<T, true> + receiver::request_stop() immediately after
+//        schedule() — covers the goto-cleanup fast path in root.cxx on
+//        schedulers where the task has not yet begun running.  Racy in
+//        principle, so the test only asserts completion, not that the body
+//        was skipped.
+//
+//   I. Stress tests: concurrent cancellation under contention.
+
+namespace {
+
+// ============================================================
+// Basic helper tasks
+// ============================================================
+
+// Increments `count` by one and returns the value it held *before* the
+// increment (the result of fetch_add).
+struct count_up {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::atomic<int> &count) -> lf::task<int, Context> {
+    co_return count.fetch_add(1);
+  }
+};
+
+// Increments `count` by one; produces no value.
+struct count_up_void {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::atomic<int> &count) -> lf::task<void, Context> {
+    count.fetch_add(1);
+    co_return;
+  }
+};
+
+// ============================================================
+// A. StopToken=true: child-specific cancellation via child_scope_ops.
+//
+//    child_scope_ops binds its stop_source as StopToken=true on every fork/call.
+//    Calling sc.request_stop() before launching exercises
+//    awaitable::await_suspend's StopToken=true branch.
+// ============================================================
+
+// Path A with call_drop: stop is requested on the child scope before the call.
+// Yields true iff the child never ran (count is still 0).
+template <typename Context>
+auto test_call_drop_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto sc = co_await lf::child_scope();
+  sc.request_stop();
+  co_await sc.call_drop(count_up_void{}, count);
+  co_await sc.join();
+  co_return count.load() == 0;
+}
+
+// Path A with call: stop is requested before the call. Yields true iff the
+// child never ran (count is 0) and `result` keeps its sentinel value 99.
+template <typename Context>
+auto test_call_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  int result = 99;
+  auto sc = co_await lf::child_scope();
+  sc.request_stop();
+  co_await sc.call(&result, count_up{}, count);
+  co_await sc.join();
+  co_return result == 99 && count.load() == 0;
+}
+
+// Path A with fork_drop: stop is requested before the fork.
+// Yields true iff the child never ran (count is still 0).
+template <typename Context>
+auto test_fork_drop_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto sc = co_await lf::child_scope();
+  sc.request_stop();
+  co_await sc.fork_drop(count_up_void{}, count);
+  co_await sc.join();
+  co_return count.load() == 0;
+}
+
+// Path A with fork: stop is requested before the fork. Yields true iff the
+// child never ran (count is 0) and `result` keeps its sentinel value 99.
+template <typename Context>
+auto test_fork_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  int result = 99;
+  auto sc = co_await lf::child_scope();
+  sc.request_stop();
+  co_await sc.fork(&result, count_up{}, count);
+  co_await sc.join();
+  co_return result == 99 && count.load() == 0;
+}
+
+// Control for path A: no stop is requested, so the called child runs once.
+// `result` is 0 because count_up returns the counter's previous value.
+template <typename Context>
+auto test_call_not_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  int result = 0;
+  auto sc = co_await lf::child_scope();
+  co_await sc.call(&result, count_up{}, count);
+  co_await sc.join();
+  co_return result == 0 && count.load() == 1;
+}
+
+// Control for path A: no stop is requested, so the forked child runs once.
+// `result` is 0 because count_up returns the counter's previous value.
+template <typename Context>
+auto test_fork_not_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  int result = 0;
+  auto sc = co_await lf::child_scope();
+  co_await sc.fork(&result, count_up{}, count);
+  co_await sc.join();
+  co_return result == 0 && count.load() == 1;
+}
+
+// Path A, repeated: three forks through an already-stopped scope.
+// Yields true iff none of them ran (count is still 0).
+template <typename Context>
+auto test_multiple_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto sc = co_await lf::child_scope();
+  sc.request_stop();
+  co_await sc.fork_drop(count_up_void{}, count);
+  co_await sc.fork_drop(count_up_void{}, count);
+  co_await sc.fork_drop(count_up_void{}, count);
+  co_await sc.join();
+  co_return count.load() == 0;
+}
+
+// Two independent child scopes: only `sc_skip` is stopped. Forks through
+// `sc_run` execute, the fork through `sc_skip` does not, so count ends at 2.
+template <typename Context>
+auto test_mixed_cancel(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto sc_run = co_await lf::child_scope();
+  auto sc_skip = co_await lf::child_scope();
+  sc_skip.request_stop();
+  co_await sc_run.fork_drop(count_up_void{}, count);  // runs
+  co_await sc_skip.fork_drop(count_up_void{}, count); // skipped
+  co_await sc_run.fork_drop(count_up_void{}, count);  // runs
+  co_await sc_run.join();
+  co_await sc_skip.join();
+  co_return count.load() == 2;
+}
+
+// ============================================================
+// B. StopToken=false: parent frame stop propagation.
+//
+//    An inner task receives a stop_source& that IS its own frame's stop
+//    source (bound via child_scope_ops::call_drop / StopToken=true).  It calls
+//    request_stop() on it, making its own stop_requested() return true, then
+//    tries to launch sub-tasks via scope_ops (StopToken=false).  Those are
+//    skipped because parent.frame.stop_requested() is true (path B).
+//    At join, handle_stop fires (paths D+E).
+// ============================================================
+
+// Requests stop on `my_cancel` (expected to be the stop source governing this
+// very task), then tries to launch children through a plain scope. Neither
+// child should run, and nothing after the join should execute.
+struct inner_call_after_self_cancel {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, lf::stop_source &my_cancel, std::atomic<int> &count)
+      -> lf::task<void, Context> {
+    my_cancel.request_stop(); // make this frame's stop_requested() == true
+    auto sc = co_await lf::scope();
+    co_await sc.call_drop(count_up_void{}, count); // StopToken=false: stop requested → skip
+    co_await sc.fork_drop(count_up_void{}, count); // StopToken=false: stop requested → skip
+    co_await sc.join();                            // paths D+E: join fires handle_stop
+    count.fetch_add(100);                          // must not be reached
+  }
+};
+
+// Path B driver: runs inner_call_after_self_cancel under a child scope and hands
+// it that scope as its stop source. Yields true iff count is untouched (0), i.e.
+// no child ran and the code after the inner join was not reached.
+template <typename Context>
+auto test_call_parent_stop_source(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto outer_sc = co_await lf::child_scope();
+  // Pass the scope's stop_source by reference so the inner task can cancel it.
+  co_await outer_sc.call_drop(inner_call_after_self_cancel{}, outer_sc, count);
+  co_await outer_sc.join();
+  co_return count.load() == 0;
+}
+
+// ============================================================
+// C/D/E. Concurrent cancellation: final_suspend + join interaction.
+// ============================================================
+
+// A child task that cancels a stop_source then completes normally.
+// It increments `count` first, so callers can tell that it ran.
+struct cancel_source {
+  template <typename Context>
+  static auto
+  operator()(lf::env<Context>, lf::stop_source &src, std::atomic<int> &count) -> lf::task<void, Context> {
+    count.fetch_add(1);
+    src.request_stop();
+    co_return;
+  }
+};
+
+// Forks a child that requests stop on `my_cancel` (expected to govern this
+// task), then joins. The join should observe the stop, so the +100 after it
+// must never happen.
+struct inner_fork_then_cancel_at_join {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, lf::stop_source &my_cancel, std::atomic<int> &count)
+      -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    co_await sc.fork_drop(cancel_source{}, my_cancel, count);
+    co_await sc.join();   // stop_requested() after child requests stop → handle_stop
+    count.fetch_add(100); // must not be reached
+  }
+};
+
+// Driver for inner_fork_then_cancel_at_join. Yields true iff count is exactly 1:
+// the cancelling child ran once and the code after the inner join did not.
+template <typename Context>
+auto test_fork_cancel_at_join(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto outer_sc = co_await lf::child_scope();
+  co_await outer_sc.call_drop(inner_fork_then_cancel_at_join{}, outer_sc, count);
+  co_await outer_sc.join();
+  co_return count.load() == 1;
+}
+
+// ============================================================
+// F. Exception + cancellation interaction.
+// ============================================================
+
+#if LF_COMPILER_EXCEPTIONS
+
+// A task that always throws std::runtime_error. The unreachable co_return is
+// what makes the function a coroutine.
+struct just_throw {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<void, Context> {
+    throw std::runtime_error("test exception");
+    co_return;
+  }
+};
+
+// Forks a throwing child without any stop being requested; the join is
+// expected to rethrow the child's exception.
+struct inner_forks_throwing {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    co_await sc.fork_drop(just_throw{});
+    co_await sc.join(); // not cancelled → rethrow
+    co_return;
+  }
+};
+
+template <typename Context>
+auto test_exception_propagates(lf::env<Context>) -> lf::task<void, Context> {
+  auto outer_sc = co_await lf::child_scope();
+  co_await outer_sc.call_drop(inner_forks_throwing{});
+  co_await outer_sc.join();
+}
+
+struct cancel_source_and_throw {
+  template <typename Context>
+  static auto
+  operator()(lf::env<Context>, lf::stop_source &src, std::atomic<int> &count) -> lf::task<void, Context> {
+    count.fetch_add(1);
+    src.request_stop();
+    throw std::runtime_error("should be dropped");
+    co_return;
+  }
+};
+
+struct inner_cancel_and_throw {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, lf::stop_source &my_cancel, std::atomic<int> &count)
+      -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    co_await sc.fork_drop(cancel_source_and_throw{}, my_cancel, count);
+    co_await sc.join();   // stop requested + exception → handle_stop drops exception
+    count.fetch_add(100); // must not be reached
+  }
+};
+
+template <typename Context>
+auto test_exception_dropped_when_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto outer_sc = co_await lf::child_scope();
+  co_await outer_sc.call_drop(inner_cancel_and_throw{}, outer_sc, count);
+  co_await outer_sc.join();
+  co_return count.load() == 1;
+}
+
+struct just_throw_and_count {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::atomic<int> &count) -> lf::task<void, Context> {
+    count.fetch_add(1);
+    throw std::runtime_error("sibling exception");
+    co_return;
+  }
+};
+
+struct inner_sibling_throws_and_cancel {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, lf::stop_source &my_cancel, std::atomic<int> &count)
+      -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    co_await sc.fork_drop(just_throw_and_count{}, count);
+    co_await sc.fork_drop(cancel_source{}, my_cancel, count);
+    co_await sc.join();   // stop requested; exceptions dropped
+    count.fetch_add(100); // must not be reached
+  }
+};
+
+template <typename Context>
+auto test_sibling_exception_dropped_when_cancelled(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto outer_sc = co_await lf::child_scope();
+  co_await outer_sc.call_drop(inner_sibling_throws_and_cancel{}, outer_sc, count);
+  co_await outer_sc.join();
+  auto c = count.load();
+  co_return c >= 2 && c < 100;
+}
+
+#endif // LF_COMPILER_EXCEPTIONS
+
+// ============================================================
+// G. Nested child_scope chain propagation.
+//
+//    A child_scope created inside a task that runs under another child_scope
+//    has m_parent pointing to the outer scope's stop_source.  Stopping the
+//    outer source propagates through the chain, making the inner scope's
+//    stop_requested() return true (path A).
+// ============================================================
+
+struct inner_with_nested_scope {
+  template <typename Context>
+  static auto
+  operator()(lf::env<Context>, lf::stop_source &outer, std::atomic<int> &count) -> lf::task<void, Context> {
+    auto inner_sc = co_await lf::child_scope();
+    // Cancel the outer scope; inner_sc.m_parent == &outer, so the chain fires.
+    outer.request_stop();
+    co_await inner_sc.fork_drop(count_up_void{}, count); // skipped: inner_sc is stopped
+    co_await inner_sc.join();                            // handle_stop
+    count.fetch_add(100);                                // must not be reached
+  }
+};
+
+template <typename Context>
+auto test_nested_child_scope_chain(lf::env<Context>) -> lf::task<bool, Context> {
+  std::atomic<int> count = 0;
+  auto outer_sc = co_await lf::child_scope();
+  co_await outer_sc.call_drop(inner_with_nested_scope{}, outer_sc, count);
+  co_await outer_sc.join();
+  co_return count.load() == 0;
+}
+
+// ============================================================
+// H. Stoppable receiver / pre-cancelled root.
+//
+//    Using recv_state<T, true> + recv.stop_source().request_stop() exercises
+//    the goto-cleanup fast path in root.cxx when stop is requested before the
+//    worker resumes the task.
+// ============================================================
+
+template <typename Context>
+auto pre_cancelled_root_fn(lf::env<Context>, std::atomic<bool> *ran) -> lf::task<void, Context> {
+  ran->store(true, std::memory_order_relaxed);
+  co_return;
+}
+
+// ============================================================
+// I. Stress tests: concurrent cancellation under contention.
+//
+//    These tests fork many tasks across multiple threads to maximize the
+//    probability of hitting the concurrent paths in final_suspend_full,
+//    final_suspend_trailing, and join_awaitable::await_suspend with
+//    stop_requested() == true.
+// ============================================================
+
+// --- Leaf task: bumps count by one (relaxed ordering) then returns.
+struct leaf_work {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::atomic<int> &count) -> lf::task<void, Context> {
+    count.fetch_add(1, std::memory_order_relaxed);
+    co_return;
+  }
+};
+
+// --- Fan-out many forks, one sibling cancels the scope mid-flight.
+//
+//     With enough forks and threads, some children will be in-flight when
+//     stop fires. This exercises:
+//       - final_suspend_trailing: child completes, wins join race, sees stop
+//       - final_suspend_full: iterative ancestor climb on stopped frames
+//       - awaitable::await_suspend: children launched after stop are skipped
+//       - join_awaitable: stop detected at join with steals > 0
+
+struct stress_fan_cancel_inner {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, lf::stop_source &my_stop, std::atomic<int> &count, int width)
+      -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+
+    // Fork width children; the middle one (i == width / 2) requests stop on my_stop.
+    for (int i = 0; i < width; ++i) {
+      if (i == width / 2) {
+        co_await sc.fork_drop(cancel_source{}, my_stop, count);
+      } else {
+        co_await sc.fork_drop(leaf_work{}, count);
+      }
+    }
+    co_await sc.join();
+    // Should not be reached — cancel_source fired mid-fan.
+    count.fetch_add(10000, std::memory_order_relaxed);
+  }
+};
+
+template <typename Context>
+auto stress_fan_cancel(lf::env<Context>, int width) -> lf::task<void, Context> {
+  std::atomic<int> count = 0;
+  auto outer = co_await lf::child_scope();
+  co_await outer.call_drop(stress_fan_cancel_inner{}, outer, count, width);
+  co_await outer.join();
+}
+
+// --- Deep recursive fork tree with cancellation at a specific depth.
+//
+//     This creates a binary tree of forks. When a node at the target depth
+//     fires, it cancels the scope. This stresses:
+//       - final_suspend_full loop: many frames in the ancestor chain may be
+//         stopped, causing iterative climbing
+//       - final_suspend_trailing: stolen forks completing concurrently
+//       - Stack ownership transfer under cancellation
+
+struct tree_cancel_node {
+  template <typename Context>
+  static auto
+  operator()(lf::env<Context>, lf::stop_source &root_stop, std::atomic<int> &count, int depth, int cancel_at)
+      -> lf::task<void, Context> {
+    count.fetch_add(1, std::memory_order_relaxed);
+
+    if (depth <= 0) {
+      co_return;
+    }
+
+    if (depth == cancel_at) {
+      root_stop.request_stop();
+      co_return;
+    }
+
+    auto sc = co_await lf::scope();
+    co_await sc.fork_drop(tree_cancel_node{}, root_stop, count, depth - 1, cancel_at);
+    co_await sc.fork_drop(tree_cancel_node{}, root_stop, count, depth - 1, cancel_at);
+    co_await sc.join();
+  }
+};
+
+template <typename Context>
+auto stress_tree_cancel(lf::env<Context>, int depth, int cancel_at) -> lf::task<void, Context> {
+  std::atomic<int> count = 0;
+  auto outer = co_await lf::child_scope();
+  co_await outer.call_drop(tree_cancel_node{}, outer, count, depth, cancel_at);
+  co_await outer.join();
+}
+
+// --- Repeated schedule + cancel: exercises root.cxx stop path and receiver.
+//
+//     Rapidly schedules tasks and immediately cancels them. The race between
+//     the worker picking up the task and the cancellation request stresses
+//     the root_pkg pre-cancelled path and final_suspend from root frames.
+
+struct busy_leaf {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<void, Context> {
+    co_return;
+  }
+};
+
+// --- Many-fork cancel with nested scopes under one child_scope.
+//
+//     Under an outer child_scope, the orchestrator forks `width` tasks. Each
+//     inner task opens its own lf::scope() and forks `width` leaves. Mid-way,
+//     the orchestrator requests stop on the outer child_scope.
+//     This tests chain propagation under concurrent fork completion, hitting
+//     final_suspend_full's iterative climb through multiple nested stopped frames.
+
+struct nested_inner_worker {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::atomic<int> &count, int width) -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    for (int i = 0; i < width; ++i) {
+      co_await sc.fork_drop(leaf_work{}, count);
+    }
+    co_await sc.join();
+  }
+};
+
+struct nested_cancel_orchestrator {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, lf::stop_source &root_stop, std::atomic<int> &count, int width)
+      -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    for (int i = 0; i < width; ++i) {
+      if (i == width / 2) {
+        // Cancel after forking half the work
+        root_stop.request_stop();
+      }
+      co_await sc.fork_drop(nested_inner_worker{}, count, width);
+    }
+    co_await sc.join();
+    // Should not be reached
+    count.fetch_add(100000, std::memory_order_relaxed);
+  }
+};
+
+template <typename Context>
+auto stress_nested_cancel(lf::env<Context>, int width) -> lf::task<void, Context> {
+  std::atomic<int> count = 0;
+  auto outer = co_await lf::child_scope();
+  co_await outer.call_drop(nested_cancel_orchestrator{}, outer, count, width);
+  co_await outer.join();
+}
+
+// ============================================================
+// Run all tests against a given scheduler
+// ============================================================
+
+template <typename Sch>
+void tests(Sch &scheduler) {
+
+  using Ctx = lf::context_t<Sch>;
+
+  SECTION("call_drop: pre-cancelled child is not run") {
+    auto recv = schedule(scheduler, test_call_drop_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("call: pre-cancelled child is not run, return address not written") {
+    auto recv = schedule(scheduler, test_call_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("fork_drop: pre-cancelled child is not run") {
+    auto recv = schedule(scheduler, test_fork_drop_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("fork: pre-cancelled child is not run, return address not written") {
+    auto recv = schedule(scheduler, test_fork_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("call: positive - not cancelled, child runs and writes result") {
+    auto recv = schedule(scheduler, test_call_not_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("fork: positive - not cancelled, child runs and writes result") {
+    auto recv = schedule(scheduler, test_fork_not_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("multiple fork_drops: all pre-cancelled, none run") {
+    auto recv = schedule(scheduler, test_multiple_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("mixed scopes: only non-cancelled children run") {
+    auto recv = schedule(scheduler, test_mixed_cancel<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("call_drop/fork_drop: skipped when parent frame is cancelled; join fires handle_cancel") {
+    auto recv = schedule(scheduler, test_call_parent_stop_source<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("fork child cancels parent stop source; join detects cancel via handle_cancel") {
+    auto recv = schedule(scheduler, test_fork_cancel_at_join<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("nested child_scope: stopping outer scope propagates to inner via chain") {
+    auto recv = schedule(scheduler, test_nested_child_scope_chain<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("stoppable receiver: recv_state + request_stop completes cleanly") {
+    std::atomic<bool> ran = false;
+    lf::recv_state<void, true> state;
+    auto recv = lf::schedule(scheduler, std::move(state), pre_cancelled_root_fn<Ctx>, &ran);
+    REQUIRE(recv.valid());
+    recv.stop_source().request_stop();
+
+#if LF_COMPILER_EXCEPTIONS
+    REQUIRE_THROWS_AS(std::move(recv).get(), lf::operation_cancelled_error);
+#else
+    recv.wait();
+#endif
+
+    // The task body may or may not have run depending on scheduler timing;
+    // what matters is that the receiver completes: get() reports cancellation, or wait() returns.
+    std::ignore = ran.load();
+  }
+
+  // --- Stress tests (paths C/D/E under contention) ---
+
+  SECTION("stress: fan-out cancel, width=16") {
+    auto recv = schedule(scheduler, stress_fan_cancel<Ctx>, 16);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: fan-out cancel, width=64") {
+    auto recv = schedule(scheduler, stress_fan_cancel<Ctx>, 64);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: tree cancel depth=6, cancel at depth=3") {
+    auto recv = schedule(scheduler, stress_tree_cancel<Ctx>, 6, 3);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: tree cancel depth=8, cancel at depth=1 (near leaf)") {
+    auto recv = schedule(scheduler, stress_tree_cancel<Ctx>, 8, 1);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: tree cancel depth=8, cancel at depth=7 (near root)") {
+    auto recv = schedule(scheduler, stress_tree_cancel<Ctx>, 8, 7);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: nested child_scope cancel, width=8") {
+    auto recv = schedule(scheduler, stress_nested_cancel<Ctx>, 8);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: nested child_scope cancel, width=32") {
+    auto recv = schedule(scheduler, stress_nested_cancel<Ctx>, 32);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("stress: rapid schedule + cancel") {
+    for (int i = 0; i < 50; ++i) {
+      lf::recv_state<void, true> state;
+      auto recv = lf::schedule(scheduler, std::move(state), busy_leaf{});
+      recv.stop_source().request_stop();
+#if LF_COMPILER_EXCEPTIONS
+      REQUIRE_THROWS_AS(std::move(recv).get(), lf::operation_cancelled_error);
+#else
+      recv.wait();
+#endif
+    }
+  }
+
+#if LF_COMPILER_EXCEPTIONS
+
+  SECTION("exception propagates through join when frame is NOT cancelled") {
+    auto recv = schedule(scheduler, test_exception_propagates<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+  }
+
+  SECTION("exception in cancelled frame is dropped by handle_cancel; recv.get() does not throw") {
+    auto recv = schedule(scheduler, test_exception_dropped_when_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+  SECTION("sibling exception dropped when another sibling cancels the frame") {
+    auto recv = schedule(scheduler, test_sibling_exception_dropped_when_cancelled<Ctx>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get());
+  }
+
+#endif // LF_COMPILER_EXCEPTIONS
+}
+
+using mono_inline_ctx = lf::mono_context<lf::geometric_stack<>, lf::adapt_vector<>>;
+using poly_inline_ctx = lf::derived_poly_context<lf::geometric_stack<>, lf::adapt_vector<>>;
+
+} // namespace
+
+TEMPLATE_TEST_CASE("Inline cancel", "[cancel]", mono_inline_ctx, poly_inline_ctx) {
+  lf::inline_scheduler<TestType> sch{};
+  tests(sch);
+}
+
+namespace {
+
+using mono_busy_thread_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_busy_thread_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+} // namespace
+
+TEMPLATE_TEST_CASE("Busy cancel", "[cancel]", mono_busy_thread_pool, poly_busy_thread_pool) {
+
+  STATIC_REQUIRE(lf::scheduler<TestType>);
+  // Run the shared cancellation suite once per constructor argument 1, 2 and 3.
+  for (std::size_t n_threads = 1; n_threads <= 3; n_threads++) {
+    DYNAMIC_SECTION("threads=" << n_threads) {
+      TestType pool{n_threads};
+      tests(pool);
+    }
+  }
+
+}
+
+namespace {
+
+// Stress tests repeated at higher thread counts to maximize contention.
+template <typename Sch>
+void stress_tests(Sch &scheduler) {
+  using Ctx = lf::context_t<Sch>;
+
+  SECTION("stress: repeated fan cancel") {
+    for (int rep = 0; rep < 20; ++rep) {
+      auto recv = schedule(scheduler, stress_fan_cancel<Ctx>, 32);
+      REQUIRE(recv.valid());
+      std::move(recv).get();
+    }
+  }
+
+  SECTION("stress: repeated tree cancel") {
+    for (int rep = 0; rep < 20; ++rep) {
+      auto recv = schedule(scheduler, stress_tree_cancel<Ctx>, 7, 3);
+      REQUIRE(recv.valid());
+      std::move(recv).get();
+    }
+  }
+
+  SECTION("stress: repeated nested cancel") {
+    for (int rep = 0; rep < 20; ++rep) {
+      auto recv = schedule(scheduler, stress_nested_cancel<Ctx>, 16);
+      REQUIRE(recv.valid());
+      std::move(recv).get();
+    }
+  }
+}
+
+} // namespace
+
+TEMPLATE_TEST_CASE("Busy cancel stress", "[cancel][stress]", mono_busy_thread_pool, poly_busy_thread_pool) {
+
+  STATIC_REQUIRE(lf::scheduler<TestType>);
+
+  // hardware_concurrency() may be 0 (unknown) or 1; clamp to [2, 8] so the loop always runs a section.
+  std::size_t max_thr = std::clamp<std::size_t>(std::thread::hardware_concurrency(), 2UZ, 8UZ);
+  for (std::size_t thr = 2; thr <= max_thr; thr *= 2) {
+    DYNAMIC_SECTION("threads=" << thr) {
+      TestType scheduler{thr};
+      stress_tests(scheduler);
+    }
+  }
+}
diff --git a/test/src/concepts.cpp b/test/src/concepts.cpp
new file mode 100644
index 000000000..5e02e1caa
--- /dev/null
+++ b/test/src/concepts.cpp
@@ -0,0 +1,427 @@
+#include <catch2/catch_test_macros.hpp>
+
+import std;
+
+import libfork;
+import libfork.utils;
+
+using namespace lf;
+
+using test_stack = geometric_stack<>;
+using test_context = mono_context<test_stack, adapt_vector<>>;
+
+TEST_CASE("Concepts: atomicable", "[concepts]") {
+  STATIC_REQUIRE(atomicable<std::byte>);
+  STATIC_REQUIRE(atomicable<void *>);
+
+  struct trivial {
+    int x;
+    float y;
+  };
+
+  STATIC_REQUIRE(atomicable<trivial>);
+
+  STATIC_REQUIRE_FALSE(atomicable<std::string>);
+  STATIC_REQUIRE_FALSE(atomicable<const int>);
+  STATIC_REQUIRE_FALSE(atomicable<int &>);
+}
+
+TEST_CASE("Concepts: lock_free", "[concepts]") {
+  STATIC_REQUIRE(lock_free<std::byte>);
+  STATIC_REQUIRE(lock_free<void *>);
+}
+
+namespace {
+
+template <typename... T>
+struct my_template {};
+
+} // namespace
+
+TEST_CASE("Concepts: specialization_of", "[concepts]") {
+  STATIC_REQUIRE(specialization_of<std::vector<int>, std::vector>);
+  STATIC_REQUIRE(specialization_of<my_template<int, float>, my_template>);
+  STATIC_REQUIRE(specialization_of<task<int, test_context>, task>);
+
+  STATIC_REQUIRE_FALSE(specialization_of<int, std::vector>);
+  STATIC_REQUIRE_FALSE(specialization_of<std::vector<int>, my_template>);
+}
+
+TEST_CASE("Concepts: returnable", "[concepts]") {
+  STATIC_REQUIRE(returnable<void>);
+  STATIC_REQUIRE(returnable<int>);
+  STATIC_REQUIRE(returnable<std::unique_ptr<int>>);
+
+  struct non_movable {
+    non_movable() = default;
+    non_movable(const non_movable &) = delete;
+    non_movable(non_movable &&) = delete;
+  };
+
+  STATIC_REQUIRE_FALSE(returnable<non_movable>);
+}
+
+TEST_CASE("Concepts: worker_stack", "[concepts]") {
+  STATIC_REQUIRE(worker_stack<test_stack>);
+
+  struct bad_stack {
+    struct ckpt {
+      auto operator==(ckpt const &) const -> bool = default;
+    };
+    static auto push(std::size_t) -> void *;
+    static auto pop(void *, std::size_t) -> void; // missing noexcept
+    static auto checkpoint() noexcept -> ckpt;
+    static auto prepare_release() noexcept -> int;
+    static auto release(int) noexcept -> void;
+    static auto acquire(ckpt const &) noexcept -> void;
+  };
+
+  STATIC_REQUIRE_FALSE(worker_stack<bad_stack>);
+}
+
+TEST_CASE("Concepts: worker_context", "[concepts]") {
+  STATIC_REQUIRE(worker_context<test_context>);
+
+  struct missing_push {
+    auto pop() noexcept -> lf::steal_handle<missing_push>;
+    auto stack() noexcept -> test_stack &;
+  };
+
+  STATIC_REQUIRE_FALSE(worker_context<missing_push>);
+}
+
+TEST_CASE("Concepts: async_invocable", "[concepts]") {
+
+  auto async_fn_env(env<test_context>, int) -> task<int, test_context>;
+  auto async_fn_no_env(int) -> task<int, test_context>;
+  auto not_async_fn(int) -> int;
+
+  struct both_invocable {
+    auto operator()(env<test_context>, int) const -> task<int, test_context>;
+    auto operator()(int) const -> task<double, test_context>;
+  };
+
+  // Basic positive cases
+  STATIC_REQUIRE(async_invocable<decltype(async_fn_env), test_context, int>);
+  STATIC_REQUIRE(async_invocable<decltype(async_fn_no_env), test_context, int>);
+
+  // Arg mismatch
+  STATIC_REQUIRE_FALSE(async_invocable<decltype(async_fn_env), test_context, int *>);
+  STATIC_REQUIRE_FALSE(async_invocable<decltype(async_fn_no_env), test_context, double *>);
+
+  // Result type check
+  STATIC_REQUIRE(std::same_as<async_result_t<decltype(async_fn_env), test_context, int>, int>);
+
+  // Preference check: when both are available, it should pick the one with env
+  // and return int task.
+  STATIC_REQUIRE(async_invocable_to<both_invocable, int, test_context, int>);
+  // Verification that it didn't pick the double one
+  STATIC_REQUIRE_FALSE(async_invocable_to<both_invocable, double, test_context, int>);
+
+  // Fails unless return is a task
+  STATIC_REQUIRE_FALSE(async_invocable<decltype(not_async_fn), test_context, int>);
+
+  // Need a valid context of a different type
+  struct mock_context {
+    void push(lf::steal_handle<mock_context>);
+    void post(lf::sched_handle<mock_context>);
+    auto pop() noexcept -> lf::steal_handle<mock_context>;
+    auto stack() noexcept -> test_stack &;
+  };
+
+  STATIC_REQUIRE(worker_context<mock_context>);
+
+  // Fails because the result task's context doesn't match the provided context
+  STATIC_REQUIRE_FALSE(async_invocable<decltype(async_fn_no_env), mock_context, int>);
+}
+
+TEST_CASE("Concepts: async_nothrow_invocable", "[concepts]") {
+
+  struct nothrow_async {
+    auto operator()(int) const noexcept -> task<int, test_context>;
+  };
+
+  struct throwing_async {
+    auto operator()(int) const -> task<int, test_context>;
+  };
+
+  STATIC_REQUIRE(async_nothrow_invocable<nothrow_async, test_context, int>);
+  STATIC_REQUIRE_FALSE(async_nothrow_invocable<throwing_async, test_context, int>);
+}
+
+namespace {
+
+using semigroup_iter = std::vector<int>::iterator;
+using semigroup_const_iter = std::vector<int>::const_iterator;
+
+struct semigroup_context {
+  void push(lf::steal_handle<semigroup_context>);
+  void post(lf::sched_handle<semigroup_context>);
+  auto pop() noexcept -> lf::steal_handle<semigroup_context>;
+  auto stack() noexcept -> test_stack &;
+};
+
+static_assert(worker_context<semigroup_context>);
+
+struct semigroup_bad_context {};
+
+struct semigroup_acc {
+  semigroup_acc() = default;
+  explicit semigroup_acc(int);
+};
+
+struct semigroup_non_default {
+  semigroup_non_default() = delete;
+  explicit semigroup_non_default(int);
+  semigroup_non_default(semigroup_non_default const &) = default;
+  semigroup_non_default(semigroup_non_default &&) = default;
+  auto operator=(semigroup_non_default const &) -> semigroup_non_default & = default;
+  auto operator=(semigroup_non_default &&) -> semigroup_non_default & = default;
+};
+
+static_assert(returnable<semigroup_non_default>);
+static_assert(!std::default_initializable<semigroup_non_default>);
+
+struct sync_int_semigroup {
+  auto operator()(int, int) const -> int;
+};
+
+struct sync_acc_semigroup {
+  auto operator()(int, int) const -> semigroup_acc;
+  auto operator()(int, semigroup_acc) const -> semigroup_acc;
+  auto operator()(semigroup_acc, int) const -> semigroup_acc;
+  auto operator()(semigroup_acc, semigroup_acc) const -> semigroup_acc;
+};
+
+struct sync_mutable_ref_semigroup {
+  auto operator()(int &, int &) const -> semigroup_acc;
+  auto operator()(int &, semigroup_acc) const -> semigroup_acc;
+  auto operator()(semigroup_acc, int &) const -> semigroup_acc;
+  auto operator()(semigroup_acc, semigroup_acc) const -> semigroup_acc;
+};
+
+struct sync_missing_mixed {
+  auto operator()(int, int) const -> semigroup_acc;
+  auto operator()(semigroup_acc, semigroup_acc) const -> semigroup_acc;
+};
+
+struct sync_wrong_mixed_return {
+  auto operator()(int, int) const -> semigroup_acc;
+  auto operator()(int, semigroup_acc) const -> int;
+  auto operator()(semigroup_acc, int) const -> semigroup_acc;
+  auto operator()(semigroup_acc, semigroup_acc) const -> semigroup_acc;
+};
+
+struct sync_not_copyable {
+  sync_not_copyable() = default;
+  sync_not_copyable(sync_not_copyable const &) = delete;
+  sync_not_copyable(sync_not_copyable &&) = default;
+  auto operator()(int, int) const -> int;
+};
+
+struct sync_project_int {
+  auto operator()(int const &) const -> int;
+};
+
+using sync_projected_iter = projected<test_context, semigroup_iter, sync_project_int>;
+
+struct async_project_int {
+  template <typename Context>
+  static auto operator()(env<Context>, int const &) -> task<int, Context>;
+};
+
+using async_projected_iter = projected<test_context, semigroup_iter, async_project_int>;
+
+struct async_int_semigroup {
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<int, Context>;
+};
+
+struct async_acc_semigroup {
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<semigroup_acc, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, int, semigroup_acc) -> task<semigroup_acc, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, semigroup_acc, int) -> task<semigroup_acc, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, semigroup_acc, semigroup_acc) -> task<semigroup_acc, Context>;
+};
+
+struct async_missing_mixed {
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<semigroup_acc, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, semigroup_acc, semigroup_acc) -> task<semigroup_acc, Context>;
+};
+
+struct async_wrong_mixed_return {
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<semigroup_acc, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, int, semigroup_acc) -> task<int, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, semigroup_acc, int) -> task<semigroup_acc, Context>;
+  template <typename Context>
+  static auto operator()(env<Context>, semigroup_acc, semigroup_acc) -> task<semigroup_acc, Context>;
+};
+
+struct async_wrong_context {
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<int, semigroup_context>;
+};
+
+struct async_not_copyable {
+  async_not_copyable() = default;
+  async_not_copyable(async_not_copyable const &) = delete;
+  async_not_copyable(async_not_copyable &&) = default;
+
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<int, Context>;
+};
+
+struct hybrid_semigroup {
+  auto operator()(int, int) const -> long;
+
+  template <typename Context>
+  static auto operator()(env<Context>, int, int) -> task<double, Context>;
+};
+
+} // namespace
+
+TEST_CASE("Concepts: sync indirect_semigroup", "[concepts]") {
+  STATIC_REQUIRE(lf::sync::indirect_semigroup<sync_int_semigroup, semigroup_iter>);
+  STATIC_REQUIRE(lf::sync::indirect_semigroup<sync_int_semigroup, semigroup_const_iter>);
+  STATIC_REQUIRE(lf::sync::indirect_semigroup<sync_acc_semigroup, semigroup_const_iter>);
+  STATIC_REQUIRE(lf::sync::indirect_semigroup<sync_int_semigroup, sync_projected_iter>);
+  STATIC_REQUIRE(lf::sync::indirect_semigroup<sync_int_semigroup, async_projected_iter>);
+
+  // A mutable-reference-only callable works for mutable iterators where
+  // indirect_value_t and iter_reference_t are both int&.
+  STATIC_REQUIRE(lf::sync::indirect_semigroup<sync_mutable_ref_semigroup, semigroup_iter>);
+
+  // For const_iterator, indirect_value_t is int& but iter_reference_t is const int&,
+  // so the iter_reference_t checks reject mutable-only binary operations.
+  STATIC_REQUIRE_FALSE(lf::sync::indirect_semigroup<sync_mutable_ref_semigroup, semigroup_const_iter>);
+
+  STATIC_REQUIRE_FALSE(lf::sync::indirect_semigroup<sync_missing_mixed, semigroup_const_iter>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirect_semigroup<sync_wrong_mixed_return, semigroup_const_iter>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirect_semigroup<sync_not_copyable, semigroup_iter>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirect_semigroup<sync_int_semigroup, int>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirect_semigroup<async_int_semigroup, semigroup_iter>);
+}
+
+TEST_CASE("Concepts: async indirect_semigroup", "[concepts]") {
+  STATIC_REQUIRE(lf::async::indirect_semigroup<async_int_semigroup, test_context, semigroup_iter>);
+  STATIC_REQUIRE(lf::async::indirect_semigroup<async_int_semigroup, test_context, semigroup_const_iter>);
+  STATIC_REQUIRE(lf::async::indirect_semigroup<async_acc_semigroup, test_context, semigroup_const_iter>);
+  STATIC_REQUIRE(lf::async::indirect_semigroup<async_int_semigroup, test_context, sync_projected_iter>);
+  STATIC_REQUIRE(lf::async::indirect_semigroup<async_int_semigroup, test_context, async_projected_iter>);
+
+  STATIC_REQUIRE_FALSE(lf::async::indirect_semigroup<sync_int_semigroup, test_context, semigroup_iter>);
+  STATIC_REQUIRE_FALSE(
+      lf::async::indirect_semigroup<async_missing_mixed, test_context, semigroup_const_iter>);
+  STATIC_REQUIRE_FALSE(
+      lf::async::indirect_semigroup<async_wrong_mixed_return, test_context, semigroup_const_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirect_semigroup<async_wrong_context, test_context, semigroup_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirect_semigroup<async_not_copyable, test_context, semigroup_iter>);
+  STATIC_REQUIRE_FALSE(
+      lf::async::indirect_semigroup<async_int_semigroup, semigroup_bad_context, semigroup_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirect_semigroup<async_int_semigroup, test_context, int>);
+}
+
+TEST_CASE("Concepts: indirect_semigroup dispatch and result type", "[concepts]") {
+  STATIC_REQUIRE(indirect_semigroup<sync_int_semigroup, semigroup_bad_context, semigroup_iter>);
+  STATIC_REQUIRE(indirect_semigroup<async_int_semigroup, test_context, semigroup_iter>);
+  STATIC_REQUIRE(indirect_semigroup<hybrid_semigroup, test_context, semigroup_iter>);
+
+  STATIC_REQUIRE_FALSE(indirect_semigroup<async_int_semigroup, semigroup_bad_context, semigroup_iter>);
+  STATIC_REQUIRE_FALSE(indirect_semigroup<sync_missing_mixed, test_context, semigroup_const_iter>);
+
+  STATIC_REQUIRE(
+      std::same_as<indirect_semigroup_t<sync_int_semigroup, semigroup_bad_context, semigroup_iter>, int>);
+  STATIC_REQUIRE(std::same_as<indirect_semigroup_t<sync_acc_semigroup, semigroup_bad_context, semigroup_iter>,
+                              semigroup_acc>);
+  STATIC_REQUIRE(std::same_as<indirect_semigroup_t<async_int_semigroup, test_context, semigroup_iter>, int>);
+
+  // When both branches match, the async-specialized result type wins.
+  STATIC_REQUIRE(std::same_as<indirect_semigroup_t<hybrid_semigroup, test_context, semigroup_iter>, double>);
+}
+
+namespace {
+
+struct plain_awaitable {};
+
+struct member_co_await {
+  auto operator co_await() -> plain_awaitable;
+};
+
+struct free_co_await {};
+
+[[maybe_unused]]
+auto operator co_await(free_co_await) -> plain_awaitable &;
+
+struct both_co_await {
+  auto operator co_await() -> plain_awaitable;
+};
+
+[[maybe_unused]]
+auto operator co_await(both_co_await) -> plain_awaitable;
+
+template <typename T>
+concept can_acquire = requires (T &&t) { acquire_awaitable(std::forward<T>(t)); };
+
+} // namespace
+
+TEST_CASE("Concepts: awaitable_acquirable", "[concepts]") {
+  // Generic identity overload accepts any type — even non-awaiters.
+  STATIC_REQUIRE(can_acquire<plain_awaitable>);
+  STATIC_REQUIRE(can_acquire<int>);
+
+  // A single operator co_await — member or free — disambiguates the dispatch.
+  STATIC_REQUIRE(can_acquire<member_co_await>);
+  STATIC_REQUIRE(can_acquire<free_co_await>);
+
+  // Defining BOTH member and free operator co_await leaves the dispatch ambiguous.
+  STATIC_REQUIRE_FALSE(can_acquire<both_co_await>);
+}
+
+TEST_CASE("acquire_awaitable", "[concepts]") {
+  // No operator co_await: returns the argument unchanged, preserving value category.
+  using acq_plain_xref = decltype(acquire_awaitable(std::declval<plain_awaitable>()));
+  using acq_plain_lref = decltype(acquire_awaitable(std::declval<plain_awaitable &>()));
+  using acq_plain_cref = decltype(acquire_awaitable(std::declval<plain_awaitable const &>()));
+
+  STATIC_REQUIRE(std::same_as<acq_plain_xref, plain_awaitable &&>);
+  STATIC_REQUIRE(std::same_as<acq_plain_lref, plain_awaitable &>);
+  STATIC_REQUIRE(std::same_as<acq_plain_cref, plain_awaitable const &>);
+
+  // Member operator co_await: returns whatever the member call produces.
+  STATIC_REQUIRE(std::same_as<decltype(acquire_awaitable(std::declval<member_co_await>())), plain_awaitable>);
+
+  // Free operator co_await: returns whatever the ADL-found free call produces.
+  STATIC_REQUIRE(std::same_as<decltype(acquire_awaitable(std::declval<free_co_await>())), plain_awaitable &>);
+}
+
+namespace {
+
+struct lf_awaitable {
+  auto await_ready() -> bool;
+  auto await_suspend(lf::sched_handle<test_context>, test_context &) -> void;
+  auto await_resume() -> void;
+};
+
+struct bad_lf_awaitable {
+  auto await_ready() -> bool;
+  auto await_suspend(lf::sched_handle<test_context>, int &) -> void; // Wrong context type
+  auto await_resume() -> void;
+};
+
+} // namespace
+
+TEST_CASE("Concepts: awaitable_impl", "[concepts]") {
+
+  STATIC_REQUIRE(lf::awaitable<lf_awaitable, test_context>);
+
+  STATIC_REQUIRE_FALSE(lf::awaitable<bad_lf_awaitable, test_context>);
+}
diff --git a/test/src/deque.cpp b/test/src/deque.cpp
new file mode 100644
index 000000000..f23fcca92
--- /dev/null
+++ b/test/src/deque.cpp
@@ -0,0 +1,210 @@
+#include <catch2/catch_test_macros.hpp>
+
+import std;
+
+import libfork;
+
+using namespace lf;
+
+TEST_CASE("Deque: Concepts", "[deque]") {
+  STATIC_REQUIRE(dequeable<int>); // Arithmetic element types are accepted
+  STATIC_REQUIRE(dequeable<double>);
+  STATIC_REQUIRE_FALSE(dequeable<std::string>); // Not trivially copyable
+}
+
+TEST_CASE("Deque: Single thread as stack", "[deque]") {
+  // A small deque suffices: at most 10 items are held at once
+  lf::deque<int> deque{16};
+
+  REQUIRE(deque.empty());
+  // push() is checked to return i, which here equals the number of items already held
+  for (int i = 0; i < 10; ++i) {
+    auto pre = deque.push(i);
+    REQUIRE(pre == i);
+    REQUIRE(deque.ssize() == i + 1);
+  }
+  // The owner pops in LIFO order (9 down to 0), i.e. the deque behaves like a stack
+  for (int i = 9; i >= 0; --i) {
+    auto item = deque.pop();
+    REQUIRE(item);
+    REQUIRE(*item == i);
+  }
+
+  REQUIRE(deque.empty());
+}
+
+TEST_CASE("Deque: Custom pop when_empty", "[deque]") {
+  lf::deque<int> deque{2};
+  // Empty deque: pop(f) yields the callback's result
+  auto result = deque.pop([]() {
+    return -1;
+  });
+  REQUIRE(result == -1);
+  // One item held: pop(f) yields that item rather than the fallback
+  deque.push(42);
+  result = deque.pop([]() {
+    return -1;
+  });
+  REQUIRE(result == 42);
+  // Empty again: back to the fallback value
+  result = deque.pop([]() {
+    return -1;
+  });
+  REQUIRE(result == -1);
+}
+
+namespace {
+
+void test_deque(std::size_t n_pushes, std::size_t n_consumers, bool do_pop) {
+  // The owner (this thread) pushes 0..n_pushes-1, popping at random if do_pop, while n_consumers thieves
+  // steal concurrently; afterwards every value must have been removed exactly once. The tested queue:
+  lf::deque<std::uint64_t> deque{n_pushes};
+
+  // To store removed elements
+  std::vector<std::uint64_t> pops{};
+  std::vector<std::vector<std::uint64_t>> steals(n_consumers);
+
+  // Sync: `done` lets thieves exit once the deque is empty; `start` releases thieves and owner together
+  std::atomic_flag done{};
+  std::latch start{static_cast<std::ptrdiff_t>(n_consumers + 1)};
+
+  // Owner-side randomness: in do_pop mode one step in five (dist == 0) is a pop instead of a push
+  std::mt19937_64 rng{std::random_device{}()};
+  std::uniform_int_distribution<std::size_t> dist{0, 4};
+
+  std::vector<std::thread> consumers;
+
+  // Consumers steal items and write to their respective vectors
+  for (std::size_t i = 0; i < n_consumers; ++i) {
+    consumers.emplace_back([&, i] {
+      // Wait for initialization to complete
+      start.arrive_and_wait();
+
+      for (;;) {
+        auto [err, item] = deque.thief().steal();
+
+        switch (err) {
+          case lf::err::none:
+            steals[i].push_back(item);
+            [[fallthrough]];
+          case lf::err::lost: // Presumably a lost race with another remover: simply retry
+            break;
+          case lf::err::empty:
+            if (done.test()) { // Owner has found the deque empty and signalled completion
+              return;
+            }
+            std::this_thread::yield();
+        }
+      }
+    });
+  }
+
+  // Setup: the producer writes the first n/2 items before any consumer is released
+  for (std::size_t i = 0; i < n_pushes / 2; ++i) {
+    deque.push(i);
+  }
+
+  // Start the consumers
+  start.arrive_and_wait();
+
+  // Push the remaining items while the thieves run (interleaved with random pops if do_pop)
+  if (do_pop) {
+    for (std::size_t i = n_pushes / 2; i < n_pushes;) {
+      if (dist(rng) == 0) {
+        if (auto item = deque.pop()) {
+          pops.push_back(*item);
+        }
+      } else {
+        deque.push(i);
+        ++i;
+      }
+    }
+  } else {
+    for (std::size_t i = n_pushes / 2; i < n_pushes; ++i) {
+      deque.push(i);
+    }
+  }
+
+  // Drain the queue
+  if (do_pop) {
+    for (;;) {
+      if (auto item = deque.pop()) {
+        pops.push_back(*item);
+      } else {
+        if (done.test_and_set()) { // First empty pop sets the flag; the next empty pop sees it and exits
+          break;
+        }
+      }
+    }
+  } else {
+    while (!deque.empty()) { // Only thieves remove items on this path, so it needs n_consumers > 0
+      std::this_thread::yield();
+    }
+    done.test_and_set();
+  }
+
+  // Stop the consumers
+  for (auto &c : consumers) {
+    c.join();
+  }
+
+  // Verify ascending for each thief (values are pushed in increasing order)
+  for (auto &&deq : steals) {
+    REQUIRE(std::ranges::is_sorted(deq));
+  }
+
+  // Accumulate all items
+  std::vector<std::uint64_t> all = pops;
+
+  for (auto &&deq : steals) {
+    all.insert(all.end(), deq.begin(), deq.end());
+  }
+  // Nothing lost and nothing duplicated: once sorted, the removed values are exactly 0..n_pushes-1
+  REQUIRE(all.size() == n_pushes);
+
+  std::ranges::sort(all);
+
+  for (std::size_t i = 0; i < n_pushes; ++i) {
+    REQUIRE(all[i] == i);
+  }
+}
+
+constexpr std::size_t max_elements = 1uz << 20;
+
+} // namespace
+
+TEST_CASE("Deque: Single threaded", "[deque]") { // No thieves: the owner pushes and pops everything
+  for (std::size_t i = 1; i <= max_elements; i <<= 1) { // Powers of two from 1 to 2^20
+    DYNAMIC_SECTION("Elements: " << i) { test_deque(i, 0, true); }
+  }
+}
+
+TEST_CASE("Deque: SPSC no-pop", "[deque]") { // One thief removes everything; the owner only pushes
+  for (std::size_t i = 1; i <= max_elements; i <<= 1) { // Powers of two from 1 to 2^20
+    DYNAMIC_SECTION("Elements: " << i) { test_deque(i, 1, false); }
+  }
+}
+
+TEST_CASE("Deque: SPSC with-pop", "[deque]") { // One thief steals while the owner also pops
+  for (std::size_t i = 1; i <= max_elements; i <<= 1) { // Powers of two from 1 to 2^20
+    DYNAMIC_SECTION("Elements: " << i) { test_deque(i, 1, true); }
+  }
+}
+
+TEST_CASE("Deque: MPSC no-pop", "[deque]") { // Several thieves remove everything; the owner only pushes
+  unsigned int max_threads = std::clamp(std::thread::hardware_concurrency(), 2u, 4u); // 0/1 ran nothing
+  for (std::size_t i = 1; i <= (max_elements >> 2); i <<= 1) { // Powers of two from 1 to 2^18
+    for (std::size_t j = 2; j <= max_threads; ++j) { // NOTE(review): 1 producer + j thieves is SPMC
+      DYNAMIC_SECTION("Elements: " << i << " Consumers: " << j) { test_deque(i, j, false); }
+    }
+  }
+}
+
+TEST_CASE("Deque: MPSC with-pop", "[deque]") { // Several thieves steal while the owner also pops
+  unsigned int max_threads = std::clamp(std::thread::hardware_concurrency(), 2u, 4u); // 0/1 ran nothing
+  for (std::size_t i = 1; i <= (max_elements >> 2); i <<= 1) { // Powers of two from 1 to 2^18
+    for (std::size_t j = 2; j <= max_threads; ++j) { // NOTE(review): 1 producer + j thieves is SPMC
+      DYNAMIC_SECTION("Elements: " << i << " Consumers: " << j) { test_deque(i, j, true); }
+    }
+  }
+}
diff --git a/test/src/explicit_scheduling.cpp b/test/src/explicit_scheduling.cpp
new file mode 100644
index 000000000..416c7893e
--- /dev/null
+++ b/test/src/explicit_scheduling.cpp
@@ -0,0 +1,766 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+
+// ============================================================
+// Tests for the explicit scheduling machinery (see
+// docs/explicit-scheduling.md). The goal is to exercise:
+//
+//   1. switch_awaitable's stack-handoff & "effectively stolen"
+//      drain logic across all interesting suspension shapes
+//      (root, child, before/after fork, recursion).
+//   2. Concept conformance for `lf::awaitable`.
+//   3. Exception propagation from both `await_suspend` (before
+//      publishing) and `await_resume` (after the hop).
+//
+// Where possible, tests verify *which pool* the task resumed on
+// (not just "not the caller thread"). To do so we precompute the
+// worker-thread-id set of each pool by submitting a barrier-blocked
+// task per worker and capturing its TID.
+// ============================================================
+
+namespace {
+
+// ---- Pool aliases ---------------------------------------------------------
+
+using mono_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+constexpr std::array<std::size_t, 3> k_worker_counts{1, 2, 4};
+
+// ---- Reference fib --------------------------------------------------------
+
+[[nodiscard]]
+auto fib_ref(std::int64_t n) -> std::int64_t {
+  // Naive doubly-recursive Fibonacci, used as the sequential oracle for the
+  // parallel tests. Any n below 2 (negatives included) is returned unchanged.
+  bool const is_base_case = n < 2;
+  return is_base_case ? n : fib_ref(n - 1) + fib_ref(n - 2);
+}
+
+// ---- Worker TID discovery -------------------------------------------------
+//
+// To force every worker in `pool` to expose its TID, post N tasks (one per
+// worker) that each block on a shared barrier. Because each task blocks
+// before completing, every worker must dequeue exactly one task — once all
+// have recorded their TID and arrived, the barrier opens and each task
+// returns. The result set therefore contains exactly N distinct TIDs.
+
+struct record_tid_fn {
+  template <typename Context>
+  static auto
+  operator()(lf::env<Context>, std::barrier<> *sync, std::thread::id *out) -> lf::task<void, Context> {
+    *out = std::this_thread::get_id(); // Record the executing thread first ...
+    sync->arrive_and_wait(); // ... then block this thread until every participant has arrived
+    co_return;
+  }
+};
+
+template <typename Pool>
+[[nodiscard]]
+auto discover_worker_tids(Pool &pool, std::size_t n_workers) -> std::unordered_set<std::thread::id> {
+  std::barrier sync(static_cast<std::ptrdiff_t>(n_workers)); // One participant per expected worker
+  std::vector<std::thread::id> tids(n_workers); // Slot i is written only by task i
+  std::vector<lf::receiver<void>> recvs;
+  recvs.reserve(n_workers);
+  for (std::size_t i = 0; i < n_workers; ++i) {
+    recvs.push_back(lf::schedule(pool, record_tid_fn{}, &sync, &tids[i]));
+  }
+  for (auto &r : recvs) {
+    std::move(r).get(); // Wait for each task, so `tids` is complete before it is read below
+  }
+  std::unordered_set<std::thread::id> set(tids.begin(), tids.end());
+  REQUIRE(set.size() == n_workers); // Sanity: each worker ran exactly one barrier task.
+  return set;
+}
+
+// ============================================================
+// Awaitables
+// ============================================================
+
+// Plain awaitable: posts the suspended task to `target` and, if `resumed_on`
+// is non-null, stores the id of the thread it resumed on.
+template <typename Pool>
+struct hop_to {
+  Pool *target;
+  std::atomic<std::thread::id> *resumed_on = nullptr;
+  // Never ready, so every co_await goes through await_suspend.
+  auto await_ready() noexcept -> bool { return false; }
+  // Hands the handle to the target pool; the source context argument is unused.
+  auto await_suspend(lf::sched_handle<typename Pool::context_type> h,
+                     typename Pool::context_type & /*ctx*/) noexcept -> void {
+    target->post(h);
+  }
+  // Executes on whichever thread resumes the task.
+  auto await_resume() noexcept -> void {
+    if (resumed_on != nullptr) {
+      resumed_on->store(std::this_thread::get_id(), std::memory_order_relaxed);
+    }
+  }
+};
+
+// await_suspend throws *before* publishing the handle. Exercises
+// `prepare_release` reversibility — the parent must be resumed inline
+// with the exception live.
+template <typename Pool>
+struct hop_throw_in_suspend {
+  auto await_ready() noexcept -> bool { return false; }
+  auto await_suspend(lf::sched_handle<typename Pool::context_type>, typename Pool::context_type &) -> void {
+    LF_THROW(std::runtime_error{"suspend"}); // The handle is never posted anywhere
+  }
+  auto await_resume() noexcept -> void {}
+};
+
+// Successfully hops, then throws from `await_resume` on the destination
+// thread. Exception must propagate through the rest of the task.
+template <typename Pool>
+struct hop_throw_in_resume {
+  Pool *target;
+  auto await_ready() noexcept -> bool { return false; }
+  auto await_suspend(lf::sched_handle<typename Pool::context_type> h, typename Pool::context_type &) noexcept
+      -> void {
+    target->post(h); // The hop itself is performed normally
+  }
+  auto await_resume() -> void { LF_THROW(std::runtime_error{"resume"}); }
+};
+
+// Posts the handle from a *separate* detached thread, simulating an
+// I/O-style awaitable where the source worker can return before the
+// handle is published.
+template <typename Pool>
+struct hop_deferred_post {
+  Pool *target;
+  // NOTE(review): the helper thread is detached, so it may still be inside post() at pool teardown.
+  auto await_ready() noexcept -> bool { return false; }
+  auto await_suspend(lf::sched_handle<typename Pool::context_type> h, typename Pool::context_type &) noexcept
+      -> void {
+    std::thread([h, t = target]() mutable {
+      std::this_thread::sleep_for(std::chrono::milliseconds(2)); // Let await_suspend return first
+      t->post(h);
+    }).detach();
+  }
+  auto await_resume() noexcept -> void {}
+};
+
+// Wrapper whose *member* operator co_await yields a hop_to for `target` (no TID sink).
+template <typename Pool>
+struct hop_member_op {
+  Pool *target;
+  auto operator co_await() noexcept -> hop_to<Pool> { return hop_to<Pool>{target}; }
+};
+
+// Wrapper that relies on the *free* operator co_await defined just below.
+template <typename Pool>
+struct hop_free_op {
+  Pool *target;
+};
+// Non-member counterpart of hop_member_op::operator co_await: forwards the target to a hop_to.
+template <typename Pool>
+[[nodiscard]]
+auto operator co_await(hop_free_op<Pool> h) noexcept -> hop_to<Pool> {
+  return hop_to<Pool>{h.target};
+}
+
+// ============================================================
+// Task functors (all members templated on Context — no out-of-class
+// member-template definitions are needed because each functor only
+// references awaitables / functors declared above it).
+// ============================================================
+
+struct fib_child {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::int64_t n) -> lf::task<std::int64_t, Context> {
+    if (n < 2) { // Base case, same as fib_ref
+      co_return n;
+    }
+    std::int64_t lhs = 0;
+    std::int64_t rhs = 0;
+    auto sc = co_await lf::scope();
+    co_await sc.fork(&rhs, fib_child{}, n - 2); // Result is written through &rhs
+    co_await sc.call(&lhs, fib_child{}, n - 1); // Result is written through &lhs
+    co_await sc.join(); // lhs and rhs are only read after the join
+    co_return lhs + rhs;
+  }
+};
+
+// Hop once to `other`; hop_to stores the resumed thread's id into `*out`.
+struct hop_record_tid {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *other, std::atomic<std::thread::id> *out) -> lf::task<void, Context> {
+    co_await hop_to<Pool>{other, out};
+  }
+};
+
+// A→B→A→B→A; record the TID at each of the 5 points.
+struct round_trip {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *a, Pool *b, std::array<std::thread::id, 5> *ids)
+      -> lf::task<void, Context> {
+    (*ids)[0] = std::this_thread::get_id(); // Before any hop: wherever the task was scheduled
+    co_await hop_to<Pool>{b};
+    (*ids)[1] = std::this_thread::get_id(); // After the first hop to b
+    co_await hop_to<Pool>{a};
+    (*ids)[2] = std::this_thread::get_id(); // Back after hopping to a
+    co_await hop_to<Pool>{b};
+    (*ids)[3] = std::this_thread::get_id(); // After the second hop to b
+    co_await hop_to<Pool>{a};
+    (*ids)[4] = std::this_thread::get_id(); // After the final hop to a
+  }
+};
+
+// Hop to `other`, then fork `n_forks` fib(k) computations and sum.
+struct hop_then_fib_sum {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *other, int n_forks, std::int64_t k) -> lf::task<std::int64_t, Context> {
+    co_await hop_to<Pool>{other}; // The hop happens before any child is forked
+    std::vector<std::int64_t> results(static_cast<std::size_t>(n_forks), 0); // One slot per child
+    auto sc = co_await lf::scope();
+    for (int i = 0; i < n_forks; ++i) {
+      co_await sc.fork(&results[static_cast<std::size_t>(i)], fib_child{}, k);
+    }
+    co_await sc.join(); // `results` is only read after the join
+    std::int64_t sum = 0;
+    for (auto v : results) {
+      sum += v;
+    }
+    co_return sum;
+  }
+};
+
+// Fork first, *then* hop, then join. Stresses
+// `resume_effectively_stolen` — at suspension the parent has live
+// children on the original pool's WSQ.
+struct fib_then_hop_sum {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *other, int n_forks, std::int64_t k) -> lf::task<std::int64_t, Context> {
+    std::vector<std::int64_t> results(static_cast<std::size_t>(n_forks), 0); // One slot per child
+    auto sc = co_await lf::scope();
+    for (int i = 0; i < n_forks; ++i) {
+      co_await sc.fork(&results[static_cast<std::size_t>(i)], fib_child{}, k);
+    }
+    co_await hop_to<Pool>{other}; // Hop with the forked children still outstanding
+    co_await sc.join(); // Joined only after the hop
+    std::int64_t sum = 0;
+    for (auto v : results) {
+      sum += v;
+    }
+    co_return sum;
+  }
+};
+
+// Recursive fib that hops at every odd depth, alternating destinations.
+struct switch_fib {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, std::int64_t n, Pool *pa, Pool *pb, int depth)
+      -> lf::task<std::int64_t, Context> {
+    if (n < 2) { // Base case: no hop and no fork
+      co_return n;
+    }
+    if (depth % 2 == 1) {
+      Pool *dest = (depth / 2) % 2 == 0 ? pb : pa; // depth 1 -> pb, 3 -> pa, 5 -> pb, ...
+      co_await hop_to<Pool>{dest};
+    }
+    std::int64_t lhs = 0;
+    std::int64_t rhs = 0;
+    auto sc = co_await lf::scope();
+    co_await sc.fork(&rhs, switch_fib{}, n - 2, pa, pb, depth + 1);
+    co_await sc.call(&lhs, switch_fib{}, n - 1, pa, pb, depth + 1);
+    co_await sc.join(); // lhs and rhs are only read after the join
+    co_return lhs + rhs;
+  }
+};
+
+// A forked child that hops to `other` and then computes fib(n) sequentially via fib_ref.
+struct hop_fib_child {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *other, std::int64_t n) -> lf::task<std::int64_t, Context> {
+    co_await hop_to<Pool>{other};
+    co_return fib_ref(n);
+  }
+};
+
+struct fork_hop_fib {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *other, std::int64_t n) -> lf::task<std::int64_t, Context> {
+    std::int64_t result = 0;
+    auto sc = co_await lf::scope();
+    co_await sc.fork(&result, hop_fib_child{}, other, n); // The hop happens inside the forked child
+    co_await sc.join(); // `result` is only read after the join
+    co_return result;
+  }
+};
+
+// Alternate hops between a/b for `k` iterations (b first); after each hop the
+// resumed TID is appended to `ids` while holding `mu`.
+struct alternating_hops {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *a, Pool *b, int k, std::vector<std::thread::id> *ids, std::mutex *mu)
+      -> lf::task<void, Context> {
+    for (int i = 0; i < k; ++i) {
+      Pool *dest = (i % 2 == 0) ? b : a; // Even iterations go to b, odd ones to a
+      co_await hop_to<Pool>{dest};
+      auto lock = std::scoped_lock(*mu); // Released at the end of the iteration, before the next hop
+      ids->push_back(std::this_thread::get_id());
+    }
+  }
+};
+
+// Hop to one's own pool — tests the self-hop / single-worker path; the resumed TID goes to `*out`.
+struct self_hop {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *p, std::atomic<std::thread::id> *out) -> lf::task<void, Context> {
+    co_await hop_to<Pool>{p, out};
+  }
+};
+
+struct member_op_hop {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *other, std::atomic<std::thread::id> *out) -> lf::task<void, Context> {
+    co_await hop_member_op<Pool>{other}; // Awaits through the member operator co_await
+    out->store(std::this_thread::get_id(), std::memory_order_relaxed); // Thread running after the hop
+  }
+};
+
+struct free_op_hop {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *other, std::atomic<std::thread::id> *out) -> lf::task<void, Context> {
+    co_await hop_free_op<Pool>{other}; // Awaits through the free operator co_await
+    out->store(std::this_thread::get_id(), std::memory_order_relaxed); // Thread running after the hop
+  }
+};
+
+struct deferred_hop_task {
+  template <typename Context, typename Pool>
+  static auto
+  operator()(lf::env<Context>, Pool *other, std::atomic<std::thread::id> *out) -> lf::task<void, Context> {
+    co_await hop_deferred_post<Pool>{other}; // The handle is posted later, from a helper thread
+    out->store(std::this_thread::get_id(), std::memory_order_relaxed); // Thread running after the hop
+  }
+};
+
+// Binary-tree recursion that hops at odd levels, alternating between `b` and `a`; counts its leaves.
+struct hop_tree {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, int depth, Pool *a, Pool *b, std::atomic<int> *leaf_count)
+      -> lf::task<void, Context> {
+    if (depth == 0) { // Leaf: count it and stop recursing
+      leaf_count->fetch_add(1, std::memory_order_relaxed);
+      co_return;
+    }
+    if (depth % 2 == 1) {
+      Pool *dest = (depth / 2) % 2 == 1 ? b : a; // depth 11 -> b, 9 -> a, 7 -> b, ... so both pools are hit
+      co_await hop_to<Pool>{dest};
+    }
+    auto sc = co_await lf::scope();
+    co_await sc.fork(hop_tree{}, depth - 1, a, b, leaf_count);
+    co_await sc.call(hop_tree{}, depth - 1, a, b, leaf_count);
+    co_await sc.join(); // Both subtrees have finished once the join completes
+  }
+};
+
+#if LF_COMPILER_EXCEPTIONS
+
+struct throw_in_suspend_root {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *other) -> lf::task<void, Context> {
+    (void)other; // Unused: the awaitable below throws instead of hopping anywhere
+    co_await hop_throw_in_suspend<Pool>{};
+  }
+};
+
+struct throw_in_suspend_child {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *other) -> lf::task<void, Context> {
+    (void)other; // Unused: the awaitable below throws instead of hopping anywhere
+    co_await hop_throw_in_suspend<Pool>{}; // Same body as the root variant, but run as a forked child
+  }
+};
+
+struct fork_throw_in_suspend {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *other) -> lf::task<void, Context> {
+    auto sc = co_await lf::scope();
+    co_await sc.fork_drop(throw_in_suspend_child{}, other); // The child throws from await_suspend
+    co_await sc.join(); // The test expects the child's exception to reach the caller of get()
+  }
+};
+
+struct throw_in_resume_root {
+  template <typename Context, typename Pool>
+  static auto operator()(lf::env<Context>, Pool *other) -> lf::task<void, Context> {
+    co_await hop_throw_in_resume<Pool>{other}; // Hops to `other`, then await_resume throws
+  }
+};
+
+#endif // LF_COMPILER_EXCEPTIONS
+
+// ============================================================
+// Concept conformance helpers (compile-time only)
+//
+// These are scoped to "extra" cases that aren't exercised in
+// concepts.cpp — namely the `await_suspend` return-type and
+// noexcept-vs-throwing distinctions specific to switch_awaitable.
+// ============================================================
+
+namespace concept_checks {
+// Concrete stack/context pair against which the fixtures below are checked.
+using test_stack = lf::geometric_stack<>;
+using test_context = lf::mono_context<test_stack, lf::adapt_vector<>>;
+
+// await_ready returns a type that is *not* convertible to bool.
+struct non_bool_ready {
+  struct not_bool {};
+  auto await_ready() noexcept -> not_bool { return {}; }
+  auto await_suspend(lf::sched_handle<test_context>, test_context &) noexcept -> void {}
+  auto await_resume() noexcept -> void {}
+};
+
+// await_suspend returns non-void.
+struct non_void_suspend {
+  auto await_ready() noexcept -> bool { return false; }
+  auto await_suspend(lf::sched_handle<test_context>, test_context &) noexcept -> int { return 0; }
+  auto await_resume() noexcept -> void {}
+};
+
+// noexcept on await_suspend is *not* a requirement of `lf::awaitable`
+// (only of `nothrow_await_suspend`).
+struct throwing_suspend_awaitable {
+  auto await_ready() noexcept -> bool { return false; }
+  auto await_suspend(lf::sched_handle<test_context>, test_context &) -> void {} // Deliberately not noexcept
+  auto await_resume() noexcept -> void {}
+};
+
+} // namespace concept_checks
+
+} // namespace
+
+// ============================================================
+// Test cases
+// ============================================================
+
+// ---- 1. One-shot switch ---------------------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: one-shot", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // Learn which threads belong to which pool before running the task under test.
+      auto a_tids = discover_worker_tids(pool_a, n);
+      auto b_tids = discover_worker_tids(pool_b, n);
+      // Start on A and hop once to B; hop_to stores the id of the thread it resumed on.
+      std::atomic<std::thread::id> resumed_on;
+      auto recv = lf::schedule(pool_a, hop_record_tid{}, &pool_b, &resumed_on);
+      std::move(recv).get();
+      // The task must have resumed on a B worker and not on an A worker.
+      auto tid = resumed_on.load();
+      REQUIRE(b_tids.contains(tid));
+      REQUIRE_FALSE(a_tids.contains(tid));
+    }
+  }
+}
+
+// ---- 2. Round-trip A→B→A→B→A ---------------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: round-trip", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // Learn which threads belong to which pool before running the task under test.
+      auto a_tids = discover_worker_tids(pool_a, n);
+      auto b_tids = discover_worker_tids(pool_b, n);
+      // `ids` is written by the task and only read here after get() has returned.
+      std::array<std::thread::id, 5> ids{};
+      auto recv = lf::schedule(pool_a, round_trip{}, &pool_a, &pool_b, &ids);
+      std::move(recv).get();
+
+      // Points 0, 2, 4 are on A; 1, 3 are on B.
+      for (std::size_t i : {std::size_t{0}, std::size_t{2}, std::size_t{4}}) {
+        REQUIRE(a_tids.contains(ids[i]));
+      }
+      for (std::size_t i : {std::size_t{1}, std::size_t{3}}) {
+        REQUIRE(b_tids.contains(ids[i]));
+      }
+    }
+  }
+}
+
+// ---- 3. Switch then fork-join --------------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: switch then fork-join", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // Eight forked fib(10) children, all forked after the hop to B.
+      constexpr int n_forks = 8;
+      constexpr std::int64_t k = 10;
+      // Every child computes the same fib(k), so the expected sum is fib(k) * n_forks.
+      auto recv = lf::schedule(pool_a, hop_then_fib_sum{}, &pool_b, n_forks, k);
+      auto sum = std::move(recv).get();
+      REQUIRE(sum == fib_ref(k) * n_forks);
+    }
+  }
+}
+
+// ---- 4. Fork then switch then join ---------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: fork then switch then join", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n_workers) {
+      TestType pool_a{n_workers};
+      TestType pool_b{n_workers};
+      // Each forked child computes fib(k); the parent hops while those children are outstanding.
+      constexpr std::int64_t k = 8;
+      // Vary how many children are live at the moment of the hop.
+      for (int n : {1, 8, 64}) {
+        DYNAMIC_SECTION("n_forks=" << n) {
+          auto recv = lf::schedule(pool_a, fib_then_hop_sum{}, &pool_b, n, k);
+          auto sum = std::move(recv).get();
+          REQUIRE(sum == fib_ref(k) * n);
+        }
+      }
+    }
+  }
+}
+
+// ---- 5. Nested fork/switch/recursive-fib ---------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: nested fork/switch/join", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n_workers) {
+      TestType pool_a{n_workers};
+      TestType pool_b{n_workers};
+      // switch_fib starts at depth 0 on A and hops at odd depths; the result must match fib_ref.
+      for (std::int64_t n : {12, 16, 20}) {
+        DYNAMIC_SECTION("n=" << n) {
+          auto recv = lf::schedule(pool_a, switch_fib{}, n, &pool_a, &pool_b, 0);
+          auto result = std::move(recv).get();
+          REQUIRE(result == fib_ref(n));
+        }
+      }
+    }
+  }
+}
+
+// ---- 6. Switch inside forked child ---------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: switch inside forked child", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // The parent stays put; only its single forked child hops to B before computing fib(k).
+      constexpr std::int64_t k = 10;
+      auto recv = lf::schedule(pool_a, fork_hop_fib{}, &pool_b, k);
+      auto result = std::move(recv).get();
+      REQUIRE(result == fib_ref(k));
+    }
+  }
+}
+
+// ---- 7. Many independent multi-hop tasks (stress) ------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: independent multi-hop tasks", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // Learn which threads belong to which pool before running the tasks under test.
+      auto a_tids = discover_worker_tids(pool_a, n);
+      auto b_tids = discover_worker_tids(pool_b, n);
+      // M independent root tasks, each performing K alternating hops.
+      constexpr int M = 64;
+      constexpr int K = 8;
+      // Each task gets its own id log and its own mutex, so tasks never share state.
+      std::vector<std::vector<std::thread::id>> per_task_ids(M);
+      std::vector<std::mutex> per_task_mu(M);
+      std::vector<lf::receiver<void>> recvs;
+      recvs.reserve(M);
+      for (std::size_t i = 0; i < M; ++i) {
+        recvs.push_back(
+            lf::schedule(pool_a, alternating_hops{}, &pool_a, &pool_b, K, &per_task_ids[i], &per_task_mu[i]));
+      }
+      for (auto &r : recvs) {
+        std::move(r).get();
+      }
+
+      // Each hop must land on the addressed pool.
+      for (auto const &ids : per_task_ids) {
+        REQUIRE(ids.size() == K);
+        for (int i = 0; i < K; ++i) {
+          // i=0 → b, i=1 → a, ...
+          auto const &expect = (i % 2 == 0) ? b_tids : a_tids;
+          REQUIRE(expect.contains(ids[static_cast<std::size_t>(i)]));
+        }
+      }
+    }
+  }
+}
+
+// ---- 8. Member operator co_await -----------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: member operator co_await", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      auto b_tids = discover_worker_tids(pool_b, n);
+      // The task stores the id of the thread it runs on after the hop; that must be a B worker.
+      std::atomic<std::thread::id> resumed_on;
+      auto recv = lf::schedule(pool_a, member_op_hop{}, &pool_b, &resumed_on);
+      std::move(recv).get();
+      REQUIRE(b_tids.contains(resumed_on.load()));
+    }
+  }
+}
+
+// ---- 9. Free operator co_await -------------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: free operator co_await", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      auto b_tids = discover_worker_tids(pool_b, n);
+      // The task stores the id of the thread it runs on after the hop; that must be a B worker.
+      std::atomic<std::thread::id> resumed_on;
+      auto recv = lf::schedule(pool_a, free_op_hop{}, &pool_b, &resumed_on);
+      std::move(recv).get();
+      REQUIRE(b_tids.contains(resumed_on.load()));
+    }
+  }
+}
+
+// ---- 10. Deferred post (handle published from a foreign thread) -----------
+
+// Simulates an I/O-style awaitable: `await_suspend` returns immediately
+// without publishing the handle, then a separate thread eventually
+// posts it. Verifies that the source worker can leave switch_awaitable
+// (and possibly drain effectively-stolen tasks) before the destination
+// resumes.
+TEMPLATE_TEST_CASE("explicit-sched: posted from foreign thread", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      auto b_tids = discover_worker_tids(pool_b, n);
+      // get() returns only after the helper thread has posted the handle and the task has finished.
+      std::atomic<std::thread::id> resumed_on;
+      auto recv = lf::schedule(pool_a, deferred_hop_task{}, &pool_b, &resumed_on);
+      std::move(recv).get();
+      REQUIRE(b_tids.contains(resumed_on.load()));
+    }
+  }
+}
+
+// ---- 11. Self-hop --------------------------------------------------------
+
+// A hop into one's own pool must resume on a worker of that pool.
+// With 1 worker, the resumed TID must be the same worker.
+TEMPLATE_TEST_CASE("explicit-sched: self-hop", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool{n};
+      auto tids = discover_worker_tids(pool, n);
+      // Source and destination are the same pool; with n == 1 `tids` holds a single id.
+      std::atomic<std::thread::id> resumed_on;
+      auto recv = lf::schedule(pool, self_hop{}, &pool, &resumed_on);
+      std::move(recv).get();
+      REQUIRE(tids.contains(resumed_on.load()));
+    }
+  }
+}
+
+// ---- 12. Exception in await_suspend --------------------------------------
+
+#if LF_COMPILER_EXCEPTIONS
+
+TEMPLATE_TEST_CASE("explicit-sched: throwing await_suspend", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // In both sections the exception must surface from receiver::get() on this thread.
+      SECTION("at root") {
+        auto recv = lf::schedule(pool_a, throw_in_suspend_root{}, &pool_b);
+        REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+      }
+      // Same awaitable, but awaited by a child forked from the root task.
+      SECTION("inside forked child") {
+        auto recv = lf::schedule(pool_a, fork_throw_in_suspend{}, &pool_b);
+        REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+      }
+    }
+  }
+}
+
+// ---- 13. Exception in await_resume ---------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: throwing await_resume", "[explicit-sched]", mono_pool, poly_pool) {
+  for (auto n : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n) {
+      TestType pool_a{n};
+      TestType pool_b{n};
+      // The hop to B is performed; the exception is thrown from await_resume and must reach get().
+      auto recv = lf::schedule(pool_a, throw_in_resume_root{}, &pool_b);
+      REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+    }
+  }
+}
+
+#endif // LF_COMPILER_EXCEPTIONS
+
+// ---- 14. Concept conformance (compile-time) ------------------------------
+
+// concepts.cpp covers the basic positive case + the wrong-context-type
+// negative case. The cases below cover return-type and noexcept-vs-throwing
+// requirements specific to switch_awaitable.
+TEST_CASE("explicit-sched: concept conformance", "[explicit-sched]") {
+  using namespace concept_checks;
+  // Wrong return types: await_ready not convertible to bool, await_suspend not void.
+  STATIC_REQUIRE_FALSE(lf::awaitable<non_bool_ready, test_context>);
+  STATIC_REQUIRE_FALSE(lf::awaitable<non_void_suspend, test_context>);
+
+  // Throwing await_suspend still satisfies awaitable; only nothrow_await_suspend
+  // (used as a noexcept-spec on switch_awaitable::await_suspend) is finer.
+  STATIC_REQUIRE(lf::awaitable<throwing_suspend_awaitable, test_context>);
+}
+
+// ---- 15. Stress: hop binary tree -----------------------------------------
+
+TEMPLATE_TEST_CASE("explicit-sched: hop-binary-tree", "[explicit-sched][stress]", mono_pool, poly_pool) {
+  constexpr int depth = 12;
+  constexpr int expect_leaves = 1 << depth;
+  // hardware_concurrency() may be 0 (unknown) or 1: floor at 2 so the smallest section always runs.
+  std::size_t const hw = std::max<std::size_t>(2, std::thread::hardware_concurrency());
+  // Larger thread budgets are skipped when they exceed the hardware.
+  for (std::size_t thr : {std::size_t{2}, std::size_t{4}, std::size_t{8}}) {
+    if (thr > hw) {
+      break;
+    }
+    DYNAMIC_SECTION("threads=" << thr) {
+      std::atomic<int> leaf_count{0};
+      std::size_t const half = thr / 2 + (thr % 2);
+      TestType pool_a{half};
+      TestType pool_b{std::max<std::size_t>(thr - half, 1)}; // at least 1, and a + b stays within thr
+
+      auto recv = lf::schedule(pool_a, hop_tree{}, depth, &pool_a, &pool_b, &leaf_count);
+      std::move(recv).get();
+      REQUIRE(leaf_count.load() == expect_leaves); // A full binary tree of depth 12 has 2^12 leaves
+    }
+  }
+}
diff --git a/test/src/fold.cpp b/test/src/fold.cpp
new file mode 100644
index 000000000..4a4d035c0
--- /dev/null
+++ b/test/src/fold.cpp
@@ -0,0 +1,391 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+
+namespace {
+
+using mono_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+constexpr std::array<std::size_t, 3> k_worker_counts{1, 2, 4}; // passed to each pool's constructor
+constexpr std::array<std::size_t, 11> k_sizes{1, 2, 3, 4, 5, 6, 8, 9, 97, 1024, 4096};
+
+constexpr auto sum_0_to_n_minus_1(std::size_t n) -> std::size_t { return (n * (n - 1)) / 2; }
+
+constexpr auto sum_squares_0_to_n_minus_1(std::size_t n) -> std::size_t {
+  return n == 0 ? 0 : (n * (n - 1) * (2 * n - 1)) / 6;
+}
+
+template <typename T, typename U>
+void require_fold_result(std::optional<T> result, U expected) {
+  REQUIRE(result.has_value()); // REQUIRE aborts the test on failure, guarding the dereference below
+  REQUIRE(*result == expected);
+}
+
+struct projected_record { // element type for the pointer-to-member projection test
+  std::size_t value;
+};
+
+struct explicit_sum { // accumulator whose std::size_t constructor is explicit
+  std::size_t value;
+
+  explicit_sum() = default;
+  explicit explicit_sum(std::size_t v)
+      : value(v) {}
+};
+
+constexpr auto explicit_sum_value(std::size_t value) -> std::size_t { return value; }
+constexpr auto explicit_sum_value(explicit_sum value) -> std::size_t { return value.value; }
+
+struct explicit_sum_plus { // adds any mix of std::size_t / explicit_sum operands via explicit_sum_value
+  template <typename L, typename R>
+  auto operator()(L const &lhs, R const &rhs) const -> explicit_sum {
+    return explicit_sum{explicit_sum_value(lhs) + explicit_sum_value(rhs)};
+  }
+};
+
+} // namespace
+
+TEMPLATE_TEST_CASE("fold: iterator-pair, n=1 (no n parameter)", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+        auto recv = lf::schedule(pool, lf::fold, v.begin(), v.end(), std::plus<>{});
+
+        require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: empty iterator-pair returns nullopt", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+
+    DYNAMIC_SECTION("workers=" << n_workers) {
+      std::vector<std::size_t> v;
+
+      auto unchunked = lf::schedule(pool, lf::fold, v.begin(), v.end(), std::plus<>{});
+      REQUIRE_FALSE(std::move(unchunked).get().has_value());
+
+      auto chunked = lf::schedule(pool, lf::fold, v.begin(), v.end(), 4UZ, std::plus<>{});
+      REQUIRE_FALSE(std::move(chunked).get().has_value());
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: iterator-pair, n>1", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      for (auto ch : k_sizes) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+          auto recv = lf::schedule(pool, lf::fold, v.begin(), v.end(), ch, std::plus<>{});
+
+          require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: range + n", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      for (auto ch : k_sizes) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+          auto recv = lf::schedule(pool, lf::fold, std::span(v), ch, std::plus<>{});
+
+          require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: empty range returns nullopt", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+
+    DYNAMIC_SECTION("workers=" << n_workers) {
+      std::vector<std::size_t> v;
+
+      auto span_unchunked = lf::schedule(pool, lf::fold, std::span(v), std::plus<>{});
+      REQUIRE_FALSE(std::move(span_unchunked).get().has_value());
+
+      auto span_chunked = lf::schedule(pool, lf::fold, std::span(v), 4UZ, std::plus<>{});
+      REQUIRE_FALSE(std::move(span_chunked).get().has_value());
+
+      auto empty_view = std::views::iota(0UZ, 0UZ);
+
+      auto view_unchunked = lf::schedule(pool, lf::fold, empty_view, std::plus<>{});
+      REQUIRE_FALSE(std::move(view_unchunked).get().has_value());
+
+      auto view_chunked = lf::schedule(pool, lf::fold, empty_view, 4UZ, std::plus<>{});
+      REQUIRE_FALSE(std::move(view_chunked).get().has_value());
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: range no n (default chunk)", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+        auto recv = lf::schedule(pool, lf::fold, std::span(v), std::plus<>{});
+
+        require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: non-trivial sync projection (sum of squares)", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      for (auto ch : k_sizes) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+          auto recv =
+              lf::schedule(pool, lf::fold, std::span(v), ch, std::plus<>{}, [](std::size_t x) -> std::size_t {
+                return x * x;
+              });
+
+          require_fold_result(std::move(recv).get(), sum_squares_0_to_n_minus_1(n));
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: sync projection accepts pointer-to-member", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    std::vector<projected_record> v;
+    v.reserve(16);
+    for (auto i : std::views::iota(0UZ, 16UZ)) {
+      v.push_back(projected_record{.value = i});
+    }
+
+    auto chunked = lf::schedule(pool, lf::fold, std::span(v), 32UZ, std::plus<>{}, &projected_record::value);
+    require_fold_result(std::move(chunked).get(), sum_0_to_n_minus_1(v.size()));
+
+    auto single = lf::schedule(
+        pool, lf::fold, v.begin(), std::next(v.begin()), std::plus<>{}, &projected_record::value);
+    require_fold_result(std::move(single).get(), 0UZ);
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: sync projection can initialize explicit accumulator",
+                   "[fold]",
+                   mono_pool,
+                   poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    std::vector v{std::from_range, std::views::iota(0UZ, 16UZ)};
+
+    auto chunked = lf::schedule(pool, lf::fold, std::span(v), 32UZ, explicit_sum_plus{}, std::identity{});
+    auto chunked_result = std::move(chunked).get();
+    REQUIRE(chunked_result.has_value());
+    REQUIRE(chunked_result->value == sum_0_to_n_minus_1(v.size()));
+
+    auto single =
+        lf::schedule(pool, lf::fold, v.begin(), std::next(v.begin()), explicit_sum_plus{}, std::identity{});
+    auto single_result = std::move(single).get();
+    REQUIRE(single_result.has_value());
+    REQUIRE(single_result->value == 0UZ);
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: stateful Bop (counter increment)", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+        struct counting_plus { // Bop that records each invocation in a caller-owned atomic
+          std::atomic<std::size_t> *calls;
+          auto operator()(std::size_t a, std::size_t b) const -> std::size_t {
+            calls->fetch_add(1, std::memory_order_relaxed);
+            return a + b;
+          }
+        };
+
+        std::atomic<std::size_t> calls{0};
+
+        auto recv = lf::schedule(pool, lf::fold, std::span(v), counting_plus{&calls});
+
+        require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+        REQUIRE(calls.load() == n - 1UZ); // combining n values is expected to take exactly n - 1 calls
+      }
+    }
+  }
+}
+
+namespace {
+
+struct async_plus { // coroutine binary op: co_returns a + b
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::size_t a, std::size_t b) -> lf::task<std::size_t, Context> {
+    co_return a + b;
+  }
+};
+
+struct async_square { // coroutine projection: co_returns x * x
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::size_t x) -> lf::task<std::size_t, Context> {
+    co_return x * x;
+  }
+};
+
+} // namespace
+
+TEMPLATE_TEST_CASE("fold: async Bop — iterator-pair, n=1", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+        auto recv = lf::schedule(pool, lf::fold, v.begin(), v.end(), async_plus{});
+        require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: async Bop — range + n", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      for (auto ch : k_sizes) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+          auto recv = lf::schedule(pool, lf::fold, std::span(v), ch, async_plus{});
+          require_fold_result(std::move(recv).get(), sum_0_to_n_minus_1(n));
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: async Proj — iterator-pair, n>1", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      for (auto ch : k_sizes) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+          auto recv = lf::schedule(pool, lf::fold, v.begin(), v.end(), ch, std::plus<>{}, async_square{});
+          require_fold_result(std::move(recv).get(), sum_squares_0_to_n_minus_1(n));
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: async Bop + async Proj — range + n", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      for (auto ch : k_sizes) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+          auto recv = lf::schedule(pool, lf::fold, std::span(v), ch, async_plus{}, async_square{});
+          require_fold_result(std::move(recv).get(), sum_squares_0_to_n_minus_1(n));
+        }
+      }
+    }
+  }
+}
+
+#if LF_COMPILER_EXCEPTIONS
+
+TEMPLATE_TEST_CASE("fold: non-positive chunk throws", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n_workers) {
+
+      TestType pool{n_workers};
+
+      std::vector v{1UZ, 2UZ, 3UZ};
+      std::vector<std::size_t> empty; // a bad chunk must throw even when there is nothing to fold
+
+      auto zero_iter = lf::schedule(pool, lf::fold, v.begin(), v.end(), 0, std::plus<>{});
+      REQUIRE_THROWS_AS(std::move(zero_iter).get(), lf::fold_chunk_error);
+
+      auto negative_iter = lf::schedule(pool, lf::fold, v.begin(), v.end(), -1, std::plus<>{});
+      REQUIRE_THROWS_AS(std::move(negative_iter).get(), lf::fold_chunk_error);
+
+      auto zero_range = lf::schedule(pool, lf::fold, std::span(v), 0, std::plus<>{});
+      REQUIRE_THROWS_AS(std::move(zero_range).get(), lf::fold_chunk_error);
+
+      auto negative_range = lf::schedule(pool, lf::fold, std::span(v), -1, std::plus<>{});
+      REQUIRE_THROWS_AS(std::move(negative_range).get(), lf::fold_chunk_error);
+
+      auto zero_empty = lf::schedule(pool, lf::fold, empty.begin(), empty.end(), 0, std::plus<>{});
+      REQUIRE_THROWS_AS(std::move(zero_empty).get(), lf::fold_chunk_error);
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("fold: exception from Bop propagates", "[fold]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n_workers) {
+
+      TestType pool{n_workers};
+
+      std::vector v{std::from_range, std::ranges::iota_view(0UZ, 1024UZ)};
+
+      auto recv = lf::schedule(
+          pool, lf::fold, v.begin(), v.end(), 4UZ, [](std::size_t a, std::size_t b) -> std::size_t {
+            if (a == 500 || b == 500) {
+              throw std::runtime_error{"boom"};
+            }
+            return a + b;
+          });
+
+      REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+    }
+  }
+}
+
+#endif // LF_COMPILER_EXCEPTIONS
diff --git a/test/src/for_each.cpp b/test/src/for_each.cpp
new file mode 100644
index 000000000..b653b861d
--- /dev/null
+++ b/test/src/for_each.cpp
@@ -0,0 +1,323 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+
+namespace {
+
+using mono_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+constexpr std::array<std::size_t, 3> k_worker_counts{1, 2, 4};
+constexpr std::array<std::size_t, 12> k_sizes{0, 1, 2, 3, 4, 5, 6, 8, 9, 97, 1024, 4096};
+
+} // namespace
+
+TEMPLATE_TEST_CASE("for_each: iterator-pair, n=1 (no n parameter)", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+        auto recv = lf::schedule(pool, lf::for_each, v.begin(), v.end(), [](std::size_t &x) -> void {
+          x += 1;
+        });
+
+        std::move(recv).get();
+
+        for (std::size_t i = 0; i < v.size(); ++i) {
+          REQUIRE(v[i] == i + 1);
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: iterator-pair, n>1", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      for (auto ch : std::span(k_sizes).subspan(1)) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+          auto recv = lf::schedule(pool, lf::for_each, v.begin(), v.end(), ch, [](std::size_t &x) -> void {
+            x += 1;
+          });
+
+          std::move(recv).get();
+
+          for (std::size_t i = 0; i < v.size(); ++i) {
+            REQUIRE(v[i] == i + 1);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: range + n", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      for (auto ch : std::span(k_sizes).subspan(1)) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+          auto recv = lf::schedule(pool, lf::for_each, std::span(v), ch, [](std::size_t &x) -> void {
+            x += 1;
+          });
+
+          std::move(recv).get();
+
+          for (std::size_t i = 0; i < v.size(); ++i) {
+            REQUIRE(v[i] == i + 1);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: range no n (default chunk)", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+        auto recv = lf::schedule(pool, lf::for_each, std::span(v), [](std::size_t &x) -> void {
+          x += 1;
+        });
+
+        std::move(recv).get();
+
+        for (std::size_t i = 0; i < v.size(); ++i) {
+          REQUIRE(v[i] == i + 1);
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: exact invocation count and read-only Fn", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      for (auto ch : std::span(k_sizes).subspan(1)) { // subspan(1) drops the leading 0 from the chunk sizes
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+          std::atomic<std::size_t> count{0};
+          std::atomic<std::size_t> sum{0};
+
+          auto recv = lf::schedule(
+              pool, lf::for_each, std::span(v), ch, [&count, &sum](std::size_t const &x) -> void {
+                count.fetch_add(1, std::memory_order_relaxed);
+                sum.fetch_add(x, std::memory_order_relaxed);
+              });
+
+          std::move(recv).get();
+
+          REQUIRE(count.load() == n);
+          REQUIRE(sum.load() == (n * (n - 1)) / 2);
+
+          for (std::size_t i = 0; i < v.size(); ++i) { // Fn took x by const&, so the input must be unchanged
+            REQUIRE(v[i] == i);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: stateful Fn (captured-by-ref counter)", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+
+    TestType pool{n_workers};
+
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+
+        struct stateful {
+          std::atomic<std::size_t> *hits;
+          std::size_t tag;
+          void operator()(std::size_t &x) const {
+            hits->fetch_add(1, std::memory_order_relaxed);
+            x += tag;
+          }
+        };
+
+        std::atomic<std::size_t> hits{0};
+
+        auto recv = lf::schedule(pool, lf::for_each, std::span(v), stateful{&hits, 7});
+
+        std::move(recv).get();
+
+        REQUIRE(hits.load() == n);
+        for (std::size_t i = 0; i < v.size(); ++i) {
+          REQUIRE(v[i] == i + 7);
+        }
+      }
+    }
+  }
+}
+
+namespace {
+
+struct async_inc { // coroutine Fn: adds 1 to the element in place
+  template <typename Context>
+  static auto operator()(lf::env<Context>, std::size_t &x) -> lf::task<void, Context> {
+    x += 1;
+    co_return;
+  }
+};
+
+struct async_count_sum { // coroutine Fn: bumps *count and adds x to *sum; x itself is only read
+  std::atomic<std::size_t> *count;
+  std::atomic<std::size_t> *sum;
+
+  template <typename Context>
+  auto operator()(lf::env<Context>, std::size_t const &x) const -> lf::task<void, Context> {
+    count->fetch_add(1, std::memory_order_relaxed);
+    sum->fetch_add(x, std::memory_order_relaxed);
+    co_return;
+  }
+};
+
+} // namespace
+
+TEMPLATE_TEST_CASE("for_each: async Fn — iterator-pair, n=1", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+        auto recv = lf::schedule(pool, lf::for_each, v.begin(), v.end(), async_inc{});
+        std::move(recv).get();
+        for (std::size_t i = 0; i < v.size(); ++i) {
+          REQUIRE(v[i] == i + 1);
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: async Fn — iterator-pair, n>1", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      for (auto ch : std::span(k_sizes).subspan(1)) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+          auto recv = lf::schedule(pool, lf::for_each, v.begin(), v.end(), ch, async_inc{});
+          std::move(recv).get();
+          for (std::size_t i = 0; i < v.size(); ++i) {
+            REQUIRE(v[i] == i + 1);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: async Fn — range + n", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      for (auto ch : std::span(k_sizes).subspan(1)) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+          auto recv = lf::schedule(pool, lf::for_each, std::span(v), ch, async_inc{});
+          std::move(recv).get();
+          for (std::size_t i = 0; i < v.size(); ++i) {
+            REQUIRE(v[i] == i + 1);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: async Fn — range no n", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      DYNAMIC_SECTION("workers=" << n_workers << " len=" << n) {
+        std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+        auto recv = lf::schedule(pool, lf::for_each, std::span(v), async_inc{});
+        std::move(recv).get();
+        for (std::size_t i = 0; i < v.size(); ++i) {
+          REQUIRE(v[i] == i + 1);
+        }
+      }
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("for_each: async Fn — exact invocation count", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    TestType pool{n_workers};
+    for (auto n : k_sizes) {
+      for (auto ch : std::span(k_sizes).subspan(1)) {
+        DYNAMIC_SECTION("workers=" << n_workers << " len=" << n << " chunk=" << ch) {
+          std::vector v{std::from_range, std::ranges::iota_view(0UZ, n)};
+          std::atomic<std::size_t> count{0};
+          std::atomic<std::size_t> sum{0};
+          auto recv = lf::schedule(pool, lf::for_each, std::span(v), ch, async_count_sum{&count, &sum});
+          std::move(recv).get();
+          REQUIRE(count.load() == n);
+          REQUIRE(sum.load() == (n * (n - 1)) / 2);
+          for (std::size_t i = 0; i < v.size(); ++i) {
+            REQUIRE(v[i] == i);
+          }
+        }
+      }
+    }
+  }
+}
+
+#if LF_COMPILER_EXCEPTIONS
+
+TEMPLATE_TEST_CASE("for_each: exception from Fn propagates", "[for_each]", mono_pool, poly_pool) {
+  for (auto n_workers : k_worker_counts) {
+    DYNAMIC_SECTION("workers=" << n_workers) {
+
+      TestType pool{n_workers};
+
+      std::vector v{std::from_range, std::ranges::iota_view(0UZ, 1024UZ)};
+
+      auto recv = lf::schedule(pool, lf::for_each, v.begin(), v.end(), [](std::size_t &x) -> void {
+        if (x == 500) {
+          throw std::runtime_error{"boom"};
+        }
+      });
+
+      REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+    }
+  }
+}
+
+#endif // LF_COMPILER_EXCEPTIONS
diff --git a/test/src/lift.cpp b/test/src/lift.cpp
new file mode 100644
index 000000000..b0bd6f844
--- /dev/null
+++ b/test/src/lift.cpp
@@ -0,0 +1,334 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+
+// TODO: unify all the test pools/schedulers
+
+namespace {
+
+using mono_inline_ctx = lf::mono_context<lf::geometric_stack<>, lf::adapt_vector<>>;
+using poly_inline_ctx = lf::derived_poly_context<lf::geometric_stack<>, lf::adapt_vector<>>;
+using mono_inline_scheduler = lf::inline_scheduler<mono_inline_ctx>;
+using poly_inline_scheduler = lf::inline_scheduler<poly_inline_ctx>;
+using mono_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+template <typename Scheduler>
+[[nodiscard]]
+auto make_scheduler() -> Scheduler {
+  if constexpr (std::constructible_from<Scheduler, std::size_t>) {
+    return Scheduler{2};
+  } else {
+    return Scheduler{};
+  }
+}
+
+struct plus_fn {
+  [[nodiscard]]
+  constexpr auto operator()(int lhs, int rhs) const noexcept -> int {
+    return lhs + rhs;
+  }
+};
+
+struct add_to_atomic {
+  void operator()(std::atomic<int> *value, int delta) const noexcept {
+    value->fetch_add(delta, std::memory_order_relaxed);
+  }
+};
+
+struct add_to_atomic_return {
+  [[nodiscard]]
+  auto operator()(std::atomic<int> *value, int delta) const noexcept -> int {
+    value->fetch_add(delta, std::memory_order_relaxed);
+    return delta;
+  }
+};
+
+struct append_value {
+  void operator()(std::vector<int> &values, int value) const { values.push_back(value); }
+};
+
+struct take_unique {
+  [[nodiscard]]
+  auto operator()(std::unique_ptr<int> value) const noexcept -> int {
+    return *value;
+  }
+};
+
+struct move_only_plus {
+  std::unique_ptr<int> bias;
+
+  explicit move_only_plus(int value)
+      : bias(std::make_unique<int>(value)) {}
+
+  move_only_plus(move_only_plus &&) noexcept = default;
+  auto operator=(move_only_plus &&) noexcept -> move_only_plus & = default;
+  move_only_plus(move_only_plus const &) = delete;
+  auto operator=(move_only_plus const &) -> move_only_plus & = delete;
+
+  [[nodiscard]]
+  auto operator()(int value) && noexcept -> int {
+    return value + *bias;
+  }
+};
+
+struct multiplier {
+  int factor;
+
+  [[nodiscard]]
+  auto apply(int value) const noexcept -> int {
+    return factor * value;
+  }
+};
+
+struct run_lift_scope_ops {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<bool, Context> {
+    bool ok = true;
+    int call_result = 0;
+    int fork_result = 0;
+    int move_result = 0;
+    int member_result = 0;
+    std::atomic<int> effects{0};
+    std::vector<int> values;
+
+    auto sc = co_await lf::scope();
+
+    co_await sc.call(&call_result, lf::lift, plus_fn{}, 2, 3);
+    ok = ok && call_result == 5;
+
+    co_await sc.call(lf::lift, append_value{}, values, 7);
+    ok = ok && values == std::vector{7};
+
+    co_await sc.call_drop(lf::lift, add_to_atomic_return{}, &effects, 11);
+    ok = ok && effects.load(std::memory_order_relaxed) == 11;
+
+    co_await sc.call_drop(lf::lift, add_to_atomic{}, &effects, 13);
+    ok = ok && effects.load(std::memory_order_relaxed) == 24;
+
+    co_await sc.call(&move_result, lf::lift, take_unique{}, std::make_unique<int>(17));
+    ok = ok && move_result == 17;
+
+    co_await sc.call(&member_result, lf::lift, &multiplier::apply, multiplier{3}, 5);
+    ok = ok && member_result == 15;
+
+    co_await sc.fork(&fork_result, lf::lift, plus_fn{}, 19, 23);
+    co_await sc.fork(lf::lift, add_to_atomic{}, &effects, 29);
+    co_await sc.fork_drop(lf::lift, add_to_atomic_return{}, &effects, 31);
+    co_await sc.fork_drop(lf::lift, add_to_atomic{}, &effects, 37);
+    co_await sc.join();
+
+    ok = ok && fork_result == 42;
+    ok = ok && effects.load(std::memory_order_relaxed) == 121;
+
+    co_await sc.fork(&move_result, lf::lift, take_unique{}, std::make_unique<int>(41));
+    co_await sc.fork(&member_result, lf::lift, &multiplier::apply, multiplier{7}, 6);
+    co_await sc.fork(&call_result, lf::lift, move_only_plus{5}, 8);
+    co_await sc.join();
+
+    ok = ok && move_result == 41;
+    ok = ok && member_result == 42;
+    ok = ok && call_result == 13;
+
+    co_return ok;
+  }
+};
+
+struct run_child_lift_scope_ops {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<bool, Context> {
+    bool ok = true; // accumulates every check below; co_returned at the end
+    int call_result = 0;
+    int fork_result = 0;
+    int move_result = 0;
+    int member_result = 0;
+    std::atomic<int> effects{0};
+    std::vector<int> values;
+
+    auto sc = co_await lf::child_scope(); // sole difference from run_lift_scope_ops, which uses scope()
+
+    co_await sc.call(&call_result, lf::lift, plus_fn{}, 2, 3);
+    ok = ok && call_result == 5;
+
+    co_await sc.call(lf::lift, append_value{}, values, 7);
+    ok = ok && values == std::vector{7};
+
+    co_await sc.call_drop(lf::lift, add_to_atomic_return{}, &effects, 11);
+    ok = ok && effects.load(std::memory_order_relaxed) == 11;
+
+    co_await sc.call_drop(lf::lift, add_to_atomic{}, &effects, 13);
+    ok = ok && effects.load(std::memory_order_relaxed) == 24;
+
+    co_await sc.call(&move_result, lf::lift, take_unique{}, std::make_unique<int>(17));
+    ok = ok && move_result == 17;
+
+    co_await sc.call(&member_result, lf::lift, &multiplier::apply, multiplier{3}, 5);
+    ok = ok && member_result == 15;
+
+    co_await sc.fork(&fork_result, lf::lift, plus_fn{}, 19, 23);
+    co_await sc.fork(lf::lift, add_to_atomic{}, &effects, 29);
+    co_await sc.fork_drop(lf::lift, add_to_atomic_return{}, &effects, 31);
+    co_await sc.fork_drop(lf::lift, add_to_atomic{}, &effects, 37);
+    co_await sc.join();
+
+    ok = ok && fork_result == 42;
+    ok = ok && effects.load(std::memory_order_relaxed) == 121; // 24 + 29 + 31 + 37
+
+    co_await sc.fork(&move_result, lf::lift, take_unique{}, std::make_unique<int>(41));
+    co_await sc.fork(&member_result, lf::lift, &multiplier::apply, multiplier{7}, 6);
+    co_await sc.fork(&call_result, lf::lift, move_only_plus{5}, 8);
+    co_await sc.join();
+
+    ok = ok && move_result == 41;
+    ok = ok && member_result == 42;
+    ok = ok && call_result == 13;
+
+    co_return ok;
+  }
+};
+
+struct run_cancelled_child_lift_scope_ops {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<bool, Context> {
+    int call_result = 101; // sentinel: must survive unchanged
+    int fork_result = 202; // sentinel: must survive unchanged
+    std::atomic<int> effects{0};
+
+    auto sc = co_await lf::child_scope();
+    sc.request_stop(); // stop first: none of the calls/forks below may write a result or an effect
+
+    co_await sc.call(&call_result, lf::lift, add_to_atomic_return{}, &effects, 1);
+    co_await sc.call(lf::lift, add_to_atomic{}, &effects, 2);
+    co_await sc.call_drop(lf::lift, add_to_atomic_return{}, &effects, 4);
+    co_await sc.call_drop(lf::lift, add_to_atomic{}, &effects, 8);
+
+    co_await sc.fork(&fork_result, lf::lift, add_to_atomic_return{}, &effects, 16);
+    co_await sc.fork(lf::lift, add_to_atomic{}, &effects, 32);
+    co_await sc.fork_drop(lf::lift, add_to_atomic_return{}, &effects, 64);
+    co_await sc.fork_drop(lf::lift, add_to_atomic{}, &effects, 128);
+    co_await sc.join();
+
+    co_return call_result == 101 && fork_result == 202 && effects.load(std::memory_order_relaxed) == 0;
+  }
+};
+
+#if LF_COMPILER_EXCEPTIONS
+
+struct throwing_sync {
+  [[noreturn]]
+  auto operator()() const -> int {
+    LF_THROW(std::runtime_error{"lift"});
+  }
+};
+
+struct run_call_lift_exception {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<void, Context> {
+    int result = 0;
+    auto sc = co_await lf::scope();
+    co_await sc.call(&result, lf::lift, throwing_sync{});
+    co_await sc.join();
+    co_return;
+  }
+};
+
+struct run_fork_lift_exception {
+  template <typename Context>
+  static auto operator()(lf::env<Context>) -> lf::task<void, Context> {
+    int result = 0;
+    auto sc = co_await lf::scope();
+    co_await sc.fork(&result, lf::lift, throwing_sync{});
+    co_await sc.join();
+    co_return;
+  }
+};
+
+#endif // LF_COMPILER_EXCEPTIONS
+
+template <typename Scheduler>
+void check_lift_scope_ops(Scheduler &scheduler) {
+  auto scope_recv = lf::schedule(scheduler, run_lift_scope_ops{});
+  REQUIRE(std::move(scope_recv).get());
+
+  auto child_scope_recv = lf::schedule(scheduler, run_child_lift_scope_ops{});
+  REQUIRE(std::move(child_scope_recv).get());
+
+  auto cancelled_recv = lf::schedule(scheduler, run_cancelled_child_lift_scope_ops{});
+  REQUIRE(std::move(cancelled_recv).get());
+}
+
+} // namespace
+
+TEST_CASE("lift: concept conformance", "[lift]") {
+  STATIC_REQUIRE(lf::async_invocable<decltype(lf::lift), mono_inline_ctx, plus_fn, int, int>);
+  STATIC_REQUIRE(lf::async_invocable_to<decltype(lf::lift), int, mono_inline_ctx, plus_fn, int, int>);
+  STATIC_REQUIRE(lf::async_invocable_to<decltype(lf::lift),
+                                        void,
+                                        mono_inline_ctx,
+                                        add_to_atomic,
+                                        std::atomic<int> *,
+                                        int>);
+  STATIC_REQUIRE(
+      lf::async_invocable_to<decltype(lf::lift), int, mono_inline_ctx, take_unique, std::unique_ptr<int>>);
+  STATIC_REQUIRE(lf::async_invocable_to<decltype(lf::lift), int, mono_inline_ctx, move_only_plus, int>);
+  STATIC_REQUIRE(lf::async_invocable_to<decltype(lf::lift),
+                                        int,
+                                        mono_inline_ctx,
+                                        int (multiplier::*)(int) const,
+                                        multiplier,
+                                        int>);
+
+  STATIC_REQUIRE_FALSE(lf::async_invocable<plus_fn, mono_inline_ctx, int, int>);
+  STATIC_REQUIRE_FALSE(lf::async_invocable_to<decltype(lf::lift),
+                                              int,
+                                              mono_inline_ctx,
+                                              add_to_atomic,
+                                              std::atomic<int> *,
+                                              int>);
+}
+
+TEMPLATE_TEST_CASE(
+    "lift: direct schedule", "[lift]", mono_inline_scheduler, poly_inline_scheduler, mono_pool, poly_pool) {
+  auto scheduler = make_scheduler<TestType>();
+
+  auto value_recv = lf::schedule(scheduler, lf::lift, plus_fn{}, 20, 22);
+  REQUIRE(std::move(value_recv).get() == 42);
+
+  std::atomic<int> effect{0};
+  auto void_recv = lf::schedule(scheduler, lf::lift, add_to_atomic{}, &effect, 17);
+  std::move(void_recv).get();
+  REQUIRE(effect.load(std::memory_order_relaxed) == 17);
+
+  auto move_recv = lf::schedule(scheduler, lf::lift, take_unique{}, std::make_unique<int>(29));
+  REQUIRE(std::move(move_recv).get() == 29);
+}
+
+TEMPLATE_TEST_CASE(
+    "lift: scope operations", "[lift]", mono_inline_scheduler, poly_inline_scheduler, mono_pool, poly_pool) {
+  auto scheduler = make_scheduler<TestType>();
+  check_lift_scope_ops(scheduler);
+}
+
+#if LF_COMPILER_EXCEPTIONS
+
+TEMPLATE_TEST_CASE("lift: exceptions propagate",
+                   "[lift]",
+                   mono_inline_scheduler,
+                   poly_inline_scheduler,
+                   mono_pool,
+                   poly_pool) {
+  auto scheduler = make_scheduler<TestType>();
+
+  auto call_recv = lf::schedule(scheduler, run_call_lift_exception{});
+  REQUIRE_THROWS_AS(std::move(call_recv).get(), std::runtime_error);
+
+  auto fork_recv = lf::schedule(scheduler, run_fork_lift_exception{});
+  REQUIRE_THROWS_AS(std::move(fork_recv).get(), std::runtime_error);
+}
+
+#endif // LF_COMPILER_EXCEPTIONS
diff --git a/test/src/projected.cpp b/test/src/projected.cpp
new file mode 100644
index 000000000..6eafa2cf6
--- /dev/null
+++ b/test/src/projected.cpp
@@ -0,0 +1,275 @@
+#include <catch2/catch_test_macros.hpp>
+
+import std;
+
+import libfork;
+
+namespace {
+
+using mono_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using context_t = mono_pool::context_type;
+
+using vec_iter = std::vector<int>::iterator;
+using cvec_iter = std::vector<int>::const_iterator;
+
+// ============================================================================
+// Projection function objects (I -> R)
+// ============================================================================
+
+struct sync_proj {
+  auto operator()(int const &) const -> double;
+};
+
+struct async_proj {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, int const &) -> lf::task<double, Context>;
+};
+
+// Both sync- and async-invocable; sync wins the for_each `if constexpr` branch
+// and the disjunction is satisfied either way.
+struct hybrid_proj {
+  auto operator()(int const &) const -> double;
+  template <typename Context>
+  static auto operator()(lf::env<Context>, int const &) -> lf::task<double, Context>;
+};
+
+// Reference-returning sync projection — value_type strips cvref, deref keeps it.
+// (The async equivalent is not testable: `task<T, ...>` requires `returnable<T>`,
+// which forbids references.)
+struct sync_ref_proj {
+  auto operator()(int &) const -> int &;
+};
+
+// Returns different types depending on lvalue vs rvalue invocation.
+// `std::projected` and (correctly) `lf::projected` invoke through `Proj &`, so
+// `value_type` and `iter_reference_t` must both be `double`. If the impl uses
+// `Proj` (rvalue) instead, `value_type` becomes `int`.
+struct dual_qualified_proj {
+  auto operator()(int const &) & -> double;
+  auto operator()(int const &) && -> int;
+};
+
+// Both sync- and async-invocable, with *different* return types.
+// `invoke_result`'s constrained partial specialization picks async, so the
+// projected's `value_type` must be `double` (the async result), not `long`.
+struct dual_mode_proj {
+  auto operator()(int const &) const -> long;
+  template <typename Context>
+  static auto operator()(lf::env<Context>, int const &) -> lf::task<double, Context>;
+};
+
+// double -> std::string, used for nesting tests.
+struct str_proj {
+  auto operator()(double const &) const -> std::string;
+};
+
+struct async_str_proj {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, double const &) -> lf::task<std::string, Context>;
+};
+
+// ============================================================================
+// Consumer function objects (for `indirectly_unary_invocable`)
+// ============================================================================
+
+struct sync_fn {
+  void operator()(int &) const;
+};
+
+struct async_fn {
+  template <typename Context>
+  static auto operator()(lf::env<Context>, int &) -> lf::task<void, Context>;
+};
+
+struct hybrid_fn {
+  void operator()(int &) const;
+  template <typename Context>
+  static auto operator()(lf::env<Context>, int &) -> lf::task<void, Context>;
+};
+
+struct not_invocable {};
+
+struct binary_fn {
+  void operator()(int &, int &) const;
+};
+
+struct wrong_arg_fn {
+  void operator()(double &) const;
+};
+
+struct async_fn_no_copy {
+  async_fn_no_copy() = default;
+  async_fn_no_copy(async_fn_no_copy const &) = delete;
+  async_fn_no_copy(async_fn_no_copy &&) = default;
+  template <typename Context>
+  static auto operator()(lf::env<Context>, int &) -> lf::task<void, Context>;
+};
+
+struct bad_ctx {};
+
+struct readable_only {
+  using value_type = int;
+  auto operator*() const -> int &;
+};
+
+template <typename T>
+concept has_difference_type = requires { typename T::difference_type; };
+
+// ============================================================================
+// Type aliases under test
+// ============================================================================
+
+template <typename I, typename Fn, typename Context>
+using projected = lf::projected<Context, I, Fn>;
+
+using sync_projected = projected<vec_iter, sync_proj, context_t>;
+using async_projected = projected<vec_iter, async_proj, context_t>;
+using hybrid_projected = projected<vec_iter, hybrid_proj, context_t>;
+using sync_cprojected = projected<cvec_iter, sync_proj, context_t>;
+using sync_ref_projected = projected<vec_iter, sync_ref_proj, context_t>;
+using projected_no_diff = projected<readable_only, sync_proj, context_t>;
+using dual_qualified_projected = projected<vec_iter, dual_qualified_proj, context_t>;
+using dual_mode_projected = projected<vec_iter, dual_mode_proj, context_t>;
+
+using sync_then_sync = projected<sync_projected, str_proj, context_t>;
+using const_sync_then_sync = projected<sync_projected const, str_proj, context_t>;
+using async_then_async = projected<async_projected, async_str_proj, context_t>;
+using sync_then_async = projected<sync_projected, async_str_proj, context_t>;
+using async_then_sync = projected<async_projected, str_proj, context_t>;
+
+} // namespace
+
+TEST_CASE("projected: value_type, difference_type, dereference", "[projected]") {
+  STATIC_REQUIRE(std::same_as<sync_projected::value_type, double>);
+  STATIC_REQUIRE(std::same_as<async_projected::value_type, double>);
+  STATIC_REQUIRE(std::same_as<hybrid_projected::value_type, double>);
+
+  STATIC_REQUIRE(std::same_as<sync_projected::difference_type, std::iter_difference_t<vec_iter>>);
+  STATIC_REQUIRE(std::same_as<async_projected::difference_type, std::iter_difference_t<vec_iter>>);
+  STATIC_REQUIRE(std::same_as<hybrid_projected::difference_type, std::iter_difference_t<vec_iter>>);
+
+  STATIC_REQUIRE(std::indirectly_readable<sync_projected>);
+  STATIC_REQUIRE(std::indirectly_readable<async_projected>);
+  STATIC_REQUIRE(std::indirectly_readable<hybrid_projected>);
+
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<sync_projected>, double>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<async_projected>, double>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<hybrid_projected>, double>);
+
+  STATIC_REQUIRE(std::same_as<std::iter_value_t<sync_projected>, double>);
+  STATIC_REQUIRE(std::same_as<std::iter_value_t<async_projected>, double>);
+}
+
+TEST_CASE("projected: const-iterator source", "[projected]") {
+  STATIC_REQUIRE(std::same_as<sync_cprojected::value_type, double>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<sync_cprojected>, double>);
+}
+
+TEST_CASE("projected: reference-returning projection", "[projected]") {
+  STATIC_REQUIRE(std::same_as<sync_ref_projected::value_type, int>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<sync_ref_projected>, int &>);
+}
+
+TEST_CASE("projected: invokes through Proj& (matches std::projected)", "[projected]") {
+  // `operator() &` returns double; `operator() &&` returns int. Both
+  // `std::projected::value_type` and `operator*()` are computed via lvalue
+  // invocation, so the result must be `double` here.
+  STATIC_REQUIRE(std::same_as<dual_qualified_projected::value_type, double>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<dual_qualified_projected>, double>);
+}
+
+TEST_CASE("projected: async invocation takes precedence over sync", "[projected]") {
+  // dual_mode_proj is invocable both ways: sync returns `long`, async returns `double`.
+  // `invoke_result`'s constrained partial specialization (gated on async_invocable)
+  // is more constrained than the primary, so the async branch wins.
+  STATIC_REQUIRE(std::indirectly_unary_invocable<dual_mode_proj, vec_iter>);
+  STATIC_REQUIRE(lf::async::indirectly_unary_invocable<dual_mode_proj, context_t, vec_iter>);
+
+  STATIC_REQUIRE(std::same_as<dual_mode_projected::value_type, double>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<dual_mode_projected>, double>);
+}
+
+TEST_CASE("projected: difference_type only when source is weakly_incrementable", "[projected]") {
+  STATIC_REQUIRE(std::indirectly_readable<readable_only>);
+  STATIC_REQUIRE_FALSE(std::weakly_incrementable<readable_only>);
+
+  STATIC_REQUIRE(std::same_as<projected_no_diff::value_type, double>);
+  STATIC_REQUIRE_FALSE(has_difference_type<projected_no_diff>);
+  STATIC_REQUIRE(has_difference_type<sync_projected>);
+  STATIC_REQUIRE(has_difference_type<async_projected>);
+}
+
+TEST_CASE("indirectly_unary_invocable: sync, async, hybrid", "[projected]") {
+  STATIC_REQUIRE(lf::indirectly_unary_invocable<sync_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::indirectly_unary_invocable<async_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::indirectly_unary_invocable<hybrid_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::indirectly_regular_unary_invocable<sync_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::indirectly_regular_unary_invocable<async_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::indirectly_regular_unary_invocable<hybrid_fn, context_t, vec_iter>);
+
+  STATIC_REQUIRE_FALSE(lf::indirectly_unary_invocable<not_invocable, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_unary_invocable<binary_fn, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_unary_invocable<wrong_arg_fn, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_regular_unary_invocable<not_invocable, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_regular_unary_invocable<binary_fn, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_regular_unary_invocable<wrong_arg_fn, context_t, vec_iter>);
+}
+
+TEST_CASE("indirectly_unary_invocable: sync branch ignores Context", "[projected]") {
+  // The sync side of the disjunction does not depend on Context, so a non-worker
+  // context still validates a plain sync function but rejects an async-only one.
+  STATIC_REQUIRE_FALSE(lf::worker_context<bad_ctx>);
+  STATIC_REQUIRE(lf::indirectly_unary_invocable<sync_fn, bad_ctx, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_unary_invocable<async_fn, bad_ctx, vec_iter>);
+}
+
+TEST_CASE("sync::indirectly_unary_invocable: sync-only variant", "[projected]") {
+  STATIC_REQUIRE(lf::sync::indirectly_unary_invocable<sync_fn, vec_iter>);
+  STATIC_REQUIRE(lf::sync::indirectly_regular_unary_invocable<sync_fn, vec_iter>);
+
+  STATIC_REQUIRE_FALSE(lf::sync::indirectly_unary_invocable<async_fn, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirectly_regular_unary_invocable<async_fn, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirectly_unary_invocable<not_invocable, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::sync::indirectly_regular_unary_invocable<not_invocable, vec_iter>);
+}
+
+TEST_CASE("async::indirectly_unary_invocable: async-only variant", "[projected]") {
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_unary_invocable<sync_fn, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_regular_unary_invocable<sync_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::async::indirectly_unary_invocable<async_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::async::indirectly_regular_unary_invocable<async_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::async::indirectly_unary_invocable<hybrid_fn, context_t, vec_iter>);
+  STATIC_REQUIRE(lf::async::indirectly_regular_unary_invocable<hybrid_fn, context_t, vec_iter>);
+
+  // copy_constructible<Fn> required by both branches.
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_unary_invocable<async_fn_no_copy, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_regular_unary_invocable<async_fn_no_copy, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_unary_invocable<async_fn_no_copy, context_t, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::indirectly_regular_unary_invocable<async_fn_no_copy, context_t, vec_iter>);
+
+  // indirectly_readable<I> required.
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_unary_invocable<async_fn, context_t, int>);
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_regular_unary_invocable<async_fn, context_t, int>);
+
+  // worker_context<Context> required.
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_unary_invocable<async_fn, bad_ctx, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_regular_unary_invocable<async_fn, bad_ctx, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_unary_invocable<hybrid_fn, bad_ctx, vec_iter>);
+  STATIC_REQUIRE_FALSE(lf::async::indirectly_regular_unary_invocable<hybrid_fn, bad_ctx, vec_iter>);
+}
+
+TEST_CASE("projected: pipelined / nested projection", "[projected]") {
+  // Validates that `indirect_value`'s specialization for projected types kicks in;
+  // otherwise the outer projection couldn't satisfy the concept.
+  STATIC_REQUIRE(std::same_as<sync_then_sync::value_type, std::string>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<sync_then_sync>, std::string>);
+
+  STATIC_REQUIRE(std::same_as<const_sync_then_sync::value_type, std::string>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<const_sync_then_sync>, std::string>);
+
+  STATIC_REQUIRE(std::same_as<async_then_async::value_type, std::string>);
+  STATIC_REQUIRE(std::same_as<std::iter_reference_t<async_then_async>, std::string>);
+
+  STATIC_REQUIRE(std::same_as<sync_then_async::value_type, std::string>);
+  STATIC_REQUIRE(std::same_as<async_then_sync::value_type, std::string>);
+}
diff --git a/test/src/schedule.cpp b/test/src/schedule.cpp
new file mode 100644
index 000000000..6476a3a3a
--- /dev/null
+++ b/test/src/schedule.cpp
@@ -0,0 +1,122 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+
+namespace {
+
+using lf::env;
+using lf::task;
+
+template <typename Context>
+auto simple_function(env<Context> /*unused*/) -> task<bool, Context> {
+  co_return true;
+}
+
+template <typename Context>
+auto void_function(env<Context> /*unused*/) -> task<void, Context> {
+  co_return;
+}
+
+template <typename Context>
+auto throwing_function(env<Context> /*unused*/) -> task<void, Context> {
+  LF_THROW(std::runtime_error{"This function always throws"});
+  co_return;
+}
+
+// Task whose argument is a big enough value-type to push the root coroutine
+// frame past the 1 KiB embedded buffer in hidden_receiver_state.
+template <typename Context>
+auto big_arg_function(env<Context> /*unused*/, std::array<std::byte, 2048> /*unused*/)
+    -> task<void, Context> {
+  co_return;
+}
+
+template <typename Sch>
+void simple_tests(Sch &scheduler) {
+  SECTION("void") {
+    auto recv = schedule(scheduler, void_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    std::move(recv).get();
+  }
+
+  SECTION("non-void") {
+    auto recv = schedule(scheduler, simple_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get() == true);
+  }
+
+  SECTION("explicit recv_state") {
+    lf::recv_state<bool, false> state;
+    auto recv = schedule(scheduler, std::move(state), simple_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get() == true);
+  }
+
+  SECTION("stoppable recv_state") {
+    lf::recv_state<bool, true> state;
+    auto recv = schedule(scheduler, std::move(state), simple_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get() == true);
+  }
+
+  SECTION("recv_state with explicit allocator") {
+    std::allocator<std::byte> alloc;
+    lf::recv_state<bool, false> state{std::allocator_arg, alloc};
+    auto recv = schedule(scheduler, std::move(state), simple_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get() == true);
+  }
+
+  SECTION("recv_state with value-init") {
+    // Pre-initialise the return slot; the task overwrites it.
+    lf::recv_state<bool, false> state{false};
+    auto recv = schedule(scheduler, std::move(state), simple_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    REQUIRE(std::move(recv).get() == true);
+  }
+
+#if LF_COMPILER_EXCEPTIONS
+  SECTION("throwing") {
+    auto recv = schedule(scheduler, throwing_function<lf::context_t<Sch>>);
+    REQUIRE(recv.valid());
+    REQUIRE_THROWS_AS(std::move(recv).get(), std::runtime_error);
+  }
+
+  SECTION("frame too large -> root_alloc_error") {
+    std::array<std::byte, 2048> big{};
+    REQUIRE_THROWS_AS(schedule(scheduler, big_arg_function<lf::context_t<Sch>>, big), lf::root_alloc_error);
+  }
+#endif
+}
+
+using mono_inline_ctx = lf::mono_context<lf::geometric_stack<>, lf::adapt_vector<>>;
+using poly_inline_ctx = lf::derived_poly_context<lf::geometric_stack<>, lf::adapt_vector<>>;
+
+} // namespace
+
+TEMPLATE_TEST_CASE("Inline schedule", "[schedule]", mono_inline_ctx, poly_inline_ctx) {
+  lf::inline_scheduler<TestType> scheduler;
+  simple_tests(scheduler);
+}
+
+namespace {
+
+using mono_busy_thread_pool = lf::mono_busy_pool<lf::geometric_stack<>>;
+using poly_busy_thread_pool = lf::poly_busy_pool<lf::geometric_stack<>>;
+
+} // namespace
+
+TEMPLATE_TEST_CASE("Busy schedule", "[schedule]", mono_busy_thread_pool, poly_busy_thread_pool) {
+  // Each pool type must model the scheduler concept before it is exercised.
+  STATIC_REQUIRE(lf::scheduler<TestType>);
+  // Repeat the shared battery on pools built with 1, 2 and 3 as the constructor argument.
+  for (std::size_t workers = 1; workers <= 3; ++workers) {
+    TestType pool{workers};
+    simple_tests(pool);
+  }
+}
diff --git a/test/src/stack.cpp b/test/src/stack.cpp
new file mode 100644
index 000000000..674758c08
--- /dev/null
+++ b/test/src/stack.cpp
@@ -0,0 +1,354 @@
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/__impl/exception.hpp"
+
+import std;
+
+import libfork;
+import libfork.utils;
+
+using lf::adaptor_stack;
+using lf::geometric_stack;
+using lf::k_new_align;
+using lf::slab_stack;
+using lf::worker_stack;
+
+namespace {
+
+auto not_constexpr() {}
+
+} // namespace
+
+#define expect(expr)                                                                                         \
+  if consteval {                                                                                             \
+    if (!(expr)) {                                                                                           \
+      not_constexpr();                                                                                       \
+    }                                                                                                        \
+  } else {                                                                                                   \
+    REQUIRE(expr);                                                                                           \
+  }
+
+#define TEST_CONSTEXPR(...)                                                                                  \
+  constexpr auto impl = __VA_ARGS__;                                                                         \
+  REQUIRE(impl());                                                                                           \
+  STATIC_REQUIRE(impl())
+
+namespace {
+
+constexpr void check_alignment(void *ptr) {
+
+  expect(ptr != nullptr);
+
+  if !consteval {
+    REQUIRE(lf::is_sufficiently_aligned<k_new_align>(ptr));
+  }
+}
+
+constexpr void check_empty(auto const &stack) {
+  if constexpr (requires { stack.empty(); }) {
+    expect(stack.empty());
+  }
+}
+
+constexpr void check_non_empty(auto const &stack) {
+  if constexpr (requires { stack.empty(); }) {
+    expect(!stack.empty());
+  }
+}
+
+} // namespace
+
+// Stack types that may hit slab_stack's fixed capacity need exception support
+// to signal overflow. Under -fno-exceptions, drop slab_stack from those tests.
+#if LF_COMPILER_EXCEPTIONS
+  #define STACK_TYPES_ALL geometric_stack<>, adaptor_stack<>, slab_stack<>
+#else
+  #define STACK_TYPES_ALL geometric_stack<>, adaptor_stack<>
+#endif
+
+TEMPLATE_TEST_CASE("Concept", "[stacks]", geometric_stack<>, adaptor_stack<>, slab_stack<>) {
+  STATIC_REQUIRE(worker_stack<TestType>); //
+}
+
+TEMPLATE_TEST_CASE("Basic push and pop", "[stacks]", geometric_stack<>, adaptor_stack<>, slab_stack<>) {
+  TEST_CONSTEXPR([]() -> bool {
+    TestType stack;
+    check_empty(stack);
+
+    void *p1 = stack.push(10);
+    check_alignment(p1);
+    check_non_empty(stack);
+
+    void *p2 = stack.push(20);
+    check_alignment(p2);
+    expect(p2 != p1);
+    check_non_empty(stack);
+
+    // Pop in FILO order
+    stack.pop(p2, 20);
+    stack.pop(p1, 10);
+    check_empty(stack);
+
+    return true;
+  });
+}
+
+TEMPLATE_TEST_CASE("Ckpt/Acquire/Release", "[stacks]", geometric_stack<>, adaptor_stack<>, slab_stack<>) {
+  TEST_CONSTEXPR([]() -> bool {
+    TestType stack1;
+    void *p1 = stack1.push(100);
+    auto cp1 = stack1.checkpoint();
+
+    TestType stack2;
+    auto cp2 = stack2.checkpoint();
+
+    using C = decltype(cp1);
+
+    expect(((cp1 == C{} && cp2 == C{}) || cp1 != cp2));
+
+    auto key1 = stack1.prepare_release();
+    stack2.acquire(cp1);
+    stack1.release(key1);
+    expect(stack2.checkpoint() == cp1);
+    stack2.pop(p1, 100);
+
+    return true;
+  });
+}
+
+TEMPLATE_TEST_CASE("Single pass", "[stacks]", STACK_TYPES_ALL) {
+  for (int k = 0; k < 10; ++k) {
+    // Seed non-deterministically, but log the seed so that a failing run can be replayed.
+    auto const seed = std::random_device{}();
+    INFO("rng seed: " << seed);
+    TestType stack;
+    std::mt19937_64 rng{seed};
+    std::uniform_int_distribution<std::size_t> size_dist{1, 200};
+    std::uniform_int_distribution<std::size_t> depth_dist{5, 5000};
+    struct entry {
+      void *ptr;
+      std::size_t size;
+    };
+    // Perform several rounds of deep push/pop sequences
+    for (int i = 0; i < 2; ++i) {
+      std::vector<entry> entries;
+      const std::size_t depth = depth_dist(rng);
+
+      // Push phase — break early if slab_stack exhausts its fixed capacity
+      for (std::size_t j = 0; j < depth; ++j) {
+        std::size_t s = size_dist(rng);
+        void *p = nullptr;
+        LF_TRY {
+          p = stack.push(s);
+        } LF_CATCH(std::bad_alloc const &) {
+          break;
+        }
+        check_alignment(p);
+        entries.push_back({.ptr = p, .size = s});
+      }
+
+      // Pop phase (FILO) — use entries.size() in case push exited early
+      for (std::size_t j = entries.size(); j > 0; --j) {
+        auto const &e = entries[j - 1];
+        stack.pop(e.ptr, e.size);
+      }
+
+      check_empty(stack);
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE("Randomized push/pop", "[stacks]", STACK_TYPES_ALL) {
+  // Seed non-deterministically, but log the seed so that a failing run can be replayed.
+  auto const seed = std::random_device{}();
+  INFO("rng seed: " << seed);
+  TestType stack;
+  std::mt19937_64 rng{seed};
+  std::bernoulli_distribution push_dist{0.51};
+  std::uniform_int_distribution<std::size_t> size_dist{1, 512};
+
+  struct entry {
+    void *ptr;
+    std::size_t size;
+  };
+  std::vector<entry> entries;
+  std::size_t total_pushed = 0;
+  const std::size_t target_pushed = 10'000'000;
+
+  while (total_pushed < target_pushed) {
+    if (entries.empty()) {
+      check_empty(stack);
+    }
+    if (entries.empty() || push_dist(rng)) {
+      std::size_t s = size_dist(rng);
+      void *p = nullptr;
+      LF_TRY {
+        p = stack.push(s);
+      } LF_CATCH(std::bad_alloc const &) {
+        // slab_stack exhausted; clean up and finish
+        break;
+      }
+      check_alignment(p);
+      entries.push_back({.ptr = p, .size = s});
+      total_pushed++;
+    } else {
+      auto e = entries.back();
+      stack.pop(e.ptr, e.size);
+      entries.pop_back();
+    }
+  }
+
+  // Clean up remaining entries
+  while (!entries.empty()) {
+    auto e = entries.back();
+    stack.pop(e.ptr, e.size);
+    entries.pop_back();
+  }
+  check_empty(stack);
+}
+
+TEMPLATE_TEST_CASE("Spikey randomized push/pop", "[stacks]", STACK_TYPES_ALL) {
+  // Seed non-deterministically, but log the seed so that a failing run can be replayed.
+  auto const seed = std::random_device{}();
+  INFO("rng seed: " << seed);
+  TestType stack;
+  std::mt19937_64 rng{seed};
+
+  // Higher probability of push after push, higher probability of pop after pop
+  std::bernoulli_distribution push_after_push{0.95};
+  std::bernoulli_distribution push_after_pop{0.1};
+  std::uniform_int_distribution<std::size_t> size_dist{1, 512};
+
+  struct entry {
+    void *ptr;
+    std::size_t size;
+  };
+  std::vector<entry> entries;
+  std::size_t total_pushed = 0;
+  const std::size_t target_pushed = 10'000'000;
+  bool last_was_push = true;
+
+  while (total_pushed < target_pushed) {
+    bool do_push = true; // an empty stack can only push; otherwise the branches below decide
+    if (entries.empty()) {
+      check_empty(stack);
+    } else if (last_was_push) {
+      do_push = push_after_push(rng);
+    } else {
+      do_push = push_after_pop(rng);
+    }
+
+    if (do_push) {
+      std::size_t s = size_dist(rng);
+      void *p = nullptr;
+      LF_TRY {
+        p = stack.push(s);
+      } LF_CATCH(std::bad_alloc const &) {
+        // slab_stack exhausted; clean up and finish
+        break;
+      }
+      check_alignment(p);
+      entries.push_back({.ptr = p, .size = s});
+      total_pushed++;
+      last_was_push = true;
+    } else {
+      auto e = entries.back();
+      stack.pop(e.ptr, e.size);
+      entries.pop_back();
+      last_was_push = false;
+    }
+  }
+
+  // Clean up remaining entries
+  while (!entries.empty()) {
+    auto e = entries.back();
+    stack.pop(e.ptr, e.size);
+    entries.pop_back();
+  }
+  check_empty(stack);
+}
+
+// ---- slab_stack specific ----
+//
+// Tests that exercise behaviour unique to slab_stack's fixed-size design.
+
+#if LF_COMPILER_EXCEPTIONS
+TEST_CASE("slab_stack - throws when full", "[stacks]") {
+  // Use a tiny slab (2 usable nodes) to exercise the overflow path precisely.
+  lf::slab_stack<> tiny(2);
+  // Two pushes consume the whole slab, so the third one has to fail.
+  void *first = tiny.push(k_new_align);
+  void *second = tiny.push(k_new_align);
+  REQUIRE_THROWS_AS(tiny.push(k_new_align), std::bad_alloc);
+  // The failed push must leave the stack usable: unwind in FILO order.
+  tiny.pop(second, k_new_align);
+  tiny.pop(first, k_new_align);
+  check_empty(tiny);
+}
+#endif
+
+TEST_CASE("slab_stack - single pass", "[stacks]") {
+  for (int k = 0; k < 10; ++k) {
+    // Slab sized to hold the worst-case live footprint without early exit, with headroom:
+    // depth_max (5000) * roundup(size_max (200), k_new_align=16) / k_new_align = 5000 * 13 = 65 000 nodes.
+    lf::slab_stack<> stack(70'000);
+    auto const seed = std::random_device{}(); // logged so that a failing run can be replayed
+    INFO("rng seed: " << seed);
+    std::mt19937_64 rng{seed};
+    std::uniform_int_distribution<std::size_t> size_dist{1, 200};
+    std::uniform_int_distribution<std::size_t> depth_dist{5, 5000};
+
+    struct entry {
+      void *ptr;
+      std::size_t size;
+    };
+
+    for (int i = 0; i < 2; ++i) {
+      std::vector<entry> entries;
+      const std::size_t depth = depth_dist(rng);
+      for (std::size_t j = 0; j < depth; ++j) {
+        std::size_t s = size_dist(rng);
+        void *p = stack.push(s);
+        check_alignment(p);
+        entries.push_back({.ptr = p, .size = s});
+      }
+
+      for (std::size_t j = depth; j > 0; --j) {
+        auto const &e = entries[j - 1];
+        stack.pop(e.ptr, e.size);
+      }
+
+      check_empty(stack);
+    }
+  }
+}
+
+#if LF_COMPILER_EXCEPTIONS
+
+TEST_CASE("slab_stack - release/acquire preserves capacity", "[stacks]") {
+  // Regression: acquire must propagate the non-default capacity via m_ctrl->size,
+  // not silently revert to k_default_nodes.
+  constexpr int capacity = 4;
+  lf::slab_stack<> donor(capacity);
+  lf::slab_stack<> taker;
+  // Hand the donor's storage, with one allocation live, over to the taker.
+  void *first = donor.push(k_new_align);
+  auto mark = donor.checkpoint();
+  auto ticket = donor.prepare_release();
+  taker.acquire(mark);
+  donor.release(ticket);
+
+  // The taker should have room for capacity - 1 more pushes (one already used), and then throw.
+  std::vector<void *> live{first};
+  for (int used = 1; used != capacity; ++used) {
+    live.push_back(taker.push(k_new_align));
+  }
+  REQUIRE_THROWS_AS(taker.push(k_new_align), std::bad_alloc);
+  for (void *ptr : std::views::reverse(live)) {
+    taker.pop(ptr, k_new_align);
+  }
+  REQUIRE(taker.empty());
+}
+
+#endif
diff --git a/test/src/tuple.cpp b/test/src/tuple.cpp
new file mode 100644
index 000000000..aeecc40d4
--- /dev/null
+++ b/test/src/tuple.cpp
@@ -0,0 +1,234 @@
+#include <catch2/catch_test_macros.hpp>
+
+import std;
+
+import libfork.utils;
+
+namespace {
+
+template <typename T>
+struct control_struct {
+  T val;
+};
+
+struct nil {};
+
+template <typename T>
+using get = decltype(std::declval<T>().template get<0>());
+
+template <typename T>
+using val = decltype(std::get<0>(std::declval<T>()));
+
+template <typename T>
+void check_accessor_types() {
+  using tupl_t = lf::tuple<T>;
+  using ctrl_t = std::tuple<T>;
+
+  STATIC_REQUIRE(std::same_as<get<tupl_t &>, val<ctrl_t &>>);
+  STATIC_REQUIRE(std::same_as<get<tupl_t const &>, val<ctrl_t const &>>);
+  STATIC_REQUIRE(std::same_as<get<tupl_t &&>, val<ctrl_t &&>>);
+  STATIC_REQUIRE(std::same_as<get<tupl_t const &&>, val<ctrl_t const &&>>);
+
+  // Force instantiation of above
+
+  int x = 0;
+  tupl_t t{static_cast<T>(x)};
+
+  std::ignore = static_cast<tupl_t &>(t).template get<0>();
+  std::ignore = static_cast<tupl_t const &>(t).template get<0>();
+  std::ignore = static_cast<tupl_t &&>(t).template get<0>();
+  std::ignore = static_cast<tupl_t const &&>(t).template get<0>();
+}
+
+} // namespace
+
+TEST_CASE("Tuple accessor types", "[tuple]") {
+  check_accessor_types<int>();
+  check_accessor_types<int &>();
+  check_accessor_types<int &&>();
+
+  check_accessor_types<int const>();
+  check_accessor_types<int const &>();
+  check_accessor_types<int const &&>();
+}
+
+TEST_CASE("Tuple size optimization", "[tuple]") {
+
+  STATIC_REQUIRE(sizeof(lf::tuple<nil>) == 1);
+  STATIC_REQUIRE(sizeof(lf::tuple<int>) == sizeof(int));
+
+  STATIC_REQUIRE(sizeof(lf::tuple<int, nil>) == sizeof(int));
+  STATIC_REQUIRE(sizeof(lf::tuple<nil, int>) == sizeof(int));
+  STATIC_REQUIRE(sizeof(lf::tuple<nil, nil>) == sizeof(std::tuple<nil, nil>));
+  STATIC_REQUIRE(sizeof(lf::tuple<int, int>) == 2 * sizeof(int));
+
+  STATIC_REQUIRE(sizeof(lf::tuple<nil, nil, int>) == sizeof(int));
+  STATIC_REQUIRE(sizeof(lf::tuple<int, nil, nil>) == 2 * sizeof(int)); // TODO: fixable?
+  STATIC_REQUIRE(sizeof(lf::tuple<nil, int, nil>) == 2 * sizeof(int));
+}
+
+TEST_CASE("Tuple triviality", "[tuple]") {
+  //
+  using trivial_tuple = lf::tuple<int, double, nil>;
+
+  STATIC_REQUIRE(std::is_aggregate_v<trivial_tuple>);
+
+  STATIC_REQUIRE(std::is_trivially_default_constructible_v<trivial_tuple>);
+  STATIC_REQUIRE(std::is_trivially_copy_constructible_v<trivial_tuple>);
+  STATIC_REQUIRE(std::is_trivially_move_constructible_v<trivial_tuple>);
+  STATIC_REQUIRE(std::is_trivially_copy_assignable_v<trivial_tuple>);
+  STATIC_REQUIRE(std::is_trivially_move_assignable_v<trivial_tuple>);
+  STATIC_REQUIRE(std::is_trivially_destructible_v<trivial_tuple>);
+}
+
+TEST_CASE("Tuple construction", "[tuple]") {
+  lf::tuple<int, double> _{1, 0.};
+  lf::tuple<nil, int, nil> _{nil{}, 2, nil{}};
+}
+
+TEST_CASE("Tuple apply", "[tuple]") {
+
+  struct move_only {
+    move_only() = default;
+    move_only(move_only const &) = delete;
+    move_only(move_only &&) = default;
+  };
+
+  lf::tuple<int, move_only> val{1, move_only{}};
+
+  REQUIRE(std::move(val).apply([](int x, move_only) -> bool {
+    return x == 1;
+  }));
+}
+
+TEST_CASE("Tuple structured bindings", "[tuple]") {
+  // `auto &&` binds to the lvalue tuple, so `i` aliases its first element.
+  lf::tuple<int, nil> tup{1, nil{}};
+
+  auto &&[i, n] = tup;
+
+  REQUIRE(i == 1);
+  STATIC_REQUIRE(std::is_same_v<decltype(n), nil>); // type-only property: verify at compile time
+
+  i += 1;
+
+  REQUIRE(tup.get<0>() == 2);
+}
+
+TEST_CASE("Tuple CTAD", "[tuple]") {
+  int x = 42;
+  lf::tuple t_lval{x};
+  STATIC_REQUIRE(std::is_same_v<decltype(t_lval), lf::tuple<int &>>);
+
+  lf::tuple t_rval{42};
+  STATIC_REQUIRE(std::is_same_v<decltype(t_rval), lf::tuple<int>>);
+
+  const int cx = 42;
+  lf::tuple t_clval{cx};
+  STATIC_REQUIRE(std::is_same_v<decltype(t_clval), lf::tuple<const int &>>);
+}
+
+TEST_CASE("Tuple reference semantics", "[tuple]") {
+  int target = 1;
+  lf::tuple<int &> view{target};
+  view.get<0>() = 2;
+  REQUIRE(target == 2);
+  // A structured binding names the same referent, so writes still reach `target`.
+  auto &[alias] = view;
+  alias = 3;
+  REQUIRE(target == 3);
+}
+
+TEST_CASE("Tuple rvalue reference semantics", "[tuple]") {
+  int x = 1;
+  // tuple holding rvalue ref to x (cast to rvalue)
+  lf::tuple<int &&> t{std::move(x)};
+
+  // Accessing lvalue tuple -> lvalue ref to member (which is rvalue ref) -> int&
+  STATIC_REQUIRE(std::is_same_v<decltype(t.get<0>()), int &>);
+
+  // Modifying via the stored rvalue ref
+  t.get<0>() = 2;
+  REQUIRE(x == 2);
+
+  // Accessing rvalue tuple -> xvalue member -> int&&
+  STATIC_REQUIRE(std::is_same_v<decltype(std::move(t).get<0>()), int &&>);
+}
+
+TEST_CASE("Tuple apply value categories", "[tuple]") {
+  lf::tuple<int> t{42};
+
+  // Check lvalue arg
+  bool called_lvalue = false;
+  t.apply([&](int &x) {
+    called_lvalue = true;
+    REQUIRE(x == 42);
+  });
+  REQUIRE(called_lvalue);
+
+  // Check rvalue arg
+  bool called_rvalue = false;
+  std::move(t).apply([&](int &&x) {
+    called_rvalue = true;
+    REQUIRE(x == 42);
+  });
+  REQUIRE(called_rvalue);
+
+  // Check const lvalue arg
+  const lf::tuple<int> ct{42};
+  bool called_const_lvalue = false;
+  ct.apply([&](const int &x) {
+    called_const_lvalue = true;
+    REQUIRE(x == 42);
+  });
+  REQUIRE(called_const_lvalue);
+}
+
+TEST_CASE("Tuple empty", "[tuple]") {
+  lf::tuple<> t{};
+  STATIC_REQUIRE(sizeof(t) == 1);
+  STATIC_REQUIRE(std::tuple_size_v<decltype(t)> == 0);
+}
+
+TEST_CASE("Tuple move-only types", "[tuple]") {
+  auto ptr = std::make_unique<int>(42);
+  lf::tuple<std::unique_ptr<int>> t{std::move(ptr)};
+
+  REQUIRE(ptr == nullptr);
+  REQUIRE(*t.get<0>() == 42);
+
+  // Move out
+  auto ptr2 = std::move(t.get<0>());
+  REQUIRE(*ptr2 == 42);
+  REQUIRE(t.get<0>() == nullptr);
+}
+
+TEST_CASE("Tuple nested", "[tuple]") {
+  // Flat initializer list: the first two values populate the inner tuple.
+  lf::tuple<lf::tuple<int, int>, int> nested{1, 2, 3};
+  REQUIRE(nested.get<0>().get<0>() == 1);
+  REQUIRE(nested.get<0>().get<1>() == 2);
+  REQUIRE(nested.get<1>() == 3);
+}
+
+TEST_CASE("Tuple const structured bindings", "[tuple]") {
+  lf::tuple<int, int> t{10, 20};
+  const auto &[x, y] = t;
+
+  STATIC_REQUIRE(std::is_same_v<decltype(x), const int>);
+  STATIC_REQUIRE(std::is_same_v<decltype(y), const int>);
+
+  t.get<0>() = 0;
+  t.get<1>() = 1;
+
+  REQUIRE(x == 0);
+  REQUIRE(y == 1);
+}
+
+TEST_CASE("Tuple traits", "[tuple]") {
+  using T = lf::tuple<int, double, char>;
+  STATIC_REQUIRE(std::tuple_size_v<T> == 3);
+  STATIC_REQUIRE(std::is_same_v<std::tuple_element_t<0, T>, int>);
+  STATIC_REQUIRE(std::is_same_v<std::tuple_element_t<1, T>, double>);
+  STATIC_REQUIRE(std::is_same_v<std::tuple_element_t<2, T>, char>);
+}
diff --git a/test/src/utility.cpp b/test/src/utility.cpp
new file mode 100644
index 000000000..57b33d3c7
--- /dev/null
+++ b/test/src/utility.cpp
@@ -0,0 +1,25 @@
+#include <catch2/catch_test_macros.hpp>
+
+import std;
+
+import libfork;
+import libfork.utils;
+
+TEST_CASE("Defer properties", "[defer]") {  // lf::defer must be neither copyable nor movable.
+  using fn_t = void (*)() noexcept;  // noexcept function pointer used as the stored callable type
+  STATIC_REQUIRE(!std::is_copy_constructible_v<lf::defer<fn_t>>);
+  STATIC_REQUIRE(!std::is_move_constructible_v<lf::defer<fn_t>>);
+  STATIC_REQUIRE(!std::is_copy_assignable_v<lf::defer<fn_t>>);
+  STATIC_REQUIRE(!std::is_move_assignable_v<lf::defer<fn_t>>);
+}
+
+TEST_CASE("Defer executes on scope exit", "[defer]") {  // The callable runs when the enclosing scope ends.
+  int count = 0;
+  {
+    lf::defer _ = [&count]() noexcept -> void {  // template argument deduced from the lambda
+      ++count;
+    };
+    REQUIRE(count == 0);  // not invoked while the defer object is still in scope
+  }
+  REQUIRE(count == 1);  // invoked exactly once by the time the inner scope has closed
+}
diff --git a/test/src/version.cpp b/test/src/version.cpp
new file mode 100644
index 000000000..ebfbb88d8
--- /dev/null
+++ b/test/src/version.cpp
@@ -0,0 +1,25 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include "libfork/version.hpp"
+
+#ifndef LF_VERSION_MAJOR
+  #error LF_VERSION_MAJOR macro is missing
+#endif
+
+#ifndef LF_VERSION_MINOR
+  #error LF_VERSION_MINOR macro is missing
+#endif
+
+#ifndef LF_VERSION_PATCH
+  #error LF_VERSION_PATCH macro is missing
+#endif
+
+TEST_CASE("Version header", "[version]") {  // Sanity-checks the values of the LF_VERSION_* macros.
+  constexpr std::size_t major{LF_VERSION_MAJOR};  // brace-init rejects narrowing, e.g. a negative value
+  constexpr std::size_t minor{LF_VERSION_MINOR};
+  constexpr std::size_t patch{LF_VERSION_PATCH};
+
+  REQUIRE(major >= 4);  // the only check that constrains the value
+  REQUIRE(minor >= 0);  // NOTE(review): trivially true, std::size_t is unsigned
+  REQUIRE(patch >= 0);  // NOTE(review): trivially true, std::size_t is unsigned
+}
diff --git a/tools/deps.py b/tools/deps.py
new file mode 100644
index 000000000..9732ce0b7
--- /dev/null
+++ b/tools/deps.py
@@ -0,0 +1,19 @@
+import re
+import os
+
+files = [f for f in os.listdir("src/core") if f.endswith(".cxx") and f != "core.cxx"]  # path is relative to cwd
+deps = {}  # maps each file's partition name to the list of names it imports
+
+for f in files:
+    partition = f.split(".")[0]  # file name up to its first dot
+    with open(f"src/core/{f}", "r") as file:
+        content = file.read()
+        imports = re.findall(r"import :([^;]+);", content)  # text between "import :" and the next ";"
+        deps[partition] = imports
+
+print("digraph \"libfork.core\" {")  # write the dependency graph to stdout in Graphviz DOT syntax
+print("  node [shape=box];")
+for p, i in sorted(deps.items()):  # sorted so the output order is stable between runs
+    for dep in sorted(i):
+        print(f"  \"{p}\" -> \"{dep}\";")  # one edge per import: importing file -> imported name
+print("}")
diff --git a/docs/tour.md b/tour.md
similarity index 98%
rename from docs/tour.md
rename to tour.md
index 4c1800dd3..45b7b1fe5 100644
--- a/docs/tour.md
+++ b/tour.md
@@ -25,7 +25,7 @@ Definitions:
 - __Parent:__ A task that spawns other tasks.
 - __Child:__ A task that is spawned by another task.
 
-The tasking/fork-join interface is designed to mirror [Cilk](https://en.wikipedia.org/wiki/Cilk) and other fork-join frameworks. The best way to learn is by example, lets start with the canonical introduction to fork-join, the recursive Fibonacci function, in regular C++ it looks like this:
+The tasking/fork-join interface is designed to mirror [Cilk](https://en.wikipedia.org/wiki/Cilk) and other fork-join frameworks. The best way to learn is by example, so let's start with the canonical introduction to fork-join: the recursive Fibonacci function. In regular C++ it looks like this:
 
 ```cpp
 auto fib(int n) -> int {
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 000000000..6c399ac61
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,241 @@
+version = 1
+revision = 3
+requires-python = ">=3.13"
+
+[[package]]
+name = "click"
+version = "8.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bb/63/f9e1ea081ce35720d8b92acde70daaedace594dc93b693c869e0d5910718/click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2", size = 328061, upload-time = "2026-04-22T15:11:27.506Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ae/44/c1221527f6a71a01ec6fbad7fa78f1d50dfa02217385cf0fa3eec7087d59/click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613", size = 110502, upload-time = "2026-04-22T15:11:25.044Z" },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
+]
+
+[[package]]
+name = "deepmerge"
+version = "2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/a8/3a/b0ba594708f1ad0bc735884b3ad854d3ca3bdc1d741e56e40bbda6263499/deepmerge-2.0.tar.gz", hash = "sha256:5c3d86081fbebd04dd5de03626a0607b809a98fb6ccba5770b62466fe940ff20", size = 19890, upload-time = "2024-08-30T05:31:50.308Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2d/82/e5d2c1c67d19841e9edc74954c827444ae826978499bde3dfc1d007c8c11/deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00", size = 13475, upload-time = "2024-08-30T05:31:48.659Z" },
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
+]
+
+[[package]]
+name = "libfork"
+version = "0.1.0"
+source = { virtual = "." }
+dependencies = [
+    { name = "zensical" },
+]
+
+[package.metadata]
+requires-dist = [{ name = "zensical", specifier = ">=0.0.41" }]
+
+[[package]]
+name = "markdown"
+version = "3.10.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" },
+]
+
+[[package]]
+name = "markupsafe"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/38/2f/907b9c7bbba283e68f20259574b13d005c121a0fa4c175f9bed27c4597ff/markupsafe-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e1cf1972137e83c5d4c136c43ced9ac51d0e124706ee1c8aa8532c1287fa8795", size = 11622, upload-time = "2025-09-27T18:36:41.777Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/d9/5f7756922cdd676869eca1c4e3c0cd0df60ed30199ffd775e319089cb3ed/markupsafe-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:116bb52f642a37c115f517494ea5feb03889e04df47eeff5b130b1808ce7c219", size = 12029, upload-time = "2025-09-27T18:36:43.257Z" },
+    { url = "https://files.pythonhosted.org/packages/00/07/575a68c754943058c78f30db02ee03a64b3c638586fba6a6dd56830b30a3/markupsafe-3.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133a43e73a802c5562be9bbcd03d090aa5a1fe899db609c29e8c8d815c5f6de6", size = 24374, upload-time = "2025-09-27T18:36:44.508Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/21/9b05698b46f218fc0e118e1f8168395c65c8a2c750ae2bab54fc4bd4e0e8/markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfcd093f13f0f0b7fdd0f198b90053bf7b2f02a3927a30e63f3ccc9df56b676", size = 22980, upload-time = "2025-09-27T18:36:45.385Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/71/544260864f893f18b6827315b988c146b559391e6e7e8f7252839b1b846a/markupsafe-3.0.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:509fa21c6deb7a7a273d629cf5ec029bc209d1a51178615ddf718f5918992ab9", size = 21990, upload-time = "2025-09-27T18:36:46.916Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/28/b50fc2f74d1ad761af2f5dcce7492648b983d00a65b8c0e0cb457c82ebbe/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4afe79fb3de0b7097d81da19090f4df4f8d3a2b3adaa8764138aac2e44f3af1", size = 23784, upload-time = "2025-09-27T18:36:47.884Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/76/104b2aa106a208da8b17a2fb72e033a5a9d7073c68f7e508b94916ed47a9/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:795e7751525cae078558e679d646ae45574b47ed6e7771863fcc079a6171a0fc", size = 21588, upload-time = "2025-09-27T18:36:48.82Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/99/16a5eb2d140087ebd97180d95249b00a03aa87e29cc224056274f2e45fd6/markupsafe-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8485f406a96febb5140bfeca44a73e3ce5116b2501ac54fe953e488fb1d03b12", size = 23041, upload-time = "2025-09-27T18:36:49.797Z" },
+    { url = "https://files.pythonhosted.org/packages/19/bc/e7140ed90c5d61d77cea142eed9f9c303f4c4806f60a1044c13e3f1471d0/markupsafe-3.0.3-cp313-cp313-win32.whl", hash = "sha256:bdd37121970bfd8be76c5fb069c7751683bdf373db1ed6c010162b2a130248ed", size = 14543, upload-time = "2025-09-27T18:36:51.584Z" },
+    { url = "https://files.pythonhosted.org/packages/05/73/c4abe620b841b6b791f2edc248f556900667a5a1cf023a6646967ae98335/markupsafe-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:9a1abfdc021a164803f4d485104931fb8f8c1efd55bc6b748d2f5774e78b62c5", size = 15113, upload-time = "2025-09-27T18:36:52.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/3a/fa34a0f7cfef23cf9500d68cb7c32dd64ffd58a12b09225fb03dd37d5b80/markupsafe-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:7e68f88e5b8799aa49c85cd116c932a1ac15caaa3f5db09087854d218359e485", size = 13911, upload-time = "2025-09-27T18:36:53.513Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/d7/e05cd7efe43a88a17a37b3ae96e79a19e846f3f456fe79c57ca61356ef01/markupsafe-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:218551f6df4868a8d527e3062d0fb968682fe92054e89978594c28e642c43a73", size = 11658, upload-time = "2025-09-27T18:36:54.819Z" },
+    { url = "https://files.pythonhosted.org/packages/99/9e/e412117548182ce2148bdeacdda3bb494260c0b0184360fe0d56389b523b/markupsafe-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3524b778fe5cfb3452a09d31e7b5adefeea8c5be1d43c4f810ba09f2ceb29d37", size = 12066, upload-time = "2025-09-27T18:36:55.714Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/e6/fa0ffcda717ef64a5108eaa7b4f5ed28d56122c9a6d70ab8b72f9f715c80/markupsafe-3.0.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e885a3d1efa2eadc93c894a21770e4bc67899e3543680313b09f139e149ab19", size = 25639, upload-time = "2025-09-27T18:36:56.908Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ec/2102e881fe9d25fc16cb4b25d5f5cde50970967ffa5dddafdb771237062d/markupsafe-3.0.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8709b08f4a89aa7586de0aadc8da56180242ee0ada3999749b183aa23df95025", size = 23569, upload-time = "2025-09-27T18:36:57.913Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/30/6f2fce1f1f205fc9323255b216ca8a235b15860c34b6798f810f05828e32/markupsafe-3.0.3-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b8512a91625c9b3da6f127803b166b629725e68af71f8184ae7e7d54686a56d6", size = 23284, upload-time = "2025-09-27T18:36:58.833Z" },
+    { url = "https://files.pythonhosted.org/packages/58/47/4a0ccea4ab9f5dcb6f79c0236d954acb382202721e704223a8aafa38b5c8/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9b79b7a16f7fedff2495d684f2b59b0457c3b493778c9eed31111be64d58279f", size = 24801, upload-time = "2025-09-27T18:36:59.739Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/70/3780e9b72180b6fecb83a4814d84c3bf4b4ae4bf0b19c27196104149734c/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:12c63dfb4a98206f045aa9563db46507995f7ef6d83b2f68eda65c307c6829eb", size = 22769, upload-time = "2025-09-27T18:37:00.719Z" },
+    { url = "https://files.pythonhosted.org/packages/98/c5/c03c7f4125180fc215220c035beac6b9cb684bc7a067c84fc69414d315f5/markupsafe-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8f71bc33915be5186016f675cd83a1e08523649b0e33efdb898db577ef5bb009", size = 23642, upload-time = "2025-09-27T18:37:01.673Z" },
+    { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
+    { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
+    { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
+    { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
+    { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
+    { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
+    { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
+    { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
+    { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
+    { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
+    { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
+    { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
+]
+
+[[package]]
+name = "pymdown-extensions"
+version = "10.21.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown" },
+    { name = "pyyaml" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/df/08/f1c908c581fd11913da4711ea7ba32c0eee40b0190000996bb863b0c9349/pymdown_extensions-10.21.2.tar.gz", hash = "sha256:c3f55a5b8a1d0edf6699e35dcbea71d978d34ff3fa79f3d807b8a5b3fa90fbdc", size = 853922, upload-time = "2026-03-29T15:01:55.233Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f7/27/a2fc51a4a122dfd1015e921ae9d22fee3d20b0b8080d9a704578bf9deece/pymdown_extensions-10.21.2-py3-none-any.whl", hash = "sha256:5c0fd2a2bea14eb39af8ff284f1066d898ab2187d81b889b75d46d4348c01638", size = 268901, upload-time = "2026-03-29T15:01:53.244Z" },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" },
+    { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" },
+    { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" },
+    { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" },
+    { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
+    { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
+    { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
+    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
+    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
+    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
+    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
+    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
+]
+
+[[package]]
+name = "tomli"
+version = "2.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
+    { url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
+    { url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
+    { url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
+    { url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" },
+    { url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" },
+    { url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" },
+    { url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" },
+    { url = "https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" },
+    { url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" },
+    { url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" },
+    { url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" },
+    { url = "https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" },
+    { url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" },
+    { url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" },
+    { url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
+]
+
+[[package]]
+name = "zensical"
+version = "0.0.41"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "deepmerge" },
+    { name = "jinja2" },
+    { name = "markdown" },
+    { name = "pygments" },
+    { name = "pymdown-extensions" },
+    { name = "pyyaml" },
+    { name = "tomli" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/89/d6/b3e931233e53a2377ef5915cc6e786845c3263306874a469af8fb569ef9c/zensical-0.0.41.tar.gz", hash = "sha256:6c3c90301123749dfc26a210d6c080f0691253c7c765ad308a10b4518369a6fe", size = 3927788, upload-time = "2026-05-09T14:35:29.005Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/72/08/ee18207c9b4e3ada74a0f4adf253bea90da39ae43772761cd91072e3a1fc/zensical-0.0.41-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f06a0015dcfdf7aeca73f4998a401db65db0ae2dd72da9629a7be8f9a4d0b7b6", size = 12701539, upload-time = "2026-05-09T14:34:48.6Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/93/d4635fbbce8171cf71dd64285d9f6d5773a2b624b928f1dd8acaf1ee9f9f/zensical-0.0.41-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:4e524ce68c9ff082ffaded9f742407097cf51bab692b7bc18d3c174b966174fe", size = 12560038, upload-time = "2026-05-09T14:34:51.666Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/4a/1730a30377bbb0914ed740e0e289d379b0552673b6cf912aefe7a205440c/zensical-0.0.41-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4afe35331cd2394c408cd362458936479cc0ed4fb272478498e4794aafc7414", size = 12942926, upload-time = "2026-05-09T14:34:54.393Z" },
+    { url = "https://files.pythonhosted.org/packages/32/e3/d9a0416ef4edc043ce9f404a66f1934f102bcb645b103abb26b180ba5680/zensical-0.0.41-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15a850285050f03aeb3b67ce7d99943093059fe8d32fc7731fa9f27be45c64cc", size = 12912711, upload-time = "2026-05-09T14:34:57.174Z" },
+    { url = "https://files.pythonhosted.org/packages/68/d0/775852783bef835425306a2fcd8236ef14fd19160e1b4261e192bf2d9f54/zensical-0.0.41-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:35052e9dbefabe3a71c4836cfc4afa6c9469e5eeddc2a3ee750803ae3fe777dc", size = 13275869, upload-time = "2026-05-09T14:34:59.93Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/95/554273cc09a270ced0213d3e0aac8b3fc2b472fc2b26771d56fc8fd55047/zensical-0.0.41-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a47f459205fb55f64dcb6c65e9f3c2fa00a2b4306c5ef1b71b9a50c45007071d", size = 12980177, upload-time = "2026-05-09T14:35:02.81Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/b5/d74d5040b3121db5c72b0134f0455641b90b1277fb1330a8e5e0029ca8d3/zensical-0.0.41-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:aa3b3b3a4e6f75f6bb3c1aca1fad7a96cebf54cbd4e31122f6876503b8801666", size = 13119629, upload-time = "2026-05-09T14:35:07.105Z" },
+    { url = "https://files.pythonhosted.org/packages/62/9a/93527acd7750092d7fca2e6c43fe2b8f1e85e1c96a1002baf6a08201c6f7/zensical-0.0.41-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:565133fd48b2ce939698c174c0c1c6470407a8fb6a90a2bb0eeec97cd4344444", size = 13182183, upload-time = "2026-05-09T14:35:10.105Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/7e/d77e4c809bfcbad40db85a6a7beeda2ee5c964232e0186783c3a837a7d0b/zensical-0.0.41-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:cec0a2b05eaaace0c7424bab3f2884da03ade212cac4ba4487c58691ec13ec65", size = 13330444, upload-time = "2026-05-09T14:35:13.245Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/e8/ecbb7e34bff88aa892c676b8b2e2ddf425f94d66cbb84b80016095191b77/zensical-0.0.41-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1736f0cb7686628cc6f53952d208423f20b542f0c16b0c2ddd7e702bf6e41fdd", size = 13263093, upload-time = "2026-05-09T14:35:20.826Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/6f/48b2f81ce708d19bb807d94716f2772ec4b74389b6d29024669fc470df08/zensical-0.0.41-cp310-abi3-win32.whl", hash = "sha256:34a78645c68fba152faacb66516c895283166154f8b15b61440a6c21c84f0974", size = 12253644, upload-time = "2026-05-09T14:35:23.598Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/92/5cf943133f61b996965743deeaff467f278135521f58d83ca68d2601ded3/zensical-0.0.41-cp310-abi3-win_amd64.whl", hash = "sha256:00d80cd573152e0efb655143bbdfe8788eb4b33167a802639fdb1b1800b724ac", size = 12483190, upload-time = "2026-05-09T14:35:26.43Z" },
+]
diff --git a/zensical.toml b/zensical.toml
new file mode 100644
index 000000000..58ace2835
--- /dev/null
+++ b/zensical.toml
@@ -0,0 +1,346 @@
+[project]
+
+# The site_name is shown in the page header and the browser window title
+#
+# Read more: https://zensical.org/docs/setup/basics/#site_name
+site_name = "libfork"
+
+# The site_description is included in the HTML head and should contain a
+# meaningful description of the site content for use by search engines.
+#
+# Read more: https://zensical.org/docs/setup/basics/#site_description
+site_description = "A bleeding-edge, lock-free, wait-free, continuation-stealing tasking library built on C++20's coroutines"
+
+# The site_author attribute. This is used in the HTML head element.
+#
+# Read more: https://zensical.org/docs/setup/basics/#site_author
+site_author = "Conor Williams"
+
+# The site_url is the canonical URL for your site. When building online
+# documentation you should set this.
+# Read more: https://zensical.org/docs/setup/basics/#site_url
+site_url = "https://conorwilliams.github.io/libfork/"
+
+# The copyright notice appears in the page footer and can contain an HTML
+# fragment.
+#
+# Read more: https://zensical.org/docs/setup/basics/#copyright
+copyright = """
+Copyright &copy; 2026 Conor Williams.
+"""
+
+# Directory of the site artifacts generated by the build command.
+site_dir = "build/site"
+
+# Additional stylesheets, relative to the docs directory.
+extra_css = ["stylesheets/extra.css"]
+
+# URL of the project's source code repository.
+repo_url = "https://github.com/conorwilliams/libfork"
+
+# Set branch for edit/view links
+# TODO: make this point to main
+edit_uri = "edit/modules/docs/"
+
+# Explicit sidebar order. Paths are relative to the docs directory.
+nav = [
+  { "Home" = "index.md" },
+  { "Installation" = "installation.md" },
+  { "Quickstart" = "quickstart.md" },
+  { "Changelog" = "ChangeLog.md" },
+  { "Contributing" = "contributing.md" },
+  { "Performance" = [
+    "benchmarks/index.md",
+    { "Benchmarks" = [
+      "benchmarks/benchmarks/index.md",
+      { "Fibonacci" = "benchmarks/benchmarks/fib.md" },
+      { "Unbalanced Trees" = "benchmarks/benchmarks/uts.md" },
+      { "Heat" = "benchmarks/benchmarks/heat.md" },
+      { "Integrate" = "benchmarks/benchmarks/integrate.md" },
+      { "Knapsack" = "benchmarks/benchmarks/knapsack.md" },
+      { "Mandelbrot" = "benchmarks/benchmarks/mandelbrot.md" },
+      { "Matrix multiply" = "benchmarks/benchmarks/matmul.md" },
+      { "Strassen" = "benchmarks/benchmarks/strassen.md" },
+      { "N-queens" = "benchmarks/benchmarks/nqueens.md" },
+      { "Primes" = "benchmarks/benchmarks/primes.md" },
+      { "Quicksort" = "benchmarks/benchmarks/quicksort.md" },
+      { "Mergesort" = "benchmarks/benchmarks/mergesort.md" },
+      { "Skynet" = "benchmarks/benchmarks/skynet.md" },
+      { "Scan" = "benchmarks/benchmarks/scan.md" },
+      { "Fold" = "benchmarks/benchmarks/fold.md" },
+      { "Random scheduler" = "benchmarks/benchmarks/switch-random.md" },
+      { "I/O scheduler" = "benchmarks/benchmarks/switch-io-pool.md" },
+    ] },
+  ] },
+  { "API" = [
+    "api/index.md",
+    { "Core" = [
+      "api/core/index.md",
+      "api/core/task.md",
+      "api/core/env.md",
+      "api/core/scope.md",
+      "api/core/scheduling.md",
+      "api/core/receiver.md",
+      "api/core/cancellation.md",
+      "api/core/context.md",
+      "api/core/handles.md",
+      "api/core/projected.md",
+      "api/core/concepts.md",
+      "api/core/exceptions.md",
+    ] },
+    { "Batteries" = "api/batteries.md" },
+    { "Schedulers" = "api/schedulers.md" },
+    { "Algorithm" = "api/algorithm.md" },
+  ] },
+]
+
+# ----------------------------------------------------------------------------
+# Section for configuring theme options
+# ----------------------------------------------------------------------------
+[project.theme]
+
+# With the "favicon" option you can set your own image to use as the icon
+# browsers will use in the browser title bar or tab bar. The path provided
+# must be relative to the "docs_dir".
+#
+# Read more:
+# - https://zensical.org/docs/setup/logo-and-icons/#favicon
+# - https://developer.mozilla.org/en-US/docs/Glossary/Favicon
+#
+favicon = "favicon/favicon.ico"
+
+# Zensical supports more than 60 different languages. This means that the
+# labels and tooltips that Zensical's templates produce are translated.
+# The "language" option allows you to set the language used. This language
+# is also indicated in the HTML head element to help with accessibility
+# and guide search engines and translation tools.
+#
+# The default language is "en" (English). It is possible to create
+# sites with multiple languages and configure a language selector. See
+# the documentation for details.
+#
+# Read more:
+# - https://zensical.org/docs/setup/language/
+#
+language = "en"
+
+# Zensical provides a number of feature toggles that change the behavior
+# of the documentation site.
+features = [
+  # Zensical includes an announcement bar. This feature allows users to
+  # dismiss it when they have read the announcement.
+  # https://zensical.org/docs/setup/header/#announcement-bar
+  "announce.dismiss",
+
+  # If you have a repository configured and turn on this feature, Zensical
+  # will generate an edit button for the page. This works for common
+  # repository hosting services.
+  # https://zensical.org/docs/setup/repository/#content-actions
+  "content.action.edit",
+
+  # If you have a repository configured and turn on this feature, Zensical
+  # will generate a button that allows the user to view the Markdown
+  # code for the current page.
+  # https://zensical.org/docs/setup/repository/#content-actions
+  "content.action.view",
+
+  # Code annotations allow you to add an icon with a tooltip to your
+  # code blocks to provide explanations at crucial points.
+  # https://zensical.org/docs/authoring/code-blocks/#code-annotations
+  "content.code.annotate",
+
+  # This feature turns on a button in code blocks that allows users to
+  # copy the content to their clipboard without first selecting it.
+  # https://zensical.org/docs/authoring/code-blocks/#code-copy-button
+  "content.code.copy",
+
+  # Code blocks can include a button to allow for the selection of line
+  # ranges by the user.
+  # https://zensical.org/docs/authoring/code-blocks/#code-selection-button
+  "content.code.select",
+
+  # Zensical can render footnotes as inline tooltips, so the user can read
+  # the footnote without leaving the context of the document.
+  # https://zensical.org/docs/authoring/footnotes/#footnote-tooltips
+  "content.footnote.tooltips",
+
+  # If you have many content tabs that have the same titles (e.g., "Python",
+  # "JavaScript", "Cobol"), this feature causes all of them to switch
+  # at the same time when the user chooses their language in one.
+  # https://zensical.org/docs/authoring/content-tabs/#linked-content-tabs
+  "content.tabs.link",
+
+  # With this feature enabled users can add tooltips to links that will be
+  # displayed when the mouse pointer hovers over the link.
+  # https://zensical.org/docs/authoring/tooltips/#improved-tooltips
+  "content.tooltips",
+
+  # With this feature enabled, Zensical will automatically hide parts
+  # of the header when the user scrolls past a certain point.
+  # https://zensical.org/docs/setup/header/#automatic-hiding
+  # "header.autohide",
+
+  # Turn on this feature to expand all collapsible sections in the
+  # navigation sidebar by default.
+  # https://zensical.org/docs/setup/navigation/#navigation-expansion
+  # "navigation.expand",
+
+  # This feature turns on navigation elements in the footer that allow the
+  # user to navigate to a next or previous page.
+  # https://zensical.org/docs/setup/footer/#navigation
+  "navigation.footer",
+
+  # When section index pages are enabled, documents can be directly attached
+  # to sections, which is particularly useful for providing overview pages.
+  # https://zensical.org/docs/setup/navigation/#section-index-pages
+  "navigation.indexes",
+
+  # When instant navigation is enabled, clicks on all internal links will be
+  # intercepted and dispatched via XHR without fully reloading the page.
+  # https://zensical.org/docs/setup/navigation/#instant-navigation
+  "navigation.instant",
+
+  # With instant prefetching, your site will start to fetch a page once the
+  # user hovers over a link. This will reduce the perceived loading time
+  # for the user.
+  # https://zensical.org/docs/setup/navigation/#instant-prefetching
+  "navigation.instant.prefetch",
+
+  # In order to provide a better user experience on slow connections when
+  # using instant navigation, a progress indicator can be enabled.
+  # https://zensical.org/docs/setup/navigation/#progress-indicator
+  #"navigation.instant.progress",
+
+  # When navigation paths are activated, a breadcrumb navigation is rendered
+  # above the title of each page.
+  # https://zensical.org/docs/setup/navigation/#navigation-path
+  "navigation.path",
+
+  # When pruning is enabled, only the visible navigation items are included
+  # in the rendered HTML, reducing the size of the built site by 33% or more.
+  # https://zensical.org/docs/setup/navigation/#navigation-pruning
+  #"navigation.prune",
+
+  # When sections are enabled, top-level sections are rendered as groups in
+  # the sidebar for viewports above 1220px, but remain as-is on mobile.
+  # https://zensical.org/docs/setup/navigation/#navigation-sections
+  "navigation.sections",
+
+  # When tabs are enabled, top-level sections are rendered in a menu layer
+  # below the header for viewports above 1220px, but remain as-is on mobile.
+  # https://zensical.org/docs/setup/navigation/#navigation-tabs
+  #"navigation.tabs",
+
+  # When sticky tabs are enabled, navigation tabs will lock below the header
+  # and always remain visible when scrolling down.
+  # https://zensical.org/docs/setup/navigation/#sticky-navigation-tabs
+  #"navigation.tabs.sticky",
+
+  # A back-to-top button can be shown when the user, after scrolling down,
+  # starts to scroll up again.
+  # https://zensical.org/docs/setup/navigation/#back-to-top-button
+  "navigation.top",
+
+  # When anchor tracking is enabled, the URL in the address bar is
+  # automatically updated with the active anchor as highlighted in the table
+  # of contents.
+  # https://zensical.org/docs/setup/navigation/#anchor-tracking
+  "navigation.tracking",
+
+  # When search highlighting is enabled and a user clicks on a search result,
+  # Zensical will highlight all occurrences after following the link.
+  # https://zensical.org/docs/setup/search/#search-highlighting
+  "search.highlight",
+
+  # When anchor following for the table of contents is enabled, the sidebar
+  # is automatically scrolled so that the active anchor is always visible.
+  # https://zensical.org/docs/setup/navigation/#anchor-following
+  # "toc.follow",
+
+  # When navigation integration for the table of contents is enabled, it is
+  # always rendered as part of the navigation sidebar on the left.
+  # https://zensical.org/docs/setup/navigation/#navigation-integration
+  #"toc.integrate",
+]
+
+# ----------------------------------------------------------------------------
+# If you don't have a dedicated project logo, you can use a built-in icon from
+# the icon sets shipped in Zensical. Please note that the setting lives in a
+# different subsection, and that a dedicated logo takes precedence over the icon.
+#
+# Read more:
+# - https://zensical.org/docs/setup/logo-and-icons
+# - https://github.com/zensical/ui/tree/master/dist/.icons
+# ----------------------------------------------------------------------------
+[project.theme.icon]
+logo = "lucide/utensils"
+
+# ----------------------------------------------------------------------------
+# In the "palette" subsection you can configure options for the color scheme.
+# You can configure different color schemes, e.g., to turn on dark mode,
+# that the user can switch between. Each color scheme can be further
+# customized.
+#
+# Read more:
+# - https://zensical.org/docs/setup/colors/
+# ----------------------------------------------------------------------------
+[[project.theme.palette]]
+scheme = "default"
+toggle.icon = "lucide/sun"
+toggle.name = "Switch to dark mode"
+
+[[project.theme.palette]]
+scheme = "slate"
+toggle.icon = "lucide/moon"
+toggle.name = "Switch to light mode"
+
+# ----------------------------------------------------------------------------
+# The "extra" section contains miscellaneous settings.
+# ----------------------------------------------------------------------------
+[[project.extra.social]]
+icon = "fontawesome/brands/github"
+link = "https://github.com/conorwilliams/libfork"
+
+# ----------------------------------------------------------------------------
+# In this section you can configure the Markdown extensions that are used when
+# rendering your documentation. We enable the most useful extensions by default,
+# but you can customize this list to your needs.
+#
+# Read more:
+# - https://zensical.org/docs/setup/extensions/
+# ----------------------------------------------------------------------------
+[project.markdown_extensions.abbr]
+[project.markdown_extensions.admonition]
+[project.markdown_extensions.attr_list]
+[project.markdown_extensions.def_list]
+[project.markdown_extensions.footnotes]
+[project.markdown_extensions.md_in_html]
+[project.markdown_extensions.toc]
+permalink = true
+[project.markdown_extensions.pymdownx.arithmatex]
+generic = true
+[project.markdown_extensions.pymdownx.betterem]
+[project.markdown_extensions.pymdownx.caret]
+[project.markdown_extensions.pymdownx.details]
+[project.markdown_extensions.pymdownx.emoji]
+emoji_generator = "zensical.extensions.emoji.to_svg"
+emoji_index = "zensical.extensions.emoji.twemoji"
+[project.markdown_extensions.pymdownx.highlight]
+anchor_linenums = true
+line_spans = "__span"
+pygments_lang_class = true
+[project.markdown_extensions.pymdownx.inlinehilite]
+[project.markdown_extensions.pymdownx.keys]
+[project.markdown_extensions.pymdownx.magiclink]
+[project.markdown_extensions.pymdownx.mark]
+[project.markdown_extensions.pymdownx.smartsymbols]
+[project.markdown_extensions.pymdownx.superfences]
+custom_fences = [
+  { name = "mermaid", class = "mermaid", format = "pymdownx.superfences.fence_code_format" },
+]
+[project.markdown_extensions.pymdownx.tabbed]
+alternate_style = true
+combine_header_slug = true
+[project.markdown_extensions.pymdownx.tasklist]
+custom_checkbox = true
+[project.markdown_extensions.pymdownx.tilde]