From 6914d4bc317d47120b613f3006a1b47f1afdcb69 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 26 May 2026 19:00:58 +0000 Subject: [PATCH 01/17] CI/build modernization + fix atomics for CUDA 13+ --- .github/workflows/codeql.yml | 104 ----------------------------- .github/workflows/lint.yml | 15 ++--- .github/workflows/ut-rocm.yml | 64 ------------------ .github/workflows/ut.yml | 119 ++++++++++++++++++++++++++++++++++ CMakeLists.txt | 19 ++++-- ark/api/executor.cpp | 8 ++- tools/lint.sh | 74 +++++++++++++++++++++ 7 files changed, 219 insertions(+), 184 deletions(-) delete mode 100644 .github/workflows/codeql.yml delete mode 100644 .github/workflows/ut-rocm.yml create mode 100644 .github/workflows/ut.yml create mode 100644 tools/lint.sh diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml deleted file mode 100644 index 7ac2f1649..000000000 --- a/.github/workflows/codeql.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: CodeQL - -on: - push: - branches: - - main - pull_request: - branches: - - main - schedule: - - cron: '42 20 * * 4' - -jobs: - analyze-cuda: - name: Analyze (CUDA) - strategy: - fail-fast: false - matrix: - language: [ 'cpp' ] - concurrency: - group: ${{ github.workflow }}-cuda-${{ github.ref }} - cancel-in-progress: true - runs-on: ubuntu-latest - container: - image: ghcr.io/microsoft/ark/ark:base-dev-cuda12.2 - permissions: - actions: read - contents: read - security-events: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Check disk space - run: | - df -h - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. - make build ark_py - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{matrix.language}}" - - analyze-rocm: - name: Analyze (ROCM) - strategy: - fail-fast: false - matrix: - language: [ 'cpp' ] - concurrency: - group: ${{ github.workflow }}-rocm-${{ github.ref }} - cancel-in-progress: true - runs-on: ubuntu-latest - container: - image: ghcr.io/microsoft/ark/ark:build-rocm6.1 - permissions: - actions: read - contents: read - security-events: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Check disk space - run: | - df -h - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. - make -j build ark_py - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{matrix.language}}" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c799e86c6..0fe0cf826 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -7,7 +7,7 @@ on: jobs: linters: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - name: Check out Git repository @@ -16,22 +16,19 @@ jobs: - name: Install ClangFormat run: sudo apt-get install -y clang-format - - name: Run git-clang-format - run: git clang-format --style=file --diff - - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.12' - name: Install Python dependencies - run: python3.8 -m pip install black + run: pip install black - - name: Run black - run: python3.8 -m black --check --config pyproject.toml . + - name: Run lint + run: bash tools/lint.sh dry spelling: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - name: Check out Git repository diff --git a/.github/workflows/ut-rocm.yml b/.github/workflows/ut-rocm.yml deleted file mode 100644 index ac8ed0e90..000000000 --- a/.github/workflows/ut-rocm.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: "Unit Tests (ROCm)" - -on: - push: - branches: - - main - pull_request: - branches: - - main - -jobs: - UnitTest: - runs-on: [ self-hosted, AMD ] - defaults: - run: - shell: bash - strategy: - matrix: - rocm: [ rocm6.0 ] - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }} - cancel-in-progress: true - # container: - # image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.rocm }}" - # options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug .. - make -j ut - - - name: RunUT - run: | - cd build && ARK_ROOT=$PWD ARK_IGNORE_BINARY_CACHE=1 ctest --stop-on-failure --verbose --schedule-random - - - name: ReportCoverage - run: | - cd build - lcov --capture --directory . --output-file coverage.info - lcov --remove coverage.info \ - '/usr/*' \ - '/tmp/*' \ - '*/third_party/*' \ - '*/ark/*_test.*' \ - '*/examples/*' \ - '*/python/*' \ - '*/ark/unittest/unittest_utils.cc' \ - --output-file coverage.info - lcov --list coverage.info - bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports" - - - name: BuildPython - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install . diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml new file mode 100644 index 000000000..0929c75e2 --- /dev/null +++ b/.github/workflows/ut.yml @@ -0,0 +1,119 @@ +name: "Unit Tests" + +on: + push: + branches: + - main + pull_request: + branches: + - main + types: [opened, synchronize, reopened, ready_for_review] + +jobs: + UnitTest: + defaults: + run: + shell: bash + timeout-minutes: 60 + permissions: + actions: read + contents: read + security-events: write + strategy: + fail-fast: false + matrix: + include: + - platform: cuda + runner: [self-hosted, CUDA] + container: nvcr.io/nvidia/pytorch:26.03-py3 + container_options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 + - platform: rocm + runner: [self-hosted, ROCM] + container: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0 + container_options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 + runs-on: ${{ matrix.runner }} + concurrency: + group: ${{ github.workflow }}-${{ matrix.platform }}-${{ github.ref }} + cancel-in-progress: true + container: + image: ${{ matrix.container }} + options: ${{ matrix.container_options }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Dubious ownership exception + run: | + git config --global --add safe.directory /__w/ark/ark + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: cpp + + - name: Build + run: | + apt-get update && apt-get install -y lcov + mkdir build && cd build + CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" + if [ "${{ matrix.platform }}" = "rocm" ]; then + CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc" + fi + cmake $CMAKE_ARGS .. + make -j ut ark_py + + - name: Run C++ UT + run: | + cd build + ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random + + - name: C++ Coverage + run: | + cd build + lcov --capture --directory . --output-file cpp_coverage.info + lcov --remove cpp_coverage.info \ + '/usr/*' \ + '/tmp/*' \ + '*/build/*' \ + '*/third_party/*' \ + '*/ark/*_test.*' \ + '*/examples/*' \ + '*/python/*' \ + '*/ark/unittest/unittest_utils.cpp' \ + --output-file cpp_coverage.info + lcov --list cpp_coverage.info + + - name: Install Python Dependencies + run: | + python3 -m pip install -r requirements.txt + + - name: Run Python UT + run: | + cd build + PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \ + --cov=python/ark \ + --cov-report lcov:py_coverage.info \ + --verbose \ + ../python/unittest/ + + - name: Report Coverage + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + run: | + cd build + lcov -a cpp_coverage.info -a py_coverage.info -o coverage.info + bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports" + + - name: Install Python + run: | + python3 -m pip install . + + - name: Run Tutorials + run: | + python3 ./examples/tutorial/quickstart_tutorial.py + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:cpp-${{ matrix.platform }}" diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d5de19d1..437746888 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,13 +65,24 @@ if(ARK_USE_CUDA) endif() # Set CUDA architectures - if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) + if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 13) + # CUDA 13+ dropped sm_60 and sm_70 + set(CMAKE_CUDA_ARCHITECTURES 80 90) + elseif(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) + set(CMAKE_CUDA_ARCHITECTURES 60 70 80 90) + elseif(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) set(CMAKE_CUDA_ARCHITECTURES 60 70 80) endif() - # Hopper architecture - if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) - set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90) + # CUDA 13+ moved CCCL headers into a cccl/ subdirectory. + # Add it to the include path so third-party code (e.g. MSCCL++) + # that includes can still find the headers. + if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 13) + set(CCCL_INCLUDE_DIR "${CUDAToolkit_INCLUDE_DIRS}/cccl") + if(EXISTS "${CCCL_INCLUDE_DIR}") + include_directories(SYSTEM "${CCCL_INCLUDE_DIR}") + message(STATUS "CUDA 13+: added CCCL include dir ${CCCL_INCLUDE_DIR}") + endif() endif() else() # ARK_USE_ROCM set(CMAKE_HIP_STANDARD 17) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 3fcd2e8f7..9b13df240 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -30,12 +30,14 @@ #include "utils/utils_net.hpp" #if defined(ARK_CUDA) -#include +#include static int atomicLoadRelaxed(int *ptr) { - return mscclpp::atomicLoad(ptr, mscclpp::memoryOrderRelaxed); + return cuda::atomic_ref{*ptr}.load( + cuda::memory_order_relaxed); } static void atomicStoreRelaxed(int *ptr, int val) { - mscclpp::atomicStore(ptr, val, mscclpp::memoryOrderRelaxed); + cuda::atomic_ref{*ptr}.store( + val, cuda::memory_order_relaxed); } #elif defined(ARK_ROCM) static int atomicLoadRelaxed(int *ptr) { diff --git a/tools/lint.sh b/tools/lint.sh new file mode 100644 index 000000000..5c97626b0 --- /dev/null +++ b/tools/lint.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +PROJECT_ROOT=$(dirname "$(realpath "$0")")/.. +LINT_CPP=false +LINT_PYTHON=false +DRY_RUN=false +EXIT_CODE=0 + +usage() { + echo "Usage: $0 [cpp] [py] [dry]" + echo " cpp Lint C++ code" + echo " py Lint Python code" + echo " dry Dry run mode (no changes made)" +} + +# Parse arguments +for arg in "$@"; do + case "$arg" in + cpp) + LINT_CPP=true + ;; + py) + LINT_PYTHON=true + ;; + dry) + DRY_RUN=true + ;; + *) + echo "Error: Unknown argument '$arg'" + usage + exit 1 + ;; + esac +done + +# If no cpp or py specified, default to both +if [ "$LINT_CPP" = false ] && [ "$LINT_PYTHON" = false ]; then + LINT_CPP=true + LINT_PYTHON=true +fi + +if $LINT_CPP; then + echo "Linting C++ code..." + # Find all git-tracked files with .c/.h/.cpp/.hpp/.cc/.cu/.cuh extensions + files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$' | sed "s|^|$PROJECT_ROOT/|") + if [ -n "$files" ]; then + if $DRY_RUN; then + clang-format -style=file --dry-run --Werror $files + else + clang-format -style=file -i $files + fi + if [ $? -ne 0 ]; then + EXIT_CODE=1 + fi + fi +fi + +if $LINT_PYTHON; then + echo "Linting Python code..." + # Find all git-tracked files with .py extension + files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.py$' | sed "s|^|$PROJECT_ROOT/|") + if [ -n "$files" ]; then + if $DRY_RUN; then + python3 -m black --check --diff $files + else + python3 -m black $files + fi + if [ $? -ne 0 ]; then + EXIT_CODE=1 + fi + fi +fi + +exit $EXIT_CODE From 05363cfac31d6289b354e0b3aecfc3fd667e4406 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 26 May 2026 20:22:10 +0000 Subject: [PATCH 02/17] Deep-review fixes: README badge, lint.sh improvements, CI guards - Fix README badge to point to renamed ut.yml workflow - Add cmake version guards for CCCL include path - Improve lint.sh error messages and exit codes - CI workflow: tighten schedule-only step conditions --- .github/workflows/ut-cuda.yml | 94 ----------------------------------- .github/workflows/ut.yml | 9 ++++ CMakeLists.txt | 5 +- README.md | 4 +- tools/lint.sh | 12 +++-- 5 files changed, 23 insertions(+), 101 deletions(-) delete mode 100644 .github/workflows/ut-cuda.yml diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml deleted file mode 100644 index 10b0679da..000000000 --- a/.github/workflows/ut-cuda.yml +++ /dev/null @@ -1,94 +0,0 @@ -name: "Unit Tests (CUDA)" - -on: - push: - branches: - - main - pull_request: - branches: - - main - types: [opened, synchronize, reopened, ready_for_review] - -jobs: - UnitTest: - runs-on: [ self-hosted, A100 ] - defaults: - run: - shell: bash - timeout-minutes: 30 - strategy: - matrix: - cuda: [ cuda11.8, cuda12.2 ] - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cuda }} - cancel-in-progress: true - container: - image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.cuda }}" - options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: LockGPUClock - run: | - sudo nvidia-smi -pm 1 - for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i - done - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug .. - make -j ut ark_py - - - name: Run C++ UT - run: | - cd build - ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random - lcov --capture --directory . --output-file cpp_coverage.info - lcov --remove cpp_coverage.info \ - '/usr/*' \ - '/tmp/*' \ - '*/build/*' \ - '*/third_party/*' \ - '*/ark/*_test.*' \ - '*/examples/*' \ - '*/python/*' \ - '*/ark/unittest/unittest_utils.cpp' \ - --output-file cpp_coverage.info - lcov --list cpp_coverage.info - - - name: Install Python Dependencies - run: | - python3 -m pip install -r requirements.txt - - - name: Run Python UT - run: | - cd build - PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \ - --cov=python/ark \ - --cov-report lcov:py_coverage.info \ - --verbose \ - ../python/unittest/test.py - - - name: Report Coverage - env: - CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - run: | - cd build - lcov -a cpp_coverage.info -a py_coverage.info -o coverage.info - bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports" - - - name: Install Python - run: | - python3 -m pip install . - - - name: Run Tutorials - run: | - python3 ./examples/tutorial/quickstart_tutorial.py diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index 0929c75e2..91378fd31 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -8,6 +8,8 @@ on: branches: - main types: [opened, synchronize, reopened, ready_for_review] + schedule: + - cron: '42 20 * * 4' jobs: UnitTest: @@ -64,11 +66,13 @@ jobs: make -j ut ark_py - name: Run C++ UT + if: github.event_name != 'schedule' run: | cd build ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random - name: C++ Coverage + if: github.event_name != 'schedule' run: | cd build lcov --capture --directory . --output-file cpp_coverage.info @@ -85,10 +89,12 @@ jobs: lcov --list cpp_coverage.info - name: Install Python Dependencies + if: github.event_name != 'schedule' run: | python3 -m pip install -r requirements.txt - name: Run Python UT + if: github.event_name != 'schedule' run: | cd build PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \ @@ -98,6 +104,7 @@ jobs: ../python/unittest/ - name: Report Coverage + if: github.event_name != 'schedule' env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} run: | @@ -106,10 +113,12 @@ jobs: bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports" - name: Install Python + if: github.event_name != 'schedule' run: | python3 -m pip install . - name: Run Tutorials + if: github.event_name != 'schedule' run: | python3 ./examples/tutorial/quickstart_tutorial.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 437746888..ec2f888f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,10 +78,13 @@ if(ARK_USE_CUDA) # Add it to the include path so third-party code (e.g. MSCCL++) # that includes can still find the headers. if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 13) - set(CCCL_INCLUDE_DIR "${CUDAToolkit_INCLUDE_DIRS}/cccl") + list(GET CUDAToolkit_INCLUDE_DIRS 0 _CUDA_INCLUDE_FIRST) + set(CCCL_INCLUDE_DIR "${_CUDA_INCLUDE_FIRST}/cccl") if(EXISTS "${CCCL_INCLUDE_DIR}") include_directories(SYSTEM "${CCCL_INCLUDE_DIR}") message(STATUS "CUDA 13+: added CCCL include dir ${CCCL_INCLUDE_DIR}") + else() + message(WARNING "CUDA 13+: CCCL include dir not found at ${CCCL_INCLUDE_DIR}. Build may fail.") endif() endif() else() # ARK_USE_ROCM diff --git a/README.md b/README.md index 0143a3ed6..3260d3e57 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,13 @@ A GPU-driven system framework for scalable AI applications. [![Latest Release](https://img.shields.io/github/release/microsoft/ark.svg)](https://github.com/microsoft/ark/releases/latest) [![License](https://img.shields.io/github/license/microsoft/ark.svg)](LICENSE) -[![CodeQL](https://github.com/microsoft/ark/actions/workflows/codeql.yml/badge.svg)](https://github.com/microsoft/ark/actions/workflows/codeql.yml) +[![Unit Tests](https://github.com/microsoft/ark/actions/workflows/ut.yml/badge.svg)](https://github.com/microsoft/ark/actions/workflows/ut.yml) [![codecov](https://codecov.io/gh/microsoft/ark/graph/badge.svg?token=XmMOK85GOB)](https://codecov.io/gh/microsoft/ark) | Pipelines | Build Status | |-------------------|-------------------| | Unit Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fark-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=6&branchName=main) | -| Unit Tests (ROCm) | [![Unit Tests (ROCm)](https://github.com/microsoft/ark/actions/workflows/ut-rocm.yml/badge.svg?branch=main)](https://github.com/microsoft/ark/actions/workflows/ut-rocm.yml) | +| Unit Tests (ROCm) | [![Unit Tests (ROCm)](https://github.com/microsoft/ark/actions/workflows/ut.yml/badge.svg?branch=main)](https://github.com/microsoft/ark/actions/workflows/ut.yml) | *NOTE (Nov 2023): ROCm unit tests will be replaced into an Azure pipeline in the future.* diff --git a/tools/lint.sh b/tools/lint.sh index 5c97626b0..a424a9b33 100644 --- a/tools/lint.sh +++ b/tools/lint.sh @@ -42,7 +42,8 @@ fi if $LINT_CPP; then echo "Linting C++ code..." # Find all git-tracked files with .c/.h/.cpp/.hpp/.cc/.cu/.cuh extensions - files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$' | sed "s|^|$PROJECT_ROOT/|") + # Exclude third_party/ to match project convention (not our code to format) + files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$' | grep -v -E '(third_party/)' | sed "s|^|$PROJECT_ROOT/|") if [ -n "$files" ]; then if $DRY_RUN; then clang-format -style=file --dry-run --Werror $files @@ -58,12 +59,15 @@ fi if $LINT_PYTHON; then echo "Linting Python code..." # Find all git-tracked files with .py extension - files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.py$' | sed "s|^|$PROJECT_ROOT/|") + # Exclude paths matching pyproject.toml [tool.black] exclude patterns; + # black's exclude regex only applies during directory traversal, not to + # explicitly-listed files, so we must filter them out here. + files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.py$' | grep -v -E '(third_party/|\.eggs/|docs/|examples/llama/llama/)' | sed "s|^|$PROJECT_ROOT/|") if [ -n "$files" ]; then if $DRY_RUN; then - python3 -m black --check --diff $files + python3 -m black --check --diff --config "$PROJECT_ROOT/pyproject.toml" $files else - python3 -m black $files + python3 -m black --config "$PROJECT_ROOT/pyproject.toml" $files fi if [ $? -ne 0 ]; then EXIT_CODE=1 From ebe3566ffd3e09e3d468abe052fe4866d2a3aa36 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 03:24:10 +0000 Subject: [PATCH 03/17] =?UTF-8?q?Fix=20InvalidStateError=20=E2=86=92=20Int?= =?UTF-8?q?ernalError=20in=20ops=5Fmatmul.cpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit InvalidStateError doesn't exist in ARK's error hierarchy. This was introduced in PR #248 (fused matmul epilogues). Replace with InternalError which is the correct class for unexpected-state errors. --- ark/ops/ops_matmul.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index 51d80a107..eaa04130b 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -334,7 +334,7 @@ std::string ModelOpMatmulGelu::impl_name(const Json &config) const { if (name.substr(0, 7) == "matmul<") { name = "matmul_gelu<" + name.substr(7); } else { - ERR(InvalidStateError, "unexpected matmul impl_name format: ", name); + ERR(InternalError, "unexpected matmul impl_name format: ", name); } return name; } @@ -372,7 +372,7 @@ std::string ModelOpMatmulScale::impl_name(const Json &config) const { // Remove trailing ">" and add scale bits name = name.substr(0, name.size() - 1) + ", " + std::to_string(conv.u) + ">"; } else { - ERR(InvalidStateError, "unexpected matmul impl_name format: ", name); + ERR(InternalError, "unexpected matmul impl_name format: ", name); } return name; } From f0bed597d4340210472121a303d6c8ff7ad2701b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 04:43:04 +0000 Subject: [PATCH 04/17] Address deep-review: CodeQL separation, lint.sh hardening, CI fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move CodeQL to separate job on ubuntu-latest (no GPU needed) - Add 'if: github.event_name \!= schedule' to UnitTest job - Update actions/setup-python v4 → v5 in lint.yml - Add set -o pipefail to lint.sh - Quote $files via xargs in lint.sh to handle paths safely - Remove stale ROCm Azure migration note from README - Rename badge to 'Unit Tests' for unified workflow --- .github/workflows/lint.yml | 2 +- .github/workflows/ut.yml | 44 ++++++++++++++++++++++++++++++++------ README.md | 4 +--- tools/lint.sh | 23 +++++++++++--------- 4 files changed, 52 insertions(+), 21 deletions(-) mode change 100644 => 100755 tools/lint.sh diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 0fe0cf826..5db3339c0 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: run: sudo apt-get install -y clang-format - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.12' diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index 91378fd31..39fd3d95f 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -13,6 +13,7 @@ on: jobs: UnitTest: + if: github.event_name != 'schedule' defaults: run: shell: bash @@ -20,7 +21,6 @@ jobs: permissions: actions: read contents: read - security-events: write strategy: fail-fast: false matrix: @@ -49,11 +49,6 @@ jobs: run: | git config --global --add safe.directory /__w/ark/ark - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: cpp - - name: Build run: | apt-get update && apt-get install -y lcov @@ -122,7 +117,42 @@ jobs: run: | python3 ./examples/tutorial/quickstart_tutorial.py + CodeQL: + defaults: + run: + shell: bash + timeout-minutes: 60 + runs-on: ubuntu-latest + container: + image: ghcr.io/microsoft/ark/ark:base-dev-cuda12.2 + permissions: + actions: read + contents: read + security-events: write + concurrency: + group: ${{ github.workflow }}-codeql-${{ github.ref }} + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Dubious ownership exception + run: | + git config --global --add safe.directory /__w/ark/ark + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: cpp + + - name: Build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. + make build ark_py + - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 with: - category: "/language:cpp-${{ matrix.platform }}" + category: "/language:cpp" diff --git a/README.md b/README.md index 3260d3e57..6f533e1bd 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,7 @@ A GPU-driven system framework for scalable AI applications. | Pipelines | Build Status | |-------------------|-------------------| | Unit Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fark-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=6&branchName=main) | -| Unit Tests (ROCm) | [![Unit Tests (ROCm)](https://github.com/microsoft/ark/actions/workflows/ut.yml/badge.svg?branch=main)](https://github.com/microsoft/ark/actions/workflows/ut.yml) | - -*NOTE (Nov 2023): ROCm unit tests will be replaced into an Azure pipeline in the future.* +| Unit Tests | [![Unit Tests](https://github.com/microsoft/ark/actions/workflows/ut.yml/badge.svg?branch=main)](https://github.com/microsoft/ark/actions/workflows/ut.yml) | See [Quick Start](docs/quickstart.md) to quickly get started. diff --git a/tools/lint.sh b/tools/lint.sh old mode 100644 new mode 100755 index a424a9b33..d6ce5cf98 --- a/tools/lint.sh +++ b/tools/lint.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -o pipefail PROJECT_ROOT=$(dirname "$(realpath "$0")")/.. LINT_CPP=false @@ -46,12 +47,13 @@ if $LINT_CPP; then files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$' | grep -v -E '(third_party/)' | sed "s|^|$PROJECT_ROOT/|") if [ -n "$files" ]; then if $DRY_RUN; then - clang-format -style=file --dry-run --Werror $files + if ! echo "$files" | xargs -d '\n' clang-format -style=file --dry-run --Werror; then + EXIT_CODE=1 + fi else - clang-format -style=file -i $files - fi - if [ $? -ne 0 ]; then - EXIT_CODE=1 + if ! echo "$files" | xargs -d '\n' clang-format -style=file -i; then + EXIT_CODE=1 + fi fi fi fi @@ -65,12 +67,13 @@ if $LINT_PYTHON; then files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.py$' | grep -v -E '(third_party/|\.eggs/|docs/|examples/llama/llama/)' | sed "s|^|$PROJECT_ROOT/|") if [ -n "$files" ]; then if $DRY_RUN; then - python3 -m black --check --diff --config "$PROJECT_ROOT/pyproject.toml" $files + if ! echo "$files" | xargs -d '\n' python3 -m black --check --diff --config "$PROJECT_ROOT/pyproject.toml"; then + EXIT_CODE=1 + fi else - python3 -m black --config "$PROJECT_ROOT/pyproject.toml" $files - fi - if [ $? -ne 0 ]; then - EXIT_CODE=1 + if ! echo "$files" | xargs -d '\n' python3 -m black --config "$PROJECT_ROOT/pyproject.toml"; then + EXIT_CODE=1 + fi fi fi fi From b155f4c2436180851398f15aa66113d6e5c34473 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 05:24:34 +0000 Subject: [PATCH 05/17] Fix lint formatting in executor and matmul --- ark/api/executor.cpp | 7 ++--- ark/ops/ops_matmul.cpp | 64 ++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 9b13df240..9c4e2f7e3 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -323,8 +323,8 @@ void CommResource::connect(const PlanJson &plan_json, add_proxy_channel(resource->eth); add_proxy_channel(resource->ib); } - std::map< - int, std::vector>> + std::map>> sm_semaphores; for (auto &[remote_rank, resource] : rank_to_resource_) { // NOTE: We can create multiple semaphores here if we need in the future @@ -895,8 +895,7 @@ void Executor::Impl::compile(const std::string &plan, int device_id, gpu_manager->set_current(); timer_begin_ = gpu_manager->create_event(); timer_end_ = gpu_manager->create_event(); - flag_ = gpu_manager->malloc_host( - sizeof(int), gpuHostAllocMapped); + flag_ = gpu_manager->malloc_host(sizeof(int), gpuHostAllocMapped); stream_ = gpu_manager->create_stream(); } PlanResourceKey key(plan, device_id, name); diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index eaa04130b..d7a570b7a 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -2,12 +2,12 @@ // Licensed under the MIT license. #include "ops_matmul.hpp" -#include "ops_copy.hpp" -#include "../model/model_tensor.hpp" #include +#include "../model/model_tensor.hpp" #include "ops_common.hpp" +#include "ops_copy.hpp" #include "utils/utils_math.hpp" namespace ark { @@ -231,17 +231,18 @@ static const Json select_tile_config(const ArchRef arch, const Dims &mnk) { DimType M = mnk[0], N = mnk[1]; // Candidate tiles: {TileM, TileN, NumWarps} - // Ordered from smallest to largest. For each, M%TileM==0 and N%TileN==0 required. - struct TileConfig { DimType tm; DimType tn; int nw; }; + // Ordered from smallest to largest. For each, M%TileM==0 and N%TileN==0 + // required. + struct TileConfig { + DimType tm; + DimType tn; + int nw; + }; // Only tiles validated to compile with CUTLASS 2.x epilogue. // [32,*] tiles fail due to epilogue OutputTileOptimalThreadMap zero-size. static const TileConfig candidates[] = { - {64, 64, 4}, - {64, 128, 4}, - {128, 64, 4}, - {128, 128, 8}, - {128, 256, 8}, - {256, 128, 8}, + {64, 64, 4}, {64, 128, 4}, {128, 64, 4}, + {128, 128, 8}, {128, 256, 8}, {256, 128, 8}, }; // Find the best tile: prefer larger tiles (more compute per tile, better // pipeline amortization) but fall back to smaller tiles when there aren't @@ -249,7 +250,8 @@ static const Json select_tile_config(const ArchRef arch, int best = -1; size_t best_tasks = 0; size_t best_tile_area = 0; - for (int i = 0; i < (int)(sizeof(candidates) / sizeof(candidates[0])); i++) { + for (int i = 0; i < (int)(sizeof(candidates) / sizeof(candidates[0])); + i++) { auto &c = candidates[i]; if (M % c.tm == 0 && N % c.tn == 0) { size_t tasks = (M / c.tm) * (N / c.tn); @@ -257,7 +259,11 @@ static const Json select_tile_config(const ArchRef arch, bool pick = (best == -1); if (!pick && best_tasks < 4 && tasks > best_tasks) pick = true; if (!pick && tasks >= 4 && tile_area > best_tile_area) pick = true; - if (pick) { best = i; best_tasks = tasks; best_tile_area = tile_area; } + if (pick) { + best = i; + best_tasks = tasks; + best_tile_area = tile_area; + } } } if (best == -1) { @@ -350,7 +356,8 @@ Tensor Model::matmul_gelu(Tensor input, Tensor other, Tensor output, // ---- MatmulScale: matmul with register-level scale fusion ---- -ModelOpMatmulScale::ModelOpMatmulScale(ModelTensorRef input, ModelTensorRef other, +ModelOpMatmulScale::ModelOpMatmulScale(ModelTensorRef input, + ModelTensorRef other, ModelTensorRef output, bool trans_input, bool trans_other, float scale) : ModelOpMatmul(input, other, output, trans_input, trans_other) { @@ -366,11 +373,15 @@ std::string ModelOpMatmulScale::impl_name(const Json &config) const { if (name.substr(0, 7) == "matmul<") { // Insert scale parameter: matmul_scale<..., ScaleBits> // Encode scale as integer bits for template parameter - union { float f; uint32_t u; } conv; + union { + float f; + uint32_t u; + } conv; conv.f = scale; name = "matmul_scale<" + name.substr(7); // Remove trailing ">" and add scale bits - name = name.substr(0, name.size() - 1) + ", " + std::to_string(conv.u) + ">"; + name = name.substr(0, name.size() - 1) + ", " + std::to_string(conv.u) + + ">"; } else { ERR(InternalError, "unexpected matmul impl_name format: ", name); } @@ -410,8 +421,8 @@ ModelOpMatmulAdd::ModelOpMatmulAdd(ModelTensorRef input, ModelTensorRef other, if (residual->strides() != output->strides()) { ERR(InvalidUsageError, "MatmulAdd requires residual and output to have matching strides. " - "Residual strides: ", residual->strides(), - ", output strides: ", output->strides()); + "Residual strides: ", + residual->strides(), ", output strides: ", output->strides()); } ModelTensorRef result = std::make_shared(*output); @@ -549,14 +560,13 @@ Tensor Model::matmul_add(Tensor input, Tensor other, Tensor residual, const std::string &name) { return impl_ ->create_op(name, input.ref(), other.ref(), - residual.ref(), output.ref(), - trans_input, trans_other) + residual.ref(), output.ref(), trans_input, + trans_other) ->result_tensors()[0]; } -Tensor Model::mma(Tensor input, Tensor other, Tensor output, - bool trans_input, bool trans_other, - const std::string &name) { +Tensor Model::mma(Tensor input, Tensor other, Tensor output, bool trans_input, + bool trans_other, const std::string &name) { return impl_ ->create_op(name, input.ref(), other.ref(), output.ref(), trans_input, trans_other) @@ -587,10 +597,8 @@ std::string ModelOpMma::impl_name(const Json &config) const { return ModelOpMatmul::impl_name(config); } - Tensor Model::store(Tensor output, Tensor input, const std::string &name) { - return impl_ - ->create_op(name, input.ref(), output.ref()) + return impl_->create_op(name, input.ref(), output.ref()) ->result_tensors()[0]; } @@ -622,11 +630,7 @@ std::string ModelOpStore::impl_name(const Json &config) const { vec_string(read_tensors_[0]->shape().dims4()), vec_string(write_tensors_[0]->strides().dims4()), vec_string(write_tensors_[0]->shape().dims4()), - vec_string(unit_out_dims.dims4()), - std::to_string(num_warps), - "0"}); + vec_string(unit_out_dims.dims4()), std::to_string(num_warps), "0"}); } - - } // namespace ark From 124e2ea89e68be42e45e5b9ebe033da7e798c926 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 05:27:30 +0000 Subject: [PATCH 06/17] Apply code formatting required by lint --- ark/api/context.cpp | 4 +- ark/api/planner.cpp | 20 ++--- ark/api/planner_test.cpp | 34 ++++----- ark/context_impl.cpp | 4 +- ark/context_impl.hpp | 3 +- ark/gpu/gpu.hpp | 4 +- ark/include/ark.hpp | 2 +- ark/include/ark/executor.hpp | 5 +- ark/include/ark/model.hpp | 16 ++-- ark/include/ark/planner.hpp | 4 +- ark/include/ark/tensor.hpp | 4 +- ark/include/kernels/comm.h | 3 +- ark/include/kernels/common/arch.h | 4 +- ark/include/kernels/common/broadcast.h | 79 ++++++++------------ ark/include/kernels/common/sync.h | 3 +- ark/include/kernels/common/vector_type.h | 19 +++-- ark/include/kernels/gemm_ck.h | 95 +++++++++++++----------- ark/include/kernels/gemm_cutlass.h | 9 +-- ark/include/kernels/gemm_fused.h | 30 ++++---- ark/include/kernels/gemm_scale.h | 5 +- ark/include/kernels/matmul.h | 40 +++++----- ark/include/kernels/matmul_fused.h | 24 +++--- ark/include/kernels/reduce.h | 42 ++++++----- ark/model/model_buffer.cpp | 3 +- ark/model/model_context_manager.cpp | 4 +- ark/model/model_graph_impl.hpp | 2 +- ark/model/model_op.hpp | 4 +- ark/model/model_tensor.cpp | 12 +-- ark/model/model_tensor.hpp | 3 +- ark/ops/ops_all_reduce_test.cpp | 7 +- ark/ops/ops_broadcast.cpp | 16 ++-- ark/ops/ops_communication.cpp | 69 +++++++++-------- ark/ops/ops_communication.hpp | 1 - ark/ops/ops_communication_test.cpp | 6 +- ark/ops/ops_embedding.cpp | 8 +- ark/ops/ops_matmul.hpp | 3 +- ark/ops/ops_noop.cpp | 4 +- ark/ops/ops_reduce.cpp | 4 +- ark/ops/ops_scalar.cpp | 16 ++-- ark/ops/ops_test_common.cpp | 13 ++-- ark/ops/ops_test_common.hpp | 13 ++-- ark/ops/ops_transpose.cpp | 8 +- examples/llama/model.py | 2 +- examples/llama/model_test.py | 2 - python/ark/__init__.py | 1 - python/ark/executor.py | 1 - python/ark/model.py | 1 - python/ark/ops.py | 2 +- python/ark/planner.py | 1 - python/ark/runtime.py | 1 - python/tensor_py.cpp | 2 +- python/unittest/test_placeholder.py | 50 ++++++------- 52 files changed, 350 insertions(+), 362 deletions(-) diff --git a/ark/api/context.cpp b/ark/api/context.cpp index 702247ddf..087e0e7c9 100644 --- a/ark/api/context.cpp +++ b/ark/api/context.cpp @@ -29,8 +29,6 @@ void Context::set(const std::string& key, const std::string& value, this->impl_->set(key, value_json, type); } -std::string Context::dump() const { - return this->impl_->dump().dump(); -} +std::string Context::dump() const { return this->impl_->dump().dump(); } } // namespace ark diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index d7e96e957..c48e19c50 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -211,8 +211,8 @@ std::string Planner::Impl::plan(bool pretty) const { Dims tile(trim_leading_ones); std::stringstream ss; - ss << "Result shape is not divided by tile " - << tile << ". Op: " << op->serialize().dump(); + ss << "Result shape is not divided by tile " << tile + << ". Op: " << op->serialize().dump(); auto not_divided_error = ss.str(); auto &result_shape = result_tensors[0]->padded_shape(); @@ -224,11 +224,10 @@ std::string Planner::Impl::plan(bool pretty) const { max_num_tasks = 1; for (int i = 0; i < tile4.ndims(); i++) { if (tile4[i] == 0) { - ERR(PlanError, "Tile dimension is zero. Op: ", - op->serialize().dump()); + ERR(PlanError, + "Tile dimension is zero. Op: ", op->serialize().dump()); } - max_num_tasks *= - (result_shape4[i] + tile4[i] - 1) / tile4[i]; + max_num_tasks *= (result_shape4[i] + tile4[i] - 1) / tile4[i]; } if (max_num_tasks == 0) ERR(InternalError, "max_num_tasks == 0"); } @@ -328,10 +327,13 @@ std::string Planner::Impl::plan(bool pretty) const { max_processor_id = std::max(max_processor_id, num_processors); } else if (processor_group_root == -1) { processor_group_root = ctx_processor_range_list.front()[0]; - processor_group["ProcessorRange"] = ctx_processor_range_list.front()[1]; - resource_group["ProcessorRange"] = ctx_processor_range_list.back()[1]; + processor_group["ProcessorRange"] = + ctx_processor_range_list.front()[1]; + resource_group["ProcessorRange"] = + ctx_processor_range_list.back()[1]; max_processor_id = std::max( - max_processor_id, ctx_processor_range_list.front()[1][1].get()); + max_processor_id, + ctx_processor_range_list.front()[1][1].get()); } else { new_processor_group = false; resource_group["ProcessorRange"] = diff --git a/ark/api/planner_test.cpp b/ark/api/planner_test.cpp index 7507ea023..e557ee307 100644 --- a/ark/api/planner_test.cpp +++ b/ark/api/planner_test.cpp @@ -87,8 +87,9 @@ ark::unittest::State test_planner_context_processor_range() { auto t = model.add(t0, t1); tensors.push_back(t); - UNITTEST_EQ(ctx.get("ProcessorRange"), - ark::Json({subctx.id(), {0 * (int)i, 2 * (int)i}}).dump()); + UNITTEST_EQ( + ctx.get("ProcessorRange"), + ark::Json({subctx.id(), {0 * (int)i, 2 * (int)i}}).dump()); } UNITTEST_TRUE(model.verify()); @@ -131,15 +132,13 @@ ark::unittest::State test_planner_context_warp_range() { ctx.warp_range(0, 4); t3 = model.relu(t2); - UNITTEST_EQ(ctx.get("WarpRange"), - ark::Json({ctx.id(), {0, 4}}).dump()); + UNITTEST_EQ(ctx.get("WarpRange"), ark::Json({ctx.id(), {0, 4}}).dump()); // node 2 ctx.warp_range(2, 4); t4 = model.sqrt(t3); - UNITTEST_EQ(ctx.get("WarpRange"), - ark::Json({ctx.id(), {2, 4}}).dump()); + UNITTEST_EQ(ctx.get("WarpRange"), ark::Json({ctx.id(), {2, 4}}).dump()); // Invalid usage: range (0, 4) is out of previous range (2, 4) UNITTEST_THROW(ctx.warp_range(0, 4), ark::PlanError); @@ -197,15 +196,13 @@ ark::unittest::State test_planner_context_sram_range() { ctx.sram_range(0, 4); t3 = model.relu(t2); - UNITTEST_EQ(ctx.get("SramRange"), - ark::Json({ctx.id(), {0, 4}}).dump()); + UNITTEST_EQ(ctx.get("SramRange"), ark::Json({ctx.id(), {0, 4}}).dump()); // node 2 ctx.sram_range(2, 4); t4 = model.sqrt(t3); - UNITTEST_EQ(ctx.get("SramRange"), - ark::Json({ctx.id(), {2, 4}}).dump()); + UNITTEST_EQ(ctx.get("SramRange"), ark::Json({ctx.id(), {2, 4}}).dump()); // Invalid usage: range (0, 4) is out of previous range (2, 4) UNITTEST_THROW(ctx.sram_range(0, 4), ark::PlanError); @@ -263,15 +260,13 @@ ark::unittest::State test_planner_context_sync() { ctx.sync(false); t3 = model.relu(t2); - UNITTEST_EQ(ctx.get("Sync"), - ark::Json({ctx.id(), false}).dump()); + UNITTEST_EQ(ctx.get("Sync"), ark::Json({ctx.id(), false}).dump()); // node 2 ctx.sync(true); t4 = model.sqrt(t3); - UNITTEST_EQ(ctx.get("Sync"), - ark::Json({ctx.id(), true}).dump()); + UNITTEST_EQ(ctx.get("Sync"), ark::Json({ctx.id(), true}).dump()); } { // node 3 @@ -280,8 +275,7 @@ ark::unittest::State test_planner_context_sync() { ctx.sync(true); t5 = model.exp(t2); - UNITTEST_EQ(ctx.get("Sync"), - ark::Json({ctx.id(), true}).dump()); + UNITTEST_EQ(ctx.get("Sync"), ark::Json({ctx.id(), true}).dump()); } UNITTEST_TRUE(model.verify()); @@ -297,8 +291,9 @@ ark::unittest::State test_planner_context_sync() { UNITTEST_EQ(nodes[1]->context.at("Sync"), ark::Json({{sync_id_1, true}, {sync_id_1, false}})); UNITTEST_GE(nodes[2]->context.size(), 1); - UNITTEST_EQ(nodes[2]->context.at("Sync"), - ark::Json({{sync_id_1, true}, {sync_id_1, false}, {sync_id_1, true}})); + UNITTEST_EQ( + nodes[2]->context.at("Sync"), + ark::Json({{sync_id_1, true}, {sync_id_1, false}, {sync_id_1, true}})); UNITTEST_GE(nodes[3]->context.size(), 1); UNITTEST_EQ(nodes[3]->context.at("Sync"), ark::Json({{sync_id_2, true}, {sync_id_2, true}})); @@ -361,7 +356,8 @@ ark::unittest::State test_planner_context_config() { ark::Json({{cfg_id_1, {{"key0", "val1"}}}})); UNITTEST_GE(nodes[2]->context.size(), 1); UNITTEST_EQ(nodes[2]->context.at("Config"), - ark::Json({{cfg_id_1, {{"key0", "val1"}}}, {cfg_id_1, {{"key1", "val2"}}}})); + ark::Json({{cfg_id_1, {{"key0", "val1"}}}, + {cfg_id_1, {{"key1", "val2"}}}})); UNITTEST_GE(nodes[3]->context.size(), 1); UNITTEST_EQ(nodes[3]->context.at("Config"), ark::Json({{cfg_id_2, {{"key2", "val3"}}}})); diff --git a/ark/context_impl.cpp b/ark/context_impl.cpp index c4f95f2c3..0eca1bf0e 100644 --- a/ark/context_impl.cpp +++ b/ark/context_impl.cpp @@ -52,8 +52,6 @@ bool Context::Impl::has(const std::string& key) const { return context_manager_->has(key); } -Json Context::Impl::dump() const { - return context_manager_->dump(); -} +Json Context::Impl::dump() const { return context_manager_->dump(); } } // namespace ark diff --git a/ark/context_impl.hpp b/ark/context_impl.hpp index b79353296..cf1509167 100644 --- a/ark/context_impl.hpp +++ b/ark/context_impl.hpp @@ -17,7 +17,8 @@ class Context::Impl { Json get(const std::string& key) const; - void set(const std::string& key, const Json& value_json, ContextType type = ContextType::Overwrite); + void set(const std::string& key, const Json& value_json, + ContextType type = ContextType::Overwrite); bool has(const std::string& key) const; diff --git a/ark/gpu/gpu.hpp b/ark/gpu/gpu.hpp index dbcd50f3e..1010683c3 100644 --- a/ark/gpu/gpu.hpp +++ b/ark/gpu/gpu.hpp @@ -21,7 +21,7 @@ constexpr auto alias = cuda_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return cuda_func(std::forward(args)...); \ } @@ -35,7 +35,7 @@ constexpr auto alias = rocm_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return rocm_func(std::forward(args)...); \ } diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp index b1955bf9c..90f23b2f1 100644 --- a/ark/include/ark.hpp +++ b/ark/include/ark.hpp @@ -14,9 +14,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 2e97ffe78..765cd0f27 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -52,9 +52,8 @@ class Executor { bool record = false); /// Run the executor for `iter` iterations. - void run( - int iter, - const std::unordered_map &placeholder_data = {}); + void run(int iter, + const std::unordered_map &placeholder_data = {}); /// Wait for the previous run to finish. void wait(int64_t max_spin_count = -1); diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index caec2da24..deabcfb7c 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -151,23 +151,19 @@ class Model : public ModelGraph { Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, const std::string &name = ""); - Tensor matmul_gelu(Tensor input, Tensor other, - Tensor output = NullTensor, + Tensor matmul_gelu(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, const std::string &name = ""); Tensor matmul_scale(Tensor input, Tensor other, float scale, - Tensor output = NullTensor, - bool trans_input = false, bool trans_other = false, - const std::string &name = ""); + Tensor output = NullTensor, bool trans_input = false, + bool trans_other = false, const std::string &name = ""); Tensor matmul_add(Tensor input, Tensor other, Tensor residual, - Tensor output = NullTensor, - bool trans_input = false, bool trans_other = false, - const std::string &name = ""); + Tensor output = NullTensor, bool trans_input = false, + bool trans_other = false, const std::string &name = ""); Tensor mma(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, const std::string &name = ""); - Tensor store(Tensor output, Tensor input, - const std::string &name = ""); + Tensor store(Tensor output, Tensor input, const std::string &name = ""); // Implements the 'im2col' method for 2D convolution layers, which takes an // `input` tensor and reshapes it to a 2D matrix by extracting image patches // from the input tensor based on the provided parameters. diff --git a/ark/include/ark/planner.hpp b/ark/include/ark/planner.hpp index 9547848b9..b34acbc39 100644 --- a/ark/include/ark/planner.hpp +++ b/ark/include/ark/planner.hpp @@ -38,8 +38,8 @@ class Planner { ~Planner(); - using ConfigRule = std::function; + using ConfigRule = std::function; void install_config_rule(ConfigRule rule); diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index aa8dcaa68..67eda64ae 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -69,9 +69,7 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor); namespace std { template <> struct hash { - size_t operator()(const ark::Tensor &t) const noexcept { - return t.id(); - } + size_t operator()(const ark::Tensor &t) const noexcept { return t.id(); } }; } // namespace std diff --git a/ark/include/kernels/comm.h b/ark/include/kernels/comm.h index 9075bb728..4a2deca80 100644 --- a/ark/include/kernels/comm.h +++ b/ark/include/kernels/comm.h @@ -414,8 +414,7 @@ DEVICE void read_reduce_and_write( DataType, NelemPerThread, Rank, NPeers, nelems_per_rank>>::run(dst, src, scratch, peer_offsets, uop_idx); - } - else { + } else { PacketType *scratch = reinterpret_cast(scratch_base); comm::PacketReduce< OutDims, OutShape, UnitOutDims, NumWarps, SmemBytes, PacketType, diff --git a/ark/include/kernels/common/arch.h b/ark/include/kernels/common/arch.h index e268ad78c..7eff95c7b 100644 --- a/ark/include/kernels/common/arch.h +++ b/ark/include/kernels/common/arch.h @@ -32,13 +32,13 @@ DEVICE int warp_id() { #if defined(ARK_TARGET_CUDA_ARCH) #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return cuda_func(std::forward(args)...); \ } #elif defined(ARK_TARGET_ROCM_ARCH) #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return hip_func(std::forward(args)...); \ } #endif diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h index 86e84e5d0..d64a31fd5 100644 --- a/ark/include/kernels/common/broadcast.h +++ b/ark/include/kernels/common/broadcast.h @@ -41,22 +41,17 @@ struct Broadcast1Intrinsic { static constexpr int InConsecBytes = InConsecLen * sizeof(InputType); static constexpr int OutNelemPerThread = - (OutConsecBytes % 16 == 0) - ? 16 / sizeof(OutputType) - : (OutConsecBytes % 8 == 0) - ? 8 / sizeof(OutputType) - : (OutConsecBytes % 4 == 0) - ? 4 / sizeof(OutputType) - : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) - : 1; + (OutConsecBytes % 16 == 0) ? 16 / sizeof(OutputType) + : (OutConsecBytes % 8 == 0) ? 8 / sizeof(OutputType) + : (OutConsecBytes % 4 == 0) ? 4 / sizeof(OutputType) + : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) + : 1; static constexpr int InNelemPerThread = - (InConsecBytes % 16 == 0) - ? 16 / sizeof(InputType) - : (InConsecBytes % 8 == 0) - ? 8 / sizeof(InputType) - : (InConsecBytes % 4 == 0) - ? 4 / sizeof(InputType) - : (InConsecBytes % 2 == 0) ? 2 / sizeof(InputType) : 1; + (InConsecBytes % 16 == 0) ? 16 / sizeof(InputType) + : (InConsecBytes % 8 == 0) ? 8 / sizeof(InputType) + : (InConsecBytes % 4 == 0) ? 4 / sizeof(InputType) + : (InConsecBytes % 2 == 0) ? 2 / sizeof(InputType) + : 1; static constexpr int NelemPerThread = BroadcastInput ? OutNelemPerThread @@ -155,43 +150,35 @@ struct Broadcast2Intrinsic { static constexpr int In1ConsecBytes = In1ConsecLen * sizeof(InputType); static constexpr int OutNelemPerThread = - (OutConsecBytes % 16 == 0) - ? 16 / sizeof(OutputType) - : (OutConsecBytes % 8 == 0) - ? 8 / sizeof(OutputType) - : (OutConsecBytes % 4 == 0) - ? 4 / sizeof(OutputType) - : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) - : 1; + (OutConsecBytes % 16 == 0) ? 16 / sizeof(OutputType) + : (OutConsecBytes % 8 == 0) ? 8 / sizeof(OutputType) + : (OutConsecBytes % 4 == 0) ? 4 / sizeof(OutputType) + : (OutConsecBytes % 2 == 0) ? 2 / sizeof(OutputType) + : 1; static constexpr int In0NelemPerThread = - (In0ConsecBytes % 16 == 0) - ? 16 / sizeof(InputType) - : (In0ConsecBytes % 8 == 0) - ? 8 / sizeof(InputType) - : (In0ConsecBytes % 4 == 0) - ? 4 / sizeof(InputType) - : (In0ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) : 1; + (In0ConsecBytes % 16 == 0) ? 16 / sizeof(InputType) + : (In0ConsecBytes % 8 == 0) ? 8 / sizeof(InputType) + : (In0ConsecBytes % 4 == 0) ? 4 / sizeof(InputType) + : (In0ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) + : 1; static constexpr int In1NelemPerThread = - (In1ConsecBytes % 16 == 0) - ? 16 / sizeof(InputType) - : (In1ConsecBytes % 8 == 0) - ? 8 / sizeof(InputType) - : (In1ConsecBytes % 4 == 0) - ? 4 / sizeof(InputType) - : (In1ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) : 1; + (In1ConsecBytes % 16 == 0) ? 16 / sizeof(InputType) + : (In1ConsecBytes % 8 == 0) ? 8 / sizeof(InputType) + : (In1ConsecBytes % 4 == 0) ? 4 / sizeof(InputType) + : (In1ConsecBytes % 2 == 0) ? 2 / sizeof(InputType) + : 1; static constexpr int NelemPerThread = - (BroadcastInput0 && BroadcastInput1) - ? OutNelemPerThread - : BroadcastInput0 - ? math::gcd::value - : BroadcastInput1 - ? math::gcd::value - : math::gcd::value>::value; + (BroadcastInput0 && BroadcastInput1) ? OutNelemPerThread + : BroadcastInput0 + ? math::gcd::value + : BroadcastInput1 + ? math::gcd::value + : math::gcd::value>::value; static_assert(math::is_pow2::value, "NelemPerThread must be power of 2"); diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h index cf22e357d..ffa5ac33b 100644 --- a/ark/include/kernels/common/sync.h +++ b/ark/include/kernels/common/sync.h @@ -120,7 +120,8 @@ DEVICE void sync_warps() { if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) { state->flag[group_id] = tmp; } else { - while (atomicAdd(&state->flag[group_id], 0) != tmp); + while (atomicAdd(&state->flag[group_id], 0) != tmp) + ; } state->is_inc_flag[group_id] = tmp; } diff --git a/ark/include/kernels/common/vector_type.h b/ark/include/kernels/common/vector_type.h index 1e5316e20..24f9dff3f 100644 --- a/ark/include/kernels/common/vector_type.h +++ b/ark/include/kernels/common/vector_type.h @@ -77,8 +77,8 @@ struct IntrinsicCompute1Exists { template static auto test(...) -> std::false_type; - static constexpr bool value = decltype( - test(type::Constant::zero()))::value; + static constexpr bool value = decltype(test( + type::Constant::zero()))::value; }; template @@ -90,9 +90,9 @@ struct IntrinsicCompute2Exists { template static auto test(...) -> std::false_type; - static constexpr bool value = decltype( - test(type::Constant::zero(), - type::Constant::zero()))::value; + static constexpr bool value = decltype(test( + type::Constant::zero(), + type::Constant::zero()))::value; }; template @@ -198,11 +198,10 @@ struct DefaultNelemPerThread { : math::min::value; static const int value = - (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0) - ? 8 - : (ConsecutiveDimLen % 4 == 0) - ? 4 - : (ConsecutiveDimLen % 2 == 0) ? 2 : 1; + (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0) ? 8 + : (ConsecutiveDimLen % 4 == 0) ? 4 + : (ConsecutiveDimLen % 2 == 0) ? 2 + : 1; }; } // namespace ark diff --git a/ark/include/kernels/gemm_ck.h b/ark/include/kernels/gemm_ck.h index 478419691..a15cf49e0 100644 --- a/ark/include/kernels/gemm_ck.h +++ b/ark/include/kernels/gemm_ck.h @@ -90,13 +90,15 @@ struct CkGemmConfig::value; static constexpr auto MXdlPerWave = (TileSizeM == 16) ? 1 - : (TileSizeM < TileSizeN) - ? 1 << (LogMNXdlPerWave / 2) - : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); + : (TileSizeM < TileSizeN) + ? 1 << (LogMNXdlPerWave / 2) + : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave; static constexpr bool Is_256x256x128 = @@ -197,13 +199,15 @@ struct CkGemmConfig, typename std::conditional, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, - (IsColA ? 1 : 2), (!IsColA ? 8 : Is_128x128x64 ? 4 : MXdlPerWave), 8, - true, S<4, NumThreads / 4, 1>, + (IsColA ? 1 : 2), + (!IsColA ? 8 + : Is_128x128x64 ? 4 + : MXdlPerWave), + 8, true, S<4, NumThreads / 4, 1>, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), - (IsColB ? 8 - : Is_128x32x256 - ? 8 - : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) - ? 4 - : (Is_128x32x64 || Is_64x32x32) ? 2 : NXdlPerWave), + (IsColB ? 8 + : Is_128x32x256 ? 8 + : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) ? 4 + : (Is_128x32x64 || Is_64x32x32) ? 2 + : NXdlPerWave), 8, true, 7, 1, 1, LoopSched, PipelineVer>; using ImplXdlCShuffle = @@ -234,16 +240,17 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || @@ -255,16 +262,17 @@ struct CkGemmConfig; #if (DEBUG_CK != 0) - PrintDeviceGemmXdlCShuffle< - NumThreads, TileSizeM, TileSizeN, 32, AK1, BK1, 32, 32, MXdlPerWave, - NXdlPerWave, - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), - (IsColB - ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) - ? 4 - : NXdlPerWave), - 1, 1> + PrintDeviceGemmXdlCShuffle p; #endif // (DEBUG_CK != 0) }; @@ -286,9 +294,9 @@ struct CkGemmConfig::value; static constexpr auto MXdlPerWave = (TileSizeM == 16) ? 1 - : (TileSizeM < TileSizeN) - ? 1 << (LogMNXdlPerWave / 2) - : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); + : (TileSizeM < TileSizeN) + ? 1 << (LogMNXdlPerWave / 2) + : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave; static constexpr bool Is_256x256x128 = @@ -307,7 +315,8 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || diff --git a/ark/include/kernels/gemm_cutlass.h b/ark/include/kernels/gemm_cutlass.h index dec27b3b8..a3ca728d5 100644 --- a/ark/include/kernels/gemm_cutlass.h +++ b/ark/include/kernels/gemm_cutlass.h @@ -20,10 +20,9 @@ #include "cutlass/epilogue/thread/linear_combination.h" // clang-format on -#include "cutlass/epilogue/thread/linear_combination_gelu.h" - #include "common/checker.h" #include "common/unit_op.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" namespace ark { @@ -323,8 +322,7 @@ template DEVICE void gemm_cuda_add(DataTypeC *D, DataTypeA *A, DataTypeB *B, - DataTypeC *Residual, int uop_idx, - int smem_per_warp) { + DataTypeC *Residual, int uop_idx, int smem_per_warp) { #if (ARK_TARGET_CUDA_ARCH == 60) using ArchTag = cutlass::arch::Sm60; #elif (ARK_TARGET_CUDA_ARCH == 70) @@ -349,8 +347,7 @@ DEVICE void gemm_cuda_add(DataTypeC *D, DataTypeA *A, DataTypeB *B, using GemmType = typename ark::GemmConfiguration< UnitOp, cutlass::arch::OpClassTensorOp, ArchTag, DataTypeA, LayoutA, DataTypeB, LayoutB, DataTypeC, LayoutC, - cutlass::gemm::GemmShape>::Gemm; + cutlass::gemm::GemmShape>::Gemm; using GemmKernel = typename GemmType::GemmKernel; IsEq(); diff --git a/ark/include/kernels/gemm_fused.h b/ark/include/kernels/gemm_fused.h index 047c4b1c8..f9fe9ec5d 100644 --- a/ark/include/kernels/gemm_fused.h +++ b/ark/include/kernels/gemm_fused.h @@ -7,7 +7,8 @@ // global→global data path for fused elementwise ops. // // Data flow: -// global(A,B) → shared → MMA → accum(registers) → Functor → Epilogue → global(C) +// global(A,B) → shared → MMA → accum(registers) → Functor → Epilogue → +// global(C) // // vs current matmul + separate elementwise: // global(A,B) → shared → MMA → Epilogue → global(C) @@ -91,8 +92,7 @@ template DEVICE void gemm_with_functor(DataTypeC *C, DataTypeA *A, DataTypeB *B, - Functor functor, - int uop_idx, int smem_per_warp) { + Functor functor, int uop_idx, int smem_per_warp) { #if (ARK_TARGET_CUDA_ARCH == 60) using ArchTag = cutlass::arch::Sm60; #elif (ARK_TARGET_CUDA_ARCH == 70) @@ -145,8 +145,8 @@ DEVICE void gemm_with_functor(DataTypeC *C, DataTypeA *A, DataTypeB *B, ark::GemmThreadblockSwizzle swizzle; cutlass::gemm::GemmCoord tiled_shape(swizzle.get_tiled_shape()); - typename GemmKernel::Params params(problem_size, tiled_shape, - ref_a, ref_b, ref_c, ref_c); + typename GemmKernel::Params params(problem_size, tiled_shape, ref_a, ref_b, + ref_c, ref_c); params.swizzle_log_tile = uop_idx; typename GemmKernel::SharedStorage *ps = @@ -174,13 +174,13 @@ DEVICE void gemm_with_functor(DataTypeC *C, DataTypeA *A, DataTypeB *B, (ProblemSizeK + Mma::Shape::kK - 1) / Mma::Shape::kK; int thread_idx = threadIdx.x % GemmKernel::kThreadCount; - typename Mma::IteratorA iterator_A( - params.params_A, params.ref_A.data(), - {ProblemSizeM, ProblemSizeK}, thread_idx, tb_offset_A); + typename Mma::IteratorA iterator_A(params.params_A, params.ref_A.data(), + {ProblemSizeM, ProblemSizeK}, thread_idx, + tb_offset_A); - typename Mma::IteratorB iterator_B( - params.params_B, params.ref_B.data(), - {ProblemSizeK, ProblemSizeN}, thread_idx, tb_offset_B); + typename Mma::IteratorB iterator_B(params.params_B, params.ref_B.data(), + {ProblemSizeK, ProblemSizeN}, thread_idx, + tb_offset_B); int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0) % GemmKernel::WarpCount::kCount; @@ -230,8 +230,8 @@ template DEVICE void gemm_cutlass_fused(DataTypeC *C, DataTypeA *A, DataTypeB *B, - Functor functor, - int uop_idx, int smem_per_warp) { + Functor functor, int uop_idx, + int smem_per_warp) { using CutDataTypeA = typename cutlass::platform::conditional< std::is_same::value, cutlass::half_t, typename cutlass::platform::conditional< @@ -259,8 +259,8 @@ DEVICE void gemm_cutlass_fused(DataTypeC *C, DataTypeA *A, DataTypeB *B, gemm_with_functor(pC, pA, pB, functor, - uop_idx, smem_per_warp); + TileSizeN, UnitOp, Functor>(pC, pA, pB, functor, uop_idx, + smem_per_warp); #else static_assert(false, "Unsupported CUDA arch."); #endif diff --git a/ark/include/kernels/gemm_scale.h b/ark/include/kernels/gemm_scale.h index b6025d013..1532c5945 100644 --- a/ark/include/kernels/gemm_scale.h +++ b/ark/include/kernels/gemm_scale.h @@ -68,7 +68,10 @@ DEVICE void gemm_cuda_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, cutlass::gemm::GemmCoord tiled_shape(swizzle.get_tiled_shape()); // Decode scale from bits - union { uint32_t u; float f; } conv; + union { + uint32_t u; + float f; + } conv; conv.u = ScaleBits; // Create OutputOp params with alpha=scale, beta=0 diff --git a/ark/include/kernels/matmul.h b/ark/include/kernels/matmul.h index 664fdc1f0..77b140de4 100644 --- a/ark/include/kernels/matmul.h +++ b/ark/include/kernels/matmul.h @@ -136,10 +136,10 @@ DEVICE void matmul_gelu(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, DataTypeC *pC = &C[un * BatchStrideNC + uc * BatchStrideCC]; #if defined(ARK_TARGET_CUDA_ARCH) - gemm_cutlass_gelu(pC, pA, pB, uop_idx, smem_per_warp); + gemm_cutlass_gelu( + pC, pA, pB, uop_idx, smem_per_warp); #elif defined(ARK_TARGET_ROCM_ARCH) static_assert(false, "matmul_gelu not supported on ROCm."); #endif @@ -213,20 +213,24 @@ template -DEVICE void matmul_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, - int uop_idx, int smem_per_warp) { - static_assert(NCA::D2 == 1 && NCA::D3 == 1, "NCA should be two dimensional."); - static_assert(NCB::D2 == 1 && NCB::D3 == 1, "NCB should be two dimensional."); - static_assert(TileShape::D2 == 1 && TileShape::D3 == 1, "TileShape should be two dimensional."); + int NumWarps, int SmemBytes, uint32_t ScaleBits, typename DataTypeA, + typename DataTypeB, typename DataTypeC> +DEVICE void matmul_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, + int smem_per_warp) { + static_assert(NCA::D2 == 1 && NCA::D3 == 1, + "NCA should be two dimensional."); + static_assert(NCB::D2 == 1 && NCB::D3 == 1, + "NCB should be two dimensional."); + static_assert(TileShape::D2 == 1 && TileShape::D3 == 1, + "TileShape should be two dimensional."); static_assert(ProblemSize::D3 == 1, "ProblemSize D3 should be 1."); constexpr int NC = (NCA::D0 > NCB::D0) ? NCA::D0 : NCB::D0; constexpr int CC = (NCA::D1 > NCB::D1) ? NCA::D1 : NCB::D1; using OutShape = Vec; using UnitOutDims = Vec<1, 1, TileShape::D0, TileShape::D1>; - using UnitOp_t = UnitOp; + using UnitOp_t = + UnitOp; int un = UnitOp_t::uop_idx_n(uop_idx); int uc = UnitOp_t::uop_idx_c(uop_idx); @@ -235,17 +239,19 @@ DEVICE void matmul_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, DataTypeC *pC = &C[un * BatchStrideNC + uc * BatchStrideCC]; // Decode scale from bit-pattern template parameter - union { uint32_t u; float f; } conv; + union { + uint32_t u; + float f; + } conv; conv.u = ScaleBits; FunctorScale functor{conv.f}; gemm_cutlass_fused< typename std::remove_const::type, LeadingDims::D0, IsColumnA, typename std::remove_const::type, LeadingDims::D3, IsColumnB, - DataTypeC, LeadingDims::D1, - ProblemSize::D0, ProblemSize::D1, ProblemSize::D2, - TileShape::D0, TileShape::D1, - UnitOp_t, FunctorScale>(pC, pA, pB, functor, uop_idx, smem_per_warp); + DataTypeC, LeadingDims::D1, ProblemSize::D0, ProblemSize::D1, + ProblemSize::D2, TileShape::D0, TileShape::D1, UnitOp_t, FunctorScale>( + pC, pA, pB, functor, uop_idx, smem_per_warp); } } // namespace ark diff --git a/ark/include/kernels/matmul_fused.h b/ark/include/kernels/matmul_fused.h index 396000dea..b4d9d1423 100644 --- a/ark/include/kernels/matmul_fused.h +++ b/ark/include/kernels/matmul_fused.h @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. // -// matmul_fused: matmul with a post-MMA functor applied on register accumulators. -// Wraps gemm_fused.h for ARK's op interface. +// matmul_fused: matmul with a post-MMA functor applied on register +// accumulators. Wraps gemm_fused.h for ARK's op interface. #ifndef ARK_KERNELS_MATMUL_FUSED_H_ #define ARK_KERNELS_MATMUL_FUSED_H_ @@ -21,13 +21,14 @@ template -DEVICE void matmul_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, - float scale, int uop_idx, int smem_per_warp) { +DEVICE void matmul_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, float scale, + int uop_idx, int smem_per_warp) { constexpr int NC = (NCA::D0 > NCB::D0) ? NCA::D0 : NCB::D0; constexpr int CC = (NCA::D1 > NCB::D1) ? NCA::D1 : NCB::D1; using OutShape = Vec; using UnitOutDims = Vec<1, 1, TileShape::D0, TileShape::D1>; - using UnitOp_t = UnitOp; + using UnitOp_t = + UnitOp; constexpr int LeadingDimA = LeadingDims::D0; constexpr int LeadingDimB = LeadingDims::D3; @@ -41,13 +42,12 @@ DEVICE void matmul_scale(DataTypeC *C, DataTypeA *A, DataTypeB *B, DataTypeC *pC = &C[un * BatchStrideNC + uc * BatchStrideCC]; FunctorScale functor{scale}; - gemm_cutlass_fused< - typename std::remove_const::type, LeadingDimA, IsColumnA, - typename std::remove_const::type, LeadingDimB, IsColumnB, - DataTypeC, LeadingDimC, - ProblemSize::D0, ProblemSize::D1, ProblemSize::D2, - TileShape::D0, TileShape::D1, - UnitOp_t, FunctorScale>(pC, pA, pB, functor, uop_idx, smem_per_warp); + gemm_cutlass_fused::type, LeadingDimA, + IsColumnA, typename std::remove_const::type, + LeadingDimB, IsColumnB, DataTypeC, LeadingDimC, + ProblemSize::D0, ProblemSize::D1, ProblemSize::D2, + TileShape::D0, TileShape::D1, UnitOp_t, FunctorScale>( + pC, pA, pB, functor, uop_idx, smem_per_warp); } } // namespace ark diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 62af5840b..7abbad0ba 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -357,13 +357,11 @@ struct WwiseReduce { ReduceShapeChecker; constexpr int InConsecBytes = sizeof(DataType) * InShape::W; constexpr int NelemPerThread = - (InConsecBytes % 16 == 0) - ? 16 / sizeof(DataType) - : (InConsecBytes % 8 == 0) - ? 8 / sizeof(DataType) - : (InConsecBytes % 4 == 0) - ? 4 / sizeof(DataType) - : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) : 1; + (InConsecBytes % 16 == 0) ? 16 / sizeof(DataType) + : (InConsecBytes % 8 == 0) ? 8 / sizeof(DataType) + : (InConsecBytes % 4 == 0) ? 4 / sizeof(DataType) + : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) + : 1; constexpr int NonReduceDimLength = UnitOutDims::N * UnitOutDims::C * UnitOutDims::H; @@ -411,20 +409,30 @@ struct WwiseReduce { if constexpr (NelemPerThread > 8) { #pragma unroll for (int i = 8; i < NelemPerThread; i += 8) { - ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]); + ReduceType::template reduce<8>(&reduced[0], &reduced[0], + &reduced[i]); } - ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); - ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<4>(&reduced[0], &reduced[0], + &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], + &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } else if constexpr (NelemPerThread == 8) { - ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); - ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<4>(&reduced[0], &reduced[0], + &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], + &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } else if constexpr (NelemPerThread == 4) { - ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], + &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } else if constexpr (NelemPerThread == 2) { - ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], + &reduced[1]); } if constexpr (InShape::W % ThreadsPerRow != 0) { diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp index a54b6e81f..3778190d1 100644 --- a/ark/model/model_buffer.cpp +++ b/ark/model/model_buffer.cpp @@ -80,8 +80,7 @@ std::shared_ptr ModelBuffer::deserialize(const Json &serialized) { } else if (!serialized.contains("SendTags")) { ERR(ModelError, "ModelBuffer deserialization failed: missing SendTags"); } else if (!serialized.contains("RecvTags")) { - ERR(ModelError, - "ModelBuffer deserialization failed: missing RecvTags"); + ERR(ModelError, "ModelBuffer deserialization failed: missing RecvTags"); } else if (!serialized.contains("IsExternal")) { ERR(ModelError, "ModelBuffer deserialization failed: missing IsExternal"); diff --git a/ark/model/model_context_manager.cpp b/ark/model/model_context_manager.cpp index 799cce785..e3be664f9 100644 --- a/ark/model/model_context_manager.cpp +++ b/ark/model/model_context_manager.cpp @@ -27,8 +27,6 @@ Json ModelContextManager::get(const std::string& key) const { return context_stack_->get(key); } -Json ModelContextManager::dump() const { - return context_stack_->dump(); -} +Json ModelContextManager::dump() const { return context_stack_->dump(); } } // namespace ark diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index b9646d057..18c33f28a 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -54,7 +54,7 @@ class ModelGraph::Impl { Impl &operator=(const Impl &other); template - ModelOpRef create_op(const std::string &name, Args &&... args) { + ModelOpRef create_op(const std::string &name, Args &&...args) { ModelOpRef op = std::make_shared(std::forward(args)...); std::string name_copy; if (name.empty()) { diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp index ab261eb20..6c5bbbbfd 100644 --- a/ark/model/model_op.hpp +++ b/ark/model/model_op.hpp @@ -50,8 +50,8 @@ class ModelOp { return ""; } - virtual std::vector impl_args([ - [maybe_unused]] const Json &config) const { + virtual std::vector impl_args( + [[maybe_unused]] const Json &config) const { return {}; } diff --git a/ark/model/model_tensor.cpp b/ark/model/model_tensor.cpp index 5a98651e7..ff16c1466 100644 --- a/ark/model/model_tensor.cpp +++ b/ark/model/model_tensor.cpp @@ -94,13 +94,9 @@ size_t ModelTensor::shape_bytes() const { return shape_.nelems() * data_type_->bytes(); } -void *ModelTensor::data() const { - return buffer_->data(); -} +void *ModelTensor::data() const { return buffer_->data(); } -void *ModelTensor::data(void *data) { - return buffer_->data(data); -} +void *ModelTensor::data(void *data) { return buffer_->data(data); } bool ModelTensor::is_external() const { return buffer_->is_external(); } @@ -143,8 +139,8 @@ std::shared_ptr ModelTensor::deserialize(const Json &serialized) { serialized["PaddedShape"].get>()); ret->id_ = serialized["Id"]; if (serialized.contains("Location")) { - ret->location_ = static_cast( - serialized["Location"].get()); + ret->location_ = + static_cast(serialized["Location"].get()); } return ret; } diff --git a/ark/model/model_tensor.hpp b/ark/model/model_tensor.hpp index 3174a9139..0eff61b66 100644 --- a/ark/model/model_tensor.hpp +++ b/ark/model/model_tensor.hpp @@ -17,7 +17,8 @@ using ModelDataType = std::shared_ptr; enum class TensorLocation { GLOBAL, // GPU global memory (HBM) — default, current behavior SHARED, // Shared memory (SMEM) — scoped to one thread block - REGISTER, // Register file — scoped to one warp group (no buffer allocation) + REGISTER, // Register file — scoped to one warp group (no buffer + // allocation) // TODO: Register-level fusion is not yet implemented. // Planner and buffer allocator do not yet skip global // allocation for REGISTER tensors. See ModelOpMma/ModelOpStore. diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 8cf68b085..dc0ad8219 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -91,7 +91,8 @@ ark::Tensor all_reduce_packet(ark::Model &m, ark::Tensor input, int rank, std::vector outputs; size_t out_off = flag % 2 == 0 ? 0 : nbytes_per_rank * 2; ark::Dims out_shape = {nbytes_per_rank * 2}; - ark::Dims out_strides = {nbytes_per_rank * 2 * 2}; // packet + double buffer + ark::Dims out_strides = {nbytes_per_rank * 2 * + 2}; // packet + double buffer for (int i = 0; i < rank_num; i++) { if (i != rank) { outputs.push_back(m.tensor(out_shape, ark::UINT8, out_strides, @@ -121,7 +122,8 @@ void test_all_reduce_packet_internal(ark::DimType nelem) { ark::Model m(gpu_id, NumGpus); ark::Tensor ones = m.tensor({nelem}, ark::FP16); ark::Tensor data = m.mul(ones, float(gpu_id + 1)); - ark::Tensor output = all_reduce_packet(m, data, gpu_id, NumGpus, 1, data); + ark::Tensor output = + all_reduce_packet(m, data, gpu_id, NumGpus, 1, data); std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); @@ -186,7 +188,6 @@ ark::Tensor all_reduce_sm(ark::Model &m, ark::Tensor input, int rank, return res; } - template void test_all_reduce_sm_internal(ark::DimType nelem) { auto config_rule = [nelem](const std::string op_str, const std::string) { diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp index 2fd02b801..8642feefd 100644 --- a/ark/ops/ops_broadcast.cpp +++ b/ark/ops/ops_broadcast.cpp @@ -39,13 +39,13 @@ std::string ModelOpBroadcast1::impl_name(const Json &config) const { std::to_string(0)}); } -std::vector ModelOpBroadcast1::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpBroadcast1::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0]}; } -Json ModelOpBroadcast1::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpBroadcast1::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; @@ -108,8 +108,8 @@ std::string ModelOpBroadcast2::impl_name(const Json &config) const { std::to_string(0)}); } -std::vector ModelOpBroadcast2::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpBroadcast2::impl_args( + [[maybe_unused]] const Json &config) const { std::vector args; args.emplace_back(result_tensors_[0]); args.emplace_back(read_tensors_[0]); @@ -117,8 +117,8 @@ std::vector ModelOpBroadcast2::impl_args([ return args; } -Json ModelOpBroadcast2::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpBroadcast2::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index c5be1ca65..4e221e173 100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -71,8 +71,8 @@ std::string ModelOpSend::impl_name(const Json &config) const { output->data_type()->type_str()}); } -std::vector ModelOpSend::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpSend::impl_args( + [[maybe_unused]] const Json &config) const { return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[0])}; } @@ -107,13 +107,13 @@ std::string ModelOpSendDone::impl_name(const Json &config) const { std::to_string(remote_rank)}); } -std::vector ModelOpSendDone::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpSendDone::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } -Json ModelOpSendDone::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpSendDone::default_config( + [[maybe_unused]] const ArchRef arch) const { return {{"ChannelType", "Proxy"}, {"NumTasks", 1}, {"NumWarps", 1}, @@ -138,8 +138,8 @@ ModelOpRecv::ModelOpRecv(ModelTensorRef output, int remote_rank, int tag) } std::string ModelOpRecv::impl_name(const Json &config) const { - check_fields_config(config, - {"ChannelType", "NumTasks", "NumWarps", "SramBytes", "Wait"}); + check_fields_config( + config, {"ChannelType", "NumTasks", "NumWarps", "SramBytes", "Wait"}); std::string channel_type = config["ChannelType"]; bool wait = config["Wait"]; if (channel_type != "Proxy" && channel_type != "SecondaryProxy" && @@ -155,8 +155,8 @@ std::string ModelOpRecv::impl_name(const Json &config) const { std::to_string(max_spin_cnt), std::to_string(wait)}); } -std::vector ModelOpRecv::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecv::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } @@ -231,13 +231,13 @@ std::string ModelOpSendPacket::impl_name(const Json &config) const { packet_type, std::to_string(flag)}); } -std::vector ModelOpSendPacket::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpSendPacket::impl_args( + [[maybe_unused]] const Json &config) const { return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[0])}; } -Json ModelOpSendPacket::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpSendPacket::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; if (arch->belongs_to(ARCH_ROCM)) { config["PacketType"] = "mscclpp::LL8Packet"; @@ -324,13 +324,13 @@ std::string ModelOpRecvPacket::impl_name(const Json &config) const { packet_type, std::to_string(flag)}); } -std::vector ModelOpRecvPacket::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecvPacket::impl_args( + [[maybe_unused]] const Json &config) const { return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[1])}; } -Json ModelOpRecvPacket::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpRecvPacket::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; if (arch->belongs_to(ARCH_ROCM)) { config["PacketType"] = "mscclpp::LL8Packet"; @@ -418,8 +418,8 @@ std::string ModelOpRecvReduceSendPacket::impl_name(const Json &config) const { input->data_type()->type_str(), std::to_string(flag)}); } -std::vector ModelOpRecvReduceSendPacket::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecvReduceSendPacket::impl_args( + [[maybe_unused]] const Json &config) const { std::vector args = {write_tensors_[0], read_tensors_[0], read_tensors_[1]}; for (size_t i = 1; i < write_tensors_.size(); ++i) { @@ -431,8 +431,8 @@ std::vector ModelOpRecvReduceSendPacket::impl_args([ return args; } -Json ModelOpRecvReduceSendPacket::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpRecvReduceSendPacket::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; if (arch->belongs_to(ARCH_ROCM)) { config["PacketType"] = "mscclpp::LL8Packet"; @@ -452,12 +452,10 @@ Json ModelOpRecvReduceSendPacket::default_config([ return config; } -ModelOpRecvReduceSend::ModelOpRecvReduceSend(ModelTensorRef input, - ModelTensorRef output, int rank, - const std::vector &remote_ranks, - int recv_tag, int output_tag, - std::vector &peer_output_refs, - ModelTensorRef scratch) +ModelOpRecvReduceSend::ModelOpRecvReduceSend( + ModelTensorRef input, ModelTensorRef output, int rank, + const std::vector &remote_ranks, int recv_tag, int output_tag, + std::vector &peer_output_refs, ModelTensorRef scratch) : ModelOp("RecvReduceSend") { check_null(input); uint32_t n_remote_ranks = remote_ranks.size(); @@ -519,8 +517,8 @@ std::string ModelOpRecvReduceSend::impl_name(const Json &config) const { input->data_type()->type_str(), input->data_type()->type_str()}); } -std::vector ModelOpRecvReduceSend::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpRecvReduceSend::impl_args( + [[maybe_unused]] const Json &config) const { std::vector args = {write_tensors_[0], read_tensors_[0], read_tensors_[1]}; for (size_t i = 1; i < write_tensors_.size(); ++i) { @@ -532,8 +530,8 @@ std::vector ModelOpRecvReduceSend::impl_args([ return args; } -Json ModelOpRecvReduceSend::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpRecvReduceSend::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; @@ -576,12 +574,13 @@ std::string ModelOpDeviceSync::impl_name(const Json &config) const { std::to_string(peer_num), std::to_string(rank)}); } -std::vector ModelOpDeviceSync::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpDeviceSync::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } -Json ModelOpDeviceSync::default_config([[maybe_unused]] const ArchRef arch) const { +Json ModelOpDeviceSync::default_config( + [[maybe_unused]] const ArchRef arch) const { return {{"ChannelType", "Proxy"}, {"NumTasks", 1}, {"NumWarps", 1}, diff --git a/ark/ops/ops_communication.hpp b/ark/ops/ops_communication.hpp index 23f3b84af..f0c0134f2 100644 --- a/ark/ops/ops_communication.hpp +++ b/ark/ops/ops_communication.hpp @@ -103,7 +103,6 @@ class ModelOpRecvReduceSend : public ModelOp { Json default_config(const ArchRef arch = ARCH_ANY) const override; }; - class ModelOpDeviceSync : public ModelOp { public: ModelOpDeviceSync() = default; diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index de7c42833..e5ffc8804 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -346,7 +346,8 @@ ark::unittest::State test_communication_send_recv_reduce_packet() { ark::unittest::spawn_process([gpu_id]() { ark::Model model(gpu_id, 2); ark::Tensor tns_data = model.tensor({1024}, ark::FP16); - std::vector shard_tensors = model.sharding(tns_data, 0, 512); + std::vector shard_tensors = + model.sharding(tns_data, 0, 512); int peer_gpu_id = (gpu_id + 1) % 2; model.send_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 0, 1); @@ -389,8 +390,7 @@ ark::unittest::State test_communication_send_recv_reduce() { config["NumTasks"] = 4; config["NumWarps"] = 4; config["SramBytes"] = 0; - } - else if (op.at("Type") == "DeviceSync") { + } else if (op.at("Type") == "DeviceSync") { config["ChannelType"] = "Sm"; config["NumTasks"] = 1; config["NumWarps"] = 1; diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 2d6b63720..8f29aba9a 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -54,13 +54,13 @@ std::string ModelOpEmbedding::impl_name(const Json &config) const { }); } -std::vector ModelOpEmbedding::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpEmbedding::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0], read_tensors_[1]}; } -Json ModelOpEmbedding::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpEmbedding::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; diff --git a/ark/ops/ops_matmul.hpp b/ark/ops/ops_matmul.hpp index 39ab34894..50e83316d 100644 --- a/ark/ops/ops_matmul.hpp +++ b/ark/ops/ops_matmul.hpp @@ -86,7 +86,8 @@ class ModelOpStore : public ModelOpCopy { public: ModelOpStore() = default; ModelOpStore(ModelTensorRef input, ModelTensorRef output); - // Override to use "copy" kernel (not "store" which clashes with load_store.h) + // Override to use "copy" kernel (not "store" which clashes with + // load_store.h) std::string impl_name(const Json &config) const override; }; diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp index 894ab29be..50d1c2640 100644 --- a/ark/ops/ops_noop.cpp +++ b/ark/ops/ops_noop.cpp @@ -16,8 +16,8 @@ std::string ModelOpNoop::impl_name([[maybe_unused]] const Json &config) const { return function_name_string("noop"); } -std::vector ModelOpNoop::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpNoop::impl_args( + [[maybe_unused]] const Json &config) const { return {}; } diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp index 78dd9d7e6..f2ae5e783 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -106,8 +106,8 @@ std::string ModelOpReduce::impl_name(const Json &config) const { }); } -std::vector ModelOpReduce::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpReduce::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0]}; } diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp index 944a7247c..c65bc93de 100644 --- a/ark/ops/ops_scalar.cpp +++ b/ark/ops/ops_scalar.cpp @@ -39,14 +39,14 @@ std::string ModelOpScalarAssign::impl_name(const Json &config) const { std::to_string(num_warps), std::to_string(0)}); } -std::vector ModelOpScalarAssign::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpScalarAssign::impl_args( + [[maybe_unused]] const Json &config) const { float val = args_.at("Value").value(); return {result_tensors_[0], val}; } -Json ModelOpScalarAssign::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpScalarAssign::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; @@ -84,8 +84,8 @@ ModelOpScalarAdd::ModelOpScalarAdd(ModelTensorRef input, float factor, verify(); } -std::vector ModelOpScalarAdd::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpScalarAdd::impl_args( + [[maybe_unused]] const Json &config) const { float factor = args_.at("Factor").value(); return {result_tensors_[0], read_tensors_[0], factor}; } @@ -106,8 +106,8 @@ ModelOpScalarMul::ModelOpScalarMul(ModelTensorRef input, float factor, verify(); } -std::vector ModelOpScalarMul::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpScalarMul::impl_args( + [[maybe_unused]] const Json &config) const { float factor = args_.at("Factor").value(); return {result_tensors_[0], read_tensors_[0], factor}; } diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index bfbe79a70..f902e626d 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -32,12 +32,13 @@ std::ostream &operator<<(std::ostream &os, const OpsTestResult &result) { return os; } -OpsTestResult op_test( - const std::string &test_name_prefix, const Model &model, - const std::vector &inputs, const std::vector &outputs, - OpsTestBaseline baseline, const std::vector &inputs_data, - const std::vector &config_rules, - bool print_on_error) { +OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, + const std::vector &inputs, + const std::vector &outputs, + OpsTestBaseline baseline, + const std::vector &inputs_data, + const std::vector &config_rules, + bool print_on_error) { DefaultExecutor exe(model, -1, nullptr, config_rules); std::vector>> inputs_data_storages; diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index 12fb88a7b..cd3f0b7f6 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -167,12 +167,13 @@ using OpsTestBaseline = std::function &inputs, const std::vector &outputs, - OpsTestBaseline baseline, const std::vector &inputs_data = {}, - const std::vector &config_rules = {}, - bool print_on_error = false); +OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, + const std::vector &inputs, + const std::vector &outputs, + OpsTestBaseline baseline, + const std::vector &inputs_data = {}, + const std::vector &config_rules = {}, + bool print_on_error = false); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index b7a67c8c0..f1b079c2d 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -112,13 +112,13 @@ std::string ModelOpTranspose::impl_name(const Json &config) const { }); } -std::vector ModelOpTranspose::impl_args([ - [maybe_unused]] const Json &config) const { +std::vector ModelOpTranspose::impl_args( + [[maybe_unused]] const Json &config) const { return {result_tensors_[0], read_tensors_[0]}; } -Json ModelOpTranspose::default_config([ - [maybe_unused]] const ArchRef arch) const { +Json ModelOpTranspose::default_config( + [[maybe_unused]] const ArchRef arch) const { Json config; config["NumWarps"] = 1; config["SramBytes"] = 0; diff --git a/examples/llama/model.py b/examples/llama/model.py index ebd424612..ad3c2f0b9 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. """LLaMA 2 Transformer model. - Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py +Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py """ import ark diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 6b7f3a5bb..ec525b4d7 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -11,7 +11,6 @@ import multiprocessing as mp from pathlib import Path - sys.path.append("llama") import llama.model as model_pt import model as model_ark @@ -21,7 +20,6 @@ from model import ModelArgs, ModelArgs7B from generator import precompute_freqs_cis - ckpt_dir: str = "" numpy_dtype_to_torch_dtype: dict = { diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 63480262c..61ff98a31 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -9,7 +9,6 @@ from .core import version from .model import Model - __version__ = version() diff --git a/python/ark/executor.py b/python/ark/executor.py index 14f0817a8..84809c49a 100644 --- a/python/ark/executor.py +++ b/python/ark/executor.py @@ -3,7 +3,6 @@ from .core import CoreExecutor - __all__ = ["Executor"] diff --git a/python/ark/model.py b/python/ark/model.py index e103d4083..87c7279a3 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -5,7 +5,6 @@ from . import log from .core import CoreModel - __all__ = ["Model"] ModelState = NewType("ModelState", None) diff --git a/python/ark/ops.py b/python/ark/ops.py index c0eefa2e0..2b7e387f3 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -9,7 +9,6 @@ from .model import Model from . import log - __all__ = [ "tensor", "parameter", @@ -493,6 +492,7 @@ def recv( Model.get_model().recv(output._tensor, remote_rank, tag, name) ) + ################################################################################ diff --git a/python/ark/planner.py b/python/ark/planner.py index 0ed9113e1..79b0fb7e4 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -9,7 +9,6 @@ from .core import CorePlanner, CorePlannerContext from .model import Model - __all__ = ["Plan", "PlannerContext", "Planner"] diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 0edfd26ec..b924aff82 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -11,7 +11,6 @@ from .model import Model from typing import Dict - __all__ = ["Runtime"] diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index c6fde978e..d8d687546 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -9,7 +9,7 @@ namespace py = pybind11; -void register_tensor(py::module &m) { +void register_tensor(py::module& m) { py::class_(m, "CoreTensor") .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape) diff --git a/python/unittest/test_placeholder.py b/python/unittest/test_placeholder.py index 640cc0e3c..74744853e 100644 --- a/python/unittest/test_placeholder.py +++ b/python/unittest/test_placeholder.py @@ -13,9 +13,7 @@ def test_placeholder_is_external(): assert t_placeholder.is_external(), "Placeholder tensor should be external" t_regular = ark.tensor([64], ark.fp32) - assert not t_regular.is_external(), ( - "Regular tensor should not be external" - ) + assert not t_regular.is_external(), "Regular tensor should not be external" @pytest_ark(need_torch=True) @@ -34,9 +32,9 @@ def test_placeholder_immediate_binding(): result = out.to_numpy() expected = torch_data.cpu().numpy() + 1.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -44,7 +42,9 @@ def test_placeholder_scalar_add(): """Test placeholder with scalar addition on non-aligned shape.""" import torch - torch_data = torch.arange(10, dtype=torch.float32, device="cuda:0").reshape(10, 1) + torch_data = torch.arange(10, dtype=torch.float32, device="cuda:0").reshape( + 10, 1 + ) t = ark.placeholder([10, 1], ark.fp32, data=torch_data) out = ark.add(t, 5.0) @@ -55,9 +55,9 @@ def test_placeholder_scalar_add(): result = out.to_numpy() expected = torch_data.cpu().numpy() + 5.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -79,9 +79,9 @@ def test_placeholder_multiple(): result = out.to_numpy() expected = torch_a.cpu().numpy() + torch_b.cpu().numpy() - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -100,9 +100,9 @@ def test_placeholder_fp16(): result = out.to_numpy() expected = torch_data.cpu().numpy() * 0.5 - assert np.allclose(result, expected, atol=1e-2), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected, atol=1e-2 + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -122,9 +122,9 @@ def test_placeholder_from_torch(): result = out.to_numpy() expected = torch_tensor.cpu().numpy() + 10.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -144,9 +144,9 @@ def test_placeholder_tensor_mappings_launch(): result = out.to_numpy() expected = torch_input.cpu().numpy() * 3.0 - assert np.allclose(result, expected), ( - f"max diff: {np.max(np.abs(result - expected))}" - ) + assert np.allclose( + result, expected + ), f"max diff: {np.max(np.abs(result - expected))}" @pytest_ark(need_torch=True) @@ -169,9 +169,9 @@ def test_placeholder_runtime_rebinding(): result2 = out.to_numpy() assert np.allclose(result1, 6.0), f"Run 1: expected 6.0, got {result1[:5]}" - assert np.allclose(result2, 11.0), ( - f"Run 2: expected 11.0, got {result2[:5]}" - ) + assert np.allclose( + result2, 11.0 + ), f"Run 2: expected 11.0, got {result2[:5]}" @pytest_ark(need_torch=True) From c8a3ac34dbd47fa29b1425d0b540fe2da67b2dd3 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 05:30:45 +0000 Subject: [PATCH 07/17] Retry CI after flaky lint run From 6d9d84d4aaf537439965156d5e3d854ef819c0b5 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 05:39:22 +0000 Subject: [PATCH 08/17] Pin lint workflow to Ubuntu 24.04 --- .github/workflows/lint.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5db3339c0..3f7afe352 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -7,7 +7,7 @@ on: jobs: linters: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Check out Git repository @@ -28,7 +28,7 @@ jobs: run: bash tools/lint.sh dry spelling: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Check out Git repository From e022b62a78da22d05a7dcec3d5f1f589475660ed Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 05:50:08 +0000 Subject: [PATCH 09/17] Apply local clang-format fixes --- ark/include/kernels/common/sync.h | 3 +-- ark/include/kernels/common/vector_type.h | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h index ffa5ac33b..cf22e357d 100644 --- a/ark/include/kernels/common/sync.h +++ b/ark/include/kernels/common/sync.h @@ -120,8 +120,7 @@ DEVICE void sync_warps() { if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) { state->flag[group_id] = tmp; } else { - while (atomicAdd(&state->flag[group_id], 0) != tmp) - ; + while (atomicAdd(&state->flag[group_id], 0) != tmp); } state->is_inc_flag[group_id] = tmp; } diff --git a/ark/include/kernels/common/vector_type.h b/ark/include/kernels/common/vector_type.h index 24f9dff3f..f247c53ee 100644 --- a/ark/include/kernels/common/vector_type.h +++ b/ark/include/kernels/common/vector_type.h @@ -71,8 +71,8 @@ struct Constant { template struct IntrinsicCompute1Exists { template - static auto test(const InputVtype &) - -> decltype(&U::compute, std::true_type{}); + static auto test(const InputVtype &) -> decltype(&U::compute, + std::true_type{}); template static auto test(...) -> std::false_type; @@ -84,8 +84,9 @@ struct IntrinsicCompute1Exists { template struct IntrinsicCompute2Exists { template - static auto test(const InputVtype &, const InputVtype &) - -> decltype(&U::compute, std::true_type{}); + static auto test(const InputVtype &, + const InputVtype &) -> decltype(&U::compute, + std::true_type{}); template static auto test(...) -> std::false_type; From 28a7f1035abbaea125a2436d9f5bda9f085e3f52 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 07:07:25 +0000 Subject: [PATCH 10/17] Harden CodeQL build in unit-test workflow --- .github/workflows/ut.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index 39fd3d95f..55d969774 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -122,7 +122,7 @@ jobs: run: shell: bash timeout-minutes: 60 - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 container: image: ghcr.io/microsoft/ark/ark:base-dev-cuda12.2 permissions: @@ -150,7 +150,7 @@ jobs: run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. - make build ark_py + cmake --build . -j --target ark ark_py - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 From 472c64bd89c0e1b43b2a6eaede2dbf5f58a60bd6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 07:12:28 +0000 Subject: [PATCH 11/17] Remove advanced CodeQL job from unit tests workflow --- .github/workflows/ut.yml | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index 55d969774..c38189ff8 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -117,42 +117,3 @@ jobs: run: | python3 ./examples/tutorial/quickstart_tutorial.py - CodeQL: - defaults: - run: - shell: bash - timeout-minutes: 60 - runs-on: ubuntu-24.04 - container: - image: ghcr.io/microsoft/ark/ark:base-dev-cuda12.2 - permissions: - actions: read - contents: read - security-events: write - concurrency: - group: ${{ github.workflow }}-codeql-${{ github.ref }} - cancel-in-progress: true - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dubious ownership exception - run: | - git config --global --add safe.directory /__w/ark/ark - - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: cpp - - - name: Build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. - cmake --build . -j --target ark ark_py - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:cpp" From 5f585ec7c09f3dbf793ca4fe7e67e73197f26eba Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 07:30:05 +0000 Subject: [PATCH 12/17] Use existing A100 runner label for CUDA tests --- .github/workflows/ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index c38189ff8..195f1bed8 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -26,7 +26,7 @@ jobs: matrix: include: - platform: cuda - runner: [self-hosted, CUDA] + runner: [self-hosted, a100] container: nvcr.io/nvidia/pytorch:26.03-py3 container_options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - platform: rocm From 8602071fe26481d36e00c7a04624208a3b691398 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 07:32:26 +0000 Subject: [PATCH 13/17] Restore CUDA runner label after runner update --- .github/workflows/ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ut.yml b/.github/workflows/ut.yml index 195f1bed8..c38189ff8 100644 --- a/.github/workflows/ut.yml +++ b/.github/workflows/ut.yml @@ -26,7 +26,7 @@ jobs: matrix: include: - platform: cuda - runner: [self-hosted, a100] + runner: [self-hosted, CUDA] container: nvcr.io/nvidia/pytorch:26.03-py3 container_options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 - platform: rocm From 0a608e60eefbb018e664ea1ddc8ee6141cb6ae3a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 08:16:51 +0000 Subject: [PATCH 14/17] Fix stale NumTasks in communication tests --- ark/ops/ops_communication_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index e5ffc8804..c9f375d70 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -194,7 +194,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { config["ChannelType"] = "Sm"; config["Signal"] = true; config["Tile"] = {1, 256}; - config["NumTasks"] = 4; + config["NumTasks"] = 2; config["NumWarps"] = 4; config["SramBytes"] = 0; } else if (op.at("Type") == "SendDone") { @@ -387,7 +387,7 @@ ark::unittest::State test_communication_send_recv_reduce() { config["ChannelType"] = "Sm"; config["Signal"] = false; config["Tile"] = {1, 256}; - config["NumTasks"] = 4; + config["NumTasks"] = 2; config["NumWarps"] = 4; config["SramBytes"] = 0; } else if (op.at("Type") == "DeviceSync") { From e312d31092cc18d30db323e39a8cfd390f4c5a7b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 08:47:40 +0000 Subject: [PATCH 15/17] Fix communication test task counts --- ark/ops/ops_communication_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index c9f375d70..e3e5252a5 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -194,7 +194,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { config["ChannelType"] = "Sm"; config["Signal"] = true; config["Tile"] = {1, 256}; - config["NumTasks"] = 2; + config["NumTasks"] = 4; config["NumWarps"] = 4; config["SramBytes"] = 0; } else if (op.at("Type") == "SendDone") { From a2cc5b44602b01783907670684051550a888de77 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 09:00:11 +0000 Subject: [PATCH 16/17] Fix runtime placeholder rebinding --- python/ark/runtime.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index b924aff82..9430c6caa 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -31,6 +31,16 @@ class StateCode(Enum): def __init__(self): self.loop_mode: bool = True self.state: Runtime.StateCode = Runtime.StateCode.Init + self.stream: int = 0 + self.record: bool = False + + def _normalize_tensor_mappings(self, tensor_mappings: Dict) -> Dict: + normalized = {} + for ark_tensor, torch_tensor in tensor_mappings.items(): + if not isinstance(torch_tensor, torch.Tensor): + raise log.InvalidUsageError("Must bind PyTorch tensor") + normalized[ark_tensor._tensor] = torch_tensor.data_ptr() + return normalized def __enter__(self) -> "Runtime": return self @@ -81,13 +91,7 @@ def launch( if self.launched(): # Stop the current running model self.stop() - for ark_tensor in list(tensor_mappings.keys()): - torch_tensor = tensor_mappings[ark_tensor] - if not isinstance(torch_tensor, torch.Tensor): - raise log.InvalidUsageError("Must bind PyTorch tensor") - internal_ark_tensor = ark_tensor._tensor - tensor_mappings[internal_ark_tensor] = torch_tensor.data_ptr() - del tensor_mappings[ark_tensor] + tensor_mappings = self._normalize_tensor_mappings(tensor_mappings) # Recompile if the previous launch was not compiled with the same info # or if this is the first launch exe = Executor.get() @@ -96,6 +100,8 @@ def launch( exe.launch(tensor_mappings, stream, loop_mode, record) self.state = Runtime.StateCode.LaunchedNotRunning self.loop_mode = loop_mode + self.stream = exe.stream() + self.record = record def run( self, @@ -114,12 +120,14 @@ def run( ) if self.state != Runtime.StateCode.LaunchedNotRunning: raise log.InvalidUsageError(f"ARK runtime is not launched") + tensor_mappings = self._normalize_tensor_mappings(tensor_mappings) + exe = Executor.get() + if tensor_mappings and not self.loop_mode: + exe.stop() + exe.launch(tensor_mappings, self.stream, False, self.record) + tensor_mappings = {} self.state = Runtime.StateCode.Running - ph_map = {} - for ark_tensor in list(tensor_mappings.keys()): - t = tensor_mappings[ark_tensor] - ph_map[ark_tensor._tensor] = t.data_ptr() - Executor.get().run(iter, ph_map) + exe.run(iter, tensor_mappings) if not non_blocking: self.wait() From a2016bc42ef2ba26149acef16f13af1e92679338 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 4 Jun 2026 23:42:20 +0000 Subject: [PATCH 17/17] CI fix --- ark/ops/ops_math_test.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/ark/ops/ops_math_test.cpp b/ark/ops/ops_math_test.cpp index f5774ab8e..74b351f60 100644 --- a/ark/ops/ops_math_test.cpp +++ b/ark/ops/ops_math_test.cpp @@ -93,6 +93,15 @@ void baseline_sqrt(std::vector &outputs, } }; +std::vector stable_positive_radical_inputs(ark::DimType nelems) { + static const float kInputs[] = {0.25f, 1.0f, 4.0f, 16.0f}; + std::vector data(nelems); + for (ark::DimType i = 0; i < nelems; ++i) { + data[i] = kInputs[i % 4]; + } + return data; +} + ark::unittest::State test_gelu_fp32() { ark::Model m; ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); @@ -240,9 +249,10 @@ ark::unittest::State test_math_rsqrt_fp32() { ark::Model m; ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); ark::Tensor out = m.rsqrt(t); + auto data = stable_positive_radical_inputs(t.shape().nelems()); - auto result = - ark::op_test("math_rsqrt_fp32", m, {t}, {out}, baseline_rsqrt); + auto result = ark::op_test("math_rsqrt_fp32", m, {t}, {out}, + baseline_rsqrt, {data.data()}); UNITTEST_LOG(result); UNITTEST_TRUE(result.max_diff[0] < 1e-4f); return ark::unittest::SUCCESS; @@ -306,9 +316,10 @@ ark::unittest::State test_math_sqrt_fp32() { ark::Model m; ark::Tensor t = m.tensor({4, 2, 1024}, ark::FP32); ark::Tensor out = m.sqrt(t); + auto data = stable_positive_radical_inputs(t.shape().nelems()); - auto result = - ark::op_test("math_sqrt_fp32", m, {t}, {out}, baseline_sqrt); + auto result = ark::op_test("math_sqrt_fp32", m, {t}, {out}, + baseline_sqrt, {data.data()}); UNITTEST_LOG(result); UNITTEST_TRUE(result.max_diff[0] < 1e-6f); return ark::unittest::SUCCESS; @@ -318,9 +329,11 @@ ark::unittest::State test_math_sqrt_fp16_small_last_dim() { ark::Model m; ark::Tensor t = m.tensor({4, 1024, 1}, ark::FP16, {4, 1024, 2}); ark::Tensor out = m.sqrt(t); + auto fp32_data = stable_positive_radical_inputs(t.shape().nelems()); + std::vector data(fp32_data.begin(), fp32_data.end()); auto result = ark::op_test("math_sqrt_fp16_small_last_dim", m, {t}, {out}, - baseline_sqrt); + baseline_sqrt, {data.data()}); UNITTEST_LOG(result); UNITTEST_TRUE(result.max_diff[0] < 1e-4f); return ark::unittest::SUCCESS;