CI patch: conda-style host-platform names, an H100 test runner, and a
dependency-install step in the fetch_ctk composite action.

Summary of the hunks below:
  fetch_ctk/action.yml    adds an "Install dependencies" step (curl, zstd), switches
                          CUDA_PATH to a relative path resolved with realpath on cache
                          restore, and renames linux-x64/win-x64 to linux-64/win-64.
  ci-gh.yml               unquotes the workflow name and drops the job-level name.
  gh-build-and-test.yml   renames the platforms, comments out all Python versions except
                          3.9 and the cuda.bindings build/upload and artifact-download
                          steps, and adds a "runner" matrix axis with an H100 entry.

diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 3720a853..f29aa105 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -19,6 +19,26 @@ runs:
         echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}" >> $GITHUB_ENV
         echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV
 
+    - name: Install dependencies
+      shell: bash --noprofile --norc -xeuo pipefail {0}
+      run: |
+        # zstd is needed by the cache action; curl is assumed to be used by the CTK download step.
+        if (command -v curl >/dev/null 2>&1) && (command -v zstd >/dev/null 2>&1); then
+          echo "All dependencies are found. Do nothing."
+          exit 0
+        fi
+        # Use a plain variable for the optional sudo prefix: bash does not expand
+        # aliases in non-interactive shells, so "alias SUDO=..." would not work here.
+        SUDO=""
+        if command -v sudo >/dev/null 2>&1; then
+          SUDO="sudo"
+        elif [[ $EUID != 0 ]]; then
+          echo "The following operations require root access."
+          exit 1
+        fi
+        ${SUDO} apt update
+        ${SUDO} apt install -y zstd curl
+
     - name: Download CTK cache
       id: ctk-get-cache
       uses: actions/cache/restore@v4
@@ -32,14 +52,14 @@ runs:
       if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }}
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: |
-        CUDA_PATH="$(pwd)/cuda_toolkit"
+        CUDA_PATH="./cuda_toolkit"
         mkdir $CUDA_PATH
 
         # The binary archives (redist) are guaranteed to be updated as part of the release posting.
         CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
         CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
         if [[ "${{ inputs.host-platform }}" == linux* ]]; then
-          if [[ "${{ inputs.host-platform }}" == "linux-x64" ]]; then
+          if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
             CTK_SUBDIR="linux-x86_64"
           elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
             CTK_SUBDIR="linux-sbsa"
@@ -47,7 +67,7 @@ runs:
           function extract() {
             tar -xvf $1 -C $CUDA_PATH --strip-components=1
           }
-        elif [[ "${{ inputs.host-platform }}" == "win-x64" ]]; then
+        elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
           CTK_SUBDIR="windows-x86_64"
           function extract() {
             _TEMP_DIR_=$(mktemp -d)
@@ -102,12 +122,13 @@ runs:
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: |
         ls -l
-        CUDA_PATH="$(pwd)/cuda_toolkit"
+        CUDA_PATH="./cuda_toolkit"
         tar -xzvf $CTK_CACHE_FILENAME
         ls -l $CUDA_PATH
         if [ ! -d "$CUDA_PATH/include" ]; then
           exit 1
         fi
+        CUDA_PATH=$(realpath ${CUDA_PATH})
 
         echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV
         echo "${CUDA_PATH}/bin" >> $GITHUB_PATH
diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml
index 795d6d0a..81cf6da3 100644
--- a/.github/workflows/ci-gh.yml
+++ b/.github/workflows/ci-gh.yml
@@ -1,4 +1,4 @@
-name: "CI"
+name: CI
 
 concurrency:
   group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }}
@@ -12,7 +12,6 @@ on:
 
 jobs:
   ci:
-    name: "CI"
     uses: ./.github/workflows/gh-build-and-test.yml
     secrets: inherit
 
diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
index 13c5feef..2eaad3ef 100644
--- a/.github/workflows/gh-build-and-test.yml
+++ b/.github/workflows/gh-build-and-test.yml
@@ -5,29 +5,28 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # TODO: align host-platform names with conda convention
         host-platform:
-          - linux-x64
+          - linux-64
           - linux-aarch64
-          - win-x64
+          - win-64
         python-version:
-          - "3.13"
-          - "3.12"
-          - "3.11"
-          - "3.10"
+#          - "3.13"
+#          - "3.12"
+#          - "3.11"
+#          - "3.10"
           - "3.9"
         cuda-version:
           # Note: this is for build-time only.
           - "12.6.2"
-    name: Build (${{ matrix.host-platform }}, Python "${{ matrix.python-version }}")
+    name: Build (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }})
     if: ${{ github.repository_owner == 'nvidia' }}
     permissions:
       id-token: write # This is required for configure-aws-credentials
       contents: read  # This is required for actions/checkout
-    runs-on: ${{ (matrix.host-platform == 'linux-x64' && 'linux-amd64-cpu8') ||
+    runs-on: ${{ (matrix.host-platform == 'linux-64' && 'linux-amd64-cpu8') ||
                  (matrix.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') ||
-                 (matrix.host-platform == 'win-x64' && 'windows-2019') }}
-               # (matrix.host-platform == 'win-x64' && 'windows-amd64-cpu8') }}
+                 (matrix.host-platform == 'win-64' && 'windows-2019') }}
+               # (matrix.host-platform == 'win-64' && 'windows-amd64-cpu8') }}
     outputs:
       BUILD_CTK_VER: ${{ steps.pass_env.outputs.CUDA_VERSION }}
     steps:
@@ -116,47 +115,47 @@ jobs:
           host-platform: ${{ matrix.host-platform }}
           cuda-version: ${{ matrix.cuda-version }}
 
-      - name: Build cuda.bindings wheel
-        uses: pypa/cibuildwheel@v2.22.0
-        env:
-          CIBW_BUILD: ${{ env.CIBW_BUILD }}
-          CIBW_ARCHS_LINUX: "native"
-          CIBW_BUILD_VERBOSITY: 1
-          # CIBW mounts the host filesystem under /host
-          CIBW_ENVIRONMENT_LINUX: >
-            CUDA_PATH=/host/${{ env.CUDA_PATH }}
-            PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
-          CIBW_ENVIRONMENT_WINDOWS: >
-            CUDA_HOME="$(cygpath -w ${{ env.CUDA_PATH }})"
-          # PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
-        with:
-          package-dir: ./cuda_bindings/
-          output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
-
-      - name: List the cuda.bindings artifacts directory
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: |
-          if [[ "${{ matrix.host-platform }}" == win* ]]; then
-            export CHOWN=chown
-          else
-            export CHOWN="sudo chown"
-          fi
-          $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
-          ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
-
-      # TODO: enable this after NVIDIA/cuda-python#297 is resolved
-      # - name: Check cuda.bindings wheel
-      #   shell: bash --noprofile --norc -xeuo pipefail {0}
-      #   run: |
-      #     twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
-
-      - name: Upload cuda.bindings build artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
-          path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
-          if-no-files-found: error
-          overwrite: 'true'
+#      - name: Build cuda.bindings wheel
+#        uses: pypa/cibuildwheel@v2.22.0
+#        env:
+#          CIBW_BUILD: ${{ env.CIBW_BUILD }}
+#          CIBW_ARCHS_LINUX: "native"
+#          CIBW_BUILD_VERBOSITY: 1
+#          # CIBW mounts the host filesystem under /host
+#          CIBW_ENVIRONMENT_LINUX: >
+#            CUDA_PATH=/host/${{ env.CUDA_PATH }}
+#            PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
+#          CIBW_ENVIRONMENT_WINDOWS: >
+#            CUDA_HOME="$(cygpath -w ${{ env.CUDA_PATH }})"
+#          # PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
+#        with:
+#          package-dir: ./cuda_bindings/
+#          output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+#
+#      - name: List the cuda.bindings artifacts directory
+#        shell: bash --noprofile --norc -xeuo pipefail {0}
+#        run: |
+#          if [[ "${{ matrix.host-platform }}" == win* ]]; then
+#            export CHOWN=chown
+#          else
+#            export CHOWN="sudo chown"
+#          fi
+#          $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+#          ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+#
+#      # TODO: enable this after NVIDIA/cuda-python#297 is resolved
+#      # - name: Check cuda.bindings wheel
+#      #   shell: bash --noprofile --norc -xeuo pipefail {0}
+#      #   run: |
+#      #     twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
+#
+#      - name: Upload cuda.bindings build artifacts
+#        uses: actions/upload-artifact@v4
+#        with:
+#          name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
+#          path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
+#          if-no-files-found: error
+#          overwrite: 'true'
 
       - name: Pass environment variables to the next runner
         id: pass_env
@@ -166,31 +165,39 @@ jobs:
   test:
     strategy:
       fail-fast: false
+      # TODO: add driver version here
       matrix:
-        # TODO: align host-platform names with conda convention
         host-platform:
-          - linux-x64
+          - linux-64
           - linux-aarch64
           # TODO: enable testing once win-64 GPU runners are up
-          # - win-x64
+          # - win-64
         python-version:
-          - "3.13"
-          - "3.12"
-          - "3.11"
-          - "3.10"
+#          - "3.13"
+#          - "3.12"
+#          - "3.11"
+#          - "3.10"
           - "3.9"
         cuda-version:
           # Note: this is for test-time only.
           - "12.6.2"
           - "12.0.1"
           - "11.8.0"
-    name: Test (${{ matrix.host-platform }}, CUDA ${{ matrix.cuda-version }}, Python "${{ matrix.python-version }}")
+        runner:
+          - default
+        include:
+          - host-platform: linux-64
+            python-version: "3.12"
+            cuda-version: "12.6.2"
+            runner: H100
+    name: Test (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }}, Runner ${{ matrix.runner }})
     if: ${{ (github.repository_owner == 'nvidia') }}
     permissions:
       id-token: write # This is required for configure-aws-credentials
       contents: read  # This is required for actions/checkout
-    runs-on: ${{ (matrix.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') ||
-                 (matrix.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') }}
+    runs-on: ${{ (matrix.runner == 'default' && matrix.host-platform == 'linux-64' && 'linux-amd64-gpu-v100-latest-1') ||
+                 (matrix.runner == 'default' && matrix.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') ||
+                 (matrix.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1-testing') }}
     # Our self-hosted runners require a container
     # TODO: use a different (nvidia?) container
     container:
@@ -227,42 +234,35 @@ jobs:
           echo "CUDA_BINDINGS_ARTIFACT_NAME=cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ needs.build.outputs.BUILD_CTK_VER }}-${{ matrix.host-platform }}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
 
-      - name: Download bindings build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
-          path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
-
-      - name: Display structure of downloaded bindings artifacts
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: |
-          pwd
-          ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
-
-      - name: Download core build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
-          path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
-
-      - name: Display structure of downloaded core build artifacts
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: |
-          pwd
-          ls -lahR $CUDA_CORE_ARTIFACTS_DIR
+#      - name: Download bindings build artifacts
+#        uses: actions/download-artifact@v4
+#        with:
+#          name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
+#          path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
+#
+#      - name: Display structure of downloaded bindings artifacts
+#        shell: bash --noprofile --norc -xeuo pipefail {0}
+#        run: |
+#          pwd
+#          ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
+#
+#      - name: Download core build artifacts
+#        uses: actions/download-artifact@v4
+#        with:
+#          name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
+#          path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
+#
+#      - name: Display structure of downloaded core build artifacts
+#        shell: bash --noprofile --norc -xeuo pipefail {0}
+#        run: |
+#          pwd
+#          ls -lahR $CUDA_CORE_ARTIFACTS_DIR
 
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
 
-      # The cache action needs this
-      - name: Install zstd
-        shell: bash --noprofile --norc -xeuo pipefail {0}
-        run: |
-          apt update
-          apt install zstd
-
       - name: Set up mini CTK
         uses: ./.github/actions/fetch_ctk
         continue-on-error: false