Skip to content

Commit

Permalink
name cleanup + add H100 runner + install curl
Browse files Browse the repository at this point in the history
  • Loading branch information
leofang committed Dec 16, 2024
1 parent 61813fa commit 92cf593
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 98 deletions.
29 changes: 25 additions & 4 deletions .github/actions/fetch_ctk/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,26 @@ runs:
echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}" >> $GITHUB_ENV
echo "CTK_CACHE_FILENAME=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}.tar.gz" >> $GITHUB_ENV
- name: Install dependencies
  shell: bash --noprofile --norc -xeuo pipefail {0}
  run: |
    # Make sure curl and zstd are available before the later steps run.
    # Nothing to do when both executables are already on PATH.
    if command -v curl >/dev/null 2>&1 && command -v zstd >/dev/null 2>&1; then
      echo "All dependencies are found. Do nothing."
      exit 0
    fi

    # Pick how to run privileged commands. A shell function is used instead
    # of an alias because bash does not expand aliases in non-interactive
    # shells (such as this step) unless `shopt -s expand_aliases` is set,
    # which would make `SUDO apt ...` fail with "command not found".
    if command -v sudo >/dev/null 2>&1; then
      SUDO() { sudo "$@"; }
    elif [[ $EUID == 0 ]]; then
      # Already root (e.g. inside a container without sudo): run directly.
      SUDO() { "$@"; }
    else
      echo "The following operations require root access."
      exit 1
    fi

    SUDO apt update
    SUDO apt install -y zstd curl
- name: Download CTK cache
id: ctk-get-cache
uses: actions/cache/restore@v4
Expand All @@ -32,22 +52,22 @@ runs:
if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }}
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
CUDA_PATH="$(pwd)/cuda_toolkit"
CUDA_PATH="./cuda_toolkit"
mkdir $CUDA_PATH
# The binary archives (redist) are guaranteed to be updated as part of the release posting.
CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
if [[ "${{ inputs.host-platform }}" == linux* ]]; then
if [[ "${{ inputs.host-platform }}" == "linux-x64" ]]; then
if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
CTK_SUBDIR="linux-x86_64"
elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
CTK_SUBDIR="linux-sbsa"
fi
function extract() {
tar -xvf $1 -C $CUDA_PATH --strip-components=1
}
elif [[ "${{ inputs.host-platform }}" == "win-x64" ]]; then
elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
CTK_SUBDIR="windows-x86_64"
function extract() {
_TEMP_DIR_=$(mktemp -d)
Expand Down Expand Up @@ -102,12 +122,13 @@ runs:
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
ls -l
CUDA_PATH="$(pwd)/cuda_toolkit"
CUDA_PATH="./cuda_toolkit"
tar -xzvf $CTK_CACHE_FILENAME
ls -l $CUDA_PATH
if [ ! -d "$CUDA_PATH/include" ]; then
exit 1
fi
CUDA_PATH=$(realpath ${CUDA_PATH})
echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV
echo "${CUDA_PATH}/bin" >> $GITHUB_PATH
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/ci-gh.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "CI"
name: CI

concurrency:
group: ${{ startsWith(github.ref_name, 'main') && format('unique-{0}', github.run_id) || format('ci-build-and-test-on-{0}-from-{1}', github.event_name, github.ref_name) }}
Expand All @@ -12,7 +12,6 @@ on:

jobs:
ci:
name: "CI"
uses:
./.github/workflows/gh-build-and-test.yml
secrets: inherit
184 changes: 92 additions & 92 deletions .github/workflows/gh-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,28 @@ jobs:
strategy:
fail-fast: false
matrix:
# TODO: align host-platform names with conda convention
host-platform:
- linux-x64
- linux-64
- linux-aarch64
- win-x64
- win-64
python-version:
- "3.13"
- "3.12"
- "3.11"
- "3.10"
# - "3.13"
# - "3.12"
# - "3.11"
# - "3.10"
- "3.9"
cuda-version:
# Note: this is for build-time only.
- "12.6.2"
name: Build (${{ matrix.host-platform }}, Python "${{ matrix.python-version }}")
name: Build (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }})
if: ${{ github.repository_owner == 'nvidia' }}
permissions:
id-token: write # This is required for configure-aws-credentials
contents: read # This is required for actions/checkout
runs-on: ${{ (matrix.host-platform == 'linux-x64' && 'linux-amd64-cpu8') ||
runs-on: ${{ (matrix.host-platform == 'linux-64' && 'linux-amd64-cpu8') ||
(matrix.host-platform == 'linux-aarch64' && 'linux-arm64-cpu8') ||
(matrix.host-platform == 'win-x64' && 'windows-2019') }}
# (matrix.host-platform == 'win-x64' && 'windows-amd64-cpu8') }}
(matrix.host-platform == 'win-64' && 'windows-2019') }}
# (matrix.host-platform == 'win-64' && 'windows-amd64-cpu8') }}
outputs:
BUILD_CTK_VER: ${{ steps.pass_env.outputs.CUDA_VERSION }}
steps:
Expand Down Expand Up @@ -116,47 +115,47 @@ jobs:
host-platform: ${{ matrix.host-platform }}
cuda-version: ${{ matrix.cuda-version }}

- name: Build cuda.bindings wheel
uses: pypa/[email protected]
env:
CIBW_BUILD: ${{ env.CIBW_BUILD }}
CIBW_ARCHS_LINUX: "native"
CIBW_BUILD_VERBOSITY: 1
# CIBW mounts the host filesystem under /host
CIBW_ENVIRONMENT_LINUX: >
CUDA_PATH=/host/${{ env.CUDA_PATH }}
PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
CIBW_ENVIRONMENT_WINDOWS: >
CUDA_HOME="$(cygpath -w ${{ env.CUDA_PATH }})"
# PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
with:
package-dir: ./cuda_bindings/
output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}

- name: List the cuda.bindings artifacts directory
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
if [[ "${{ matrix.host-platform }}" == win* ]]; then
export CHOWN=chown
else
export CHOWN="sudo chown"
fi
$CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
# TODO: enable this after NVIDIA/cuda-python#297 is resolved
# - name: Check cuda.bindings wheel
# shell: bash --noprofile --norc -xeuo pipefail {0}
# run: |
# twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl

- name: Upload cuda.bindings build artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
if-no-files-found: error
overwrite: 'true'
# - name: Build cuda.bindings wheel
# uses: pypa/[email protected]
# env:
# CIBW_BUILD: ${{ env.CIBW_BUILD }}
# CIBW_ARCHS_LINUX: "native"
# CIBW_BUILD_VERBOSITY: 1
# # CIBW mounts the host filesystem under /host
# CIBW_ENVIRONMENT_LINUX: >
# CUDA_PATH=/host/${{ env.CUDA_PATH }}
# PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
# CIBW_ENVIRONMENT_WINDOWS: >
# CUDA_HOME="$(cygpath -w ${{ env.CUDA_PATH }})"
# # PARALLEL_LEVEL=${{ env.PARALLEL_LEVEL }}
# with:
# package-dir: ./cuda_bindings/
# output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
#
# - name: List the cuda.bindings artifacts directory
# shell: bash --noprofile --norc -xeuo pipefail {0}
# run: |
# if [[ "${{ matrix.host-platform }}" == win* ]]; then
# export CHOWN=chown
# else
# export CHOWN="sudo chown"
# fi
# $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
# ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
#
# # TODO: enable this after NVIDIA/cuda-python#297 is resolved
# # - name: Check cuda.bindings wheel
# # shell: bash --noprofile --norc -xeuo pipefail {0}
# # run: |
# # twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
#
# - name: Upload cuda.bindings build artifacts
# uses: actions/upload-artifact@v4
# with:
# name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
# path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
# if-no-files-found: error
# overwrite: 'true'

- name: Pass environment variables to the next runner
id: pass_env
Expand All @@ -166,31 +165,39 @@ jobs:
test:
strategy:
fail-fast: false
# TODO: add driver version here
matrix:
# TODO: align host-platform names with conda convention
host-platform:
- linux-x64
- linux-64
- linux-aarch64
# TODO: enable testing once win-64 GPU runners are up
# - win-x64
# - win-64
python-version:
- "3.13"
- "3.12"
- "3.11"
- "3.10"
# - "3.13"
# - "3.12"
# - "3.11"
# - "3.10"
- "3.9"
cuda-version:
# Note: this is for test-time only.
- "12.6.2"
- "12.0.1"
- "11.8.0"
name: Test (${{ matrix.host-platform }}, CUDA ${{ matrix.cuda-version }}, Python "${{ matrix.python-version }}")
runner:
- default
include:
- host-platform: linux-64
python-version: "3.12"
cuda-version: "12.6.2"
runner: H100
name: Test (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }}, Runner ${{ matrix.runner }})
if: ${{ (github.repository_owner == 'nvidia') }}
permissions:
id-token: write # This is required for configure-aws-credentials
contents: read # This is required for actions/checkout
runs-on: ${{ (matrix.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') ||
(matrix.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') }}
runs-on: ${{ (matrix.runner == 'default' && matrix.host-platform == 'linux-64' && 'linux-amd64-gpu-v100-latest-1') ||
(matrix.runner == 'default' && matrix.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') ||
(matrix.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1-testing') }}
# Our self-hosted runners require a container
# TODO: use a different (nvidia?) container
container:
Expand Down Expand Up @@ -227,42 +234,35 @@ jobs:
echo "CUDA_BINDINGS_ARTIFACT_NAME=cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ needs.build.outputs.BUILD_CTK_VER }}-${{ matrix.host-platform }}-${{ github.sha }}" >> $GITHUB_ENV
echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
- name: Download bindings build artifacts
uses: actions/download-artifact@v4
with:
name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}

- name: Display structure of downloaded bindings artifacts
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
pwd
ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
- name: Download core build artifacts
uses: actions/download-artifact@v4
with:
name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}

- name: Display structure of downloaded core build artifacts
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
pwd
ls -lahR $CUDA_CORE_ARTIFACTS_DIR
# - name: Download bindings build artifacts
# uses: actions/download-artifact@v4
# with:
# name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
# path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
#
# - name: Display structure of downloaded bindings artifacts
# shell: bash --noprofile --norc -xeuo pipefail {0}
# run: |
# pwd
# ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR
#
# - name: Download core build artifacts
# uses: actions/download-artifact@v4
# with:
# name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
# path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
#
# - name: Display structure of downloaded core build artifacts
# shell: bash --noprofile --norc -xeuo pipefail {0}
# run: |
# pwd
# ls -lahR $CUDA_CORE_ARTIFACTS_DIR

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

# The cache action needs this
- name: Install zstd
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
apt update
apt install zstd
- name: Set up mini CTK
uses: ./.github/actions/fetch_ctk
continue-on-error: false
Expand Down

0 comments on commit 92cf593

Please sign in to comment.