Skip to content

Commit

Permalink
Merge pull request #117 from NVIDIA/e2etestandpushimage
Browse files Browse the repository at this point in the history
enhance e2e test and publish image
  • Loading branch information
cdesiniotis authored Sep 25, 2024
2 parents c238934 + e5b4884 commit d06fa8c
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 47 deletions.
162 changes: 119 additions & 43 deletions .github/workflows/precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ jobs:
outputs:
driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
dist: ${{ steps.extract_driver_branch.outputs.dist }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Read driver versions
id: extract_driver_branch
run: |
# get driver-branch
# get driver_branch
DRIVER_BRANCH=("535" "550")
driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .)
echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT
Expand All @@ -41,13 +42,19 @@ jobs:
kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT
precompiled-image:
# get ubuntu distributions
DIST=("ubuntu22.04")
dist_json=$(printf '%s\n' "${DIST[@]}" | jq -R . | jq -cs .)
echo "dist=$dist_json" >> $GITHUB_OUTPUT
precompiled-build-image:
needs: set-driver-version-matrix
runs-on: ubuntu-latest
strategy:
matrix:
driver-branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- uses: actions/checkout@v4
name: Check out code
Expand All @@ -59,7 +66,7 @@ jobs:
REPO_FULL_NAME="${{ github.repository }}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
GENERATE_ARTIFACTS="true"
GENERATE_ARTIFACTS="false"
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
Expand All @@ -79,10 +86,10 @@ jobs:
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: jammy
run: |
make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }}
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver_branch }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
Expand All @@ -93,19 +100,42 @@ jobs:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_ubuntu22.04
DIST: signed_${{ matrix.dist }}
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION}
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} build-${DIST}-${DRIVER_VERSION}
- name: Save build image as a tar
env:
DIST: ${{ matrix.dist }}
PRIVATE_REGISTRY: "ghcr.io"
run: |
source kernel_version.txt
docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}" \
-o ./driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar
# set env for artifacts upload
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=$DIST" >> $GITHUB_ENV
- name: Upload build image as an artifact
uses: actions/upload-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1

determine-e2e-test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
needs:
- precompiled-image
- precompiled-build-image
- set-driver-version-matrix
outputs:
matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
# dist is produced by the set-driver-version-matrix job (listed under needs), not by a step of this job
dist: ${{ needs.set-driver-version-matrix.outputs.dist }}
steps:
- name: Check out code
uses: actions/checkout@v4
Expand All @@ -120,42 +150,27 @@ jobs:
id: set_kernel_version
env:
BASE_TARGET: "jammy"
DIST: "ubuntu22.04"
DIST: ${{ matrix.dist }}
run: |
echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]')
KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]'))
driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
driver_branch=$(echo "$driver_branch_json" | jq -r '.[]')
kernel_versions=()
for kernel_flavor in $kernel_flavors; do
# FIXME -- remove if condition, once azure kernel upgrade starts working
if [[ "$kernel_flavor" == "azure" ]]; then
echo "skipping azure kernel testing"
continue
fi
for DRIVER_BRANCH in $driver_branch; do
source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST"
if [[ "$should_continue" == true ]]; then
echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
break
fi
done
if [[ "$should_continue" == false ]]; then
echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
else
KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
kernel_versions+=("$KERNEL_VERSION")
echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
fi
done
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
source ./tests/scripts/ci-precompiled-helpers.sh
KERNEL_VERSIONS=($(get_kernel_versions_to_test $BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST))
if [ -z "$KERNEL_VERSIONS" ]; then
# no new kernel release
echo "Skipping e2e tests"
exit 0
fi
# Convert array to JSON format and assign
echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
e2e-tests-nvidiadriver:
runs-on: ubuntu-latest
Expand All @@ -166,9 +181,16 @@ jobs:
strategy:
matrix:
kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Holodeck
uses: NVIDIA/[email protected]
env:
Expand All @@ -195,6 +217,15 @@ jobs:
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
KERNEL_VERSION="${{ matrix.kernel_version }}"
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
echo "DIST=${{ matrix.dist }}" >> $GITHUB_ENV
# single quotes keep the double quotes of the interpolated JSON array intact for jq
driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV
- name: Install GitHub CLI
run: |
sudo apt-get update
sudo apt-get install -y gh
- name: Upgrade the kernel for Precompiled e2e test
env:
Expand All @@ -220,23 +251,29 @@ jobs:
- name: Precompiled e2e test gpu driver validation
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true \
--set driver.imagePullPolicy=Never"
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
rc=0
# for precompiled driver we are setting driver branch as driver version
driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
for DRIVER_VERSION in $driver_versions; do
DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }})
for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
image="driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}"
echo "Downloading $image in tests directory"
gh run download --name $image --dir ./tests/
status=0
OPERATOR_OPTIONS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
# add escape character for space
OPERATOR_OPTIONS=$(printf '%q ' "$OPERATOR_OPTIONS")
./tests/ci-run-e2e.sh "${TEST_CASE}" "${OPERATOR_OPTIONS}" || status=$?
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$?
# any non-zero exit code from ci-run-e2e.sh counts as a failed validation, not only 1
if [ "$status" -ne 0 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
rm -f $IMAGE_PATH
done
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
Expand All @@ -248,3 +285,42 @@ jobs:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15

publish-precompiled-image:
runs-on: ubuntu-latest
needs:
- set-driver-version-matrix
- determine-e2e-test-matrix
- e2e-tests-nvidiadriver
strategy:
matrix:
driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Set image vars
run: |
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
echo "DIST=${{ matrix.dist }}" >> $GITHUB_ENV
- name: Download built image artifact
uses: actions/download-artifact@v4
with:
name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}
path: ./

- name: Publish image
run: |
image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}.tar"
echo "uploading $image_path"
docker load -i $image_path
docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}
4 changes: 4 additions & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ fi

# export gpu-operator options
export TEST_CASE_ARGS="$1"
if [[ $# -eq 2 ]]; then
export IMAGE_PATH="$2"
sudo ctr -n k8s.io images import "$IMAGE_PATH"
fi

SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
source "${SCRIPTS_DIR}"/.definitions.sh
Expand Down
2 changes: 1 addition & 1 deletion tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

set -xe

if [[ $# -ne 2 ]]; then
if [[ $# -lt 2 ]]; then
echo "TEST_CASE TEST_CASE_ARGS are required"
exit 1
fi
Expand Down
30 changes: 30 additions & 0 deletions tests/scripts/ci-precompiled-helpers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Print, space-separated on stdout, the kernel versions selected for e2e testing.
#
# Arguments:
#   $1  BASE_TARGET      base target forwarded to findkernelversion.sh
#   $2  KERNEL_FLAVORS   array of kernel flavors, passed by name as "ARRAY[@]"
#   $3  DRIVER_BRANCHES  array of driver branches, passed by name as "ARRAY[@]"
#   $4  DIST             distribution forwarded to findkernelversion.sh
#
# Returns 1, with a usage message on stderr, unless called with exactly four
# arguments. The function is meant to be sourced, so it uses `return` and never
# terminates the calling shell.
get_kernel_versions_to_test() {
    if [[ "$#" -ne 4 ]]; then
        echo " Error:${FUNCNAME[0]} must be called with BASE_TARGET KERNEL_FLAVORS DRIVER_BRANCHES DIST" >&2
        return 1
    fi

    local BASE_TARGET="$1"
    # $2 and $3 hold "NAME[@]"; indirect expansion copies the caller's arrays
    local -a KERNEL_FLAVORS=("${!2}")
    local -a DRIVER_BRANCHES=("${!3}")
    local DIST="$4"

    local -a kernel_versions=()
    local kernel_flavor
    for kernel_flavor in "${KERNEL_FLAVORS[@]}"; do
        # FIXME -- remove if condition, once azure kernel upgrade starts working
        if [[ "$kernel_flavor" == "azure" ]]; then
            continue
        fi
        # start every flavor from a known state so that a value left over from
        # a previous flavor is never reused when there is no branch to check
        should_continue=false
        for DRIVER_BRANCH in "${DRIVER_BRANCHES[@]}"; do
            # the helper's stdout goes to stderr so that only the final list
            # reaches the caller's command substitution; it is expected to set
            # should_continue and KERNEL_VERSION
            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST" >&2
            if [[ "$should_continue" == true ]]; then
                break
            fi
        done
        if [[ "$should_continue" == true ]]; then
            # strip stray spaces/newlines before recording the version
            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
            kernel_versions+=("$KERNEL_VERSION")
        fi
    done
    echo "${kernel_versions[@]}"
}
4 changes: 1 addition & 3 deletions tests/scripts/findkernelversion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export KERNEL_FLAVOR="${2}"
export DRIVER_BRANCH="${3}"
export DIST="${4}"

export REGCTL_VERSION=v0.4.7
export REGCTL_VERSION=v0.7.1
mkdir -p bin
curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
chmod a+x bin/regctl
Expand All @@ -22,8 +22,6 @@ export $(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt)

# calculate driver tag
status=0
echo "regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$""

regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status=$?
if [[ $status -eq 0 ]]; then
export should_continue=false
Expand Down

0 comments on commit d06fa8c

Please sign in to comment.