Skip to content

Pre-compile end-to-end gpu driver validation #156

Pre-compile end-to-end gpu driver validation

Pre-compile end-to-end gpu driver validation #156

Workflow file for this run

# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Runs on pull requests targeting main and on pushes to main.
# The scheduled (cron) trigger is currently commented out.
name: Precompiled images

# Triggers: pull requests against main (opened / synchronize) and pushes to main.
on:
  # The daily cron trigger is commented out for now; uncomment the two lines
  # below to restore scheduled runs.
  # schedule:
  #   - cron: '00 09 * * *' # scheduled job
  pull_request:
    types: [opened, synchronize]
    branches: [main]
  push:
    branches: [main]

jobs:
set-driver-version-matrix:
  # Publishes the two JSON arrays that drive the downstream job matrices:
  #   driver_versions - the text before the first '.' of every DRIVER_VERSIONS
  #                     entry in versions.mk (the driver branch), minus "560"
  #   kernel_flavors  - the kernel flavors listed in KERNEL_FLAVORS below
  runs-on: ubuntu-latest
  outputs:
    driver_versions: ${{ steps.extract_driver_versions.outputs.driver_versions }}
    kernel_flavors: ${{ steps.extract_driver_versions.outputs.kernel_flavors }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Read driver versions
      id: extract_driver_versions
      run: |
        # Everything after the first '=' on the DRIVER_VERSIONS line of versions.mk.
        driver_versions=$(grep '^DRIVER_VERSIONS' versions.mk | cut -d '=' -f2)
        # One branch number per line; blank lines are dropped.
        first_numbers=$(echo "$driver_versions" | tr ' ' '\n' | awk -F '.' '{print $1}' | grep -v '^$')
        # Build a compact (single-line) JSON array, as required for a
        # key=value line in $GITHUB_OUTPUT.
        # FIXME -- remove the "560" filter below once kernel support 560.35.03 version
        first_numbers_json=$(echo "$first_numbers" | jq -R . | jq -cs 'map(select(. != "560"))')
        echo "driver_versions=$first_numbers_json" >> "$GITHUB_OUTPUT"
        # KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
        KERNEL_FLAVORS=("aws" "generic")
        kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
        echo "kernel_flavors=$kernel_flavors_json" >> "$GITHUB_OUTPUT"
pre-compiled:
  # Builds the base image and the precompiled driver image for every
  # (driver branch, kernel flavor) pair published by set-driver-version-matrix.
  #
  # The job is deliberately switched off. GitHub Actions has no `when` key at
  # job level (it fails validation with "Unexpected value 'when'"); a job is
  # disabled with an `if` condition that evaluates to false.
  needs: set-driver-version-matrix
  if: ${{ false }}
  runs-on: ubuntu-latest
  strategy:
    matrix:
      driver: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_versions) }}
      flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
  steps:
    - uses: actions/checkout@v4
      name: Check out code
    - name: Calculate build vars
      id: vars
      run: |
        # Values written to $GITHUB_ENV become environment variables of later steps.
        echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
        echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
        REPO_FULL_NAME="${{ github.repository }}"
        echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
        GENERATE_ARTIFACTS="true"
        echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
        echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
    - name: Set up QEMU
      uses: docker/setup-qemu-action@v3
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Build base image and get kernel version
      env:
        IMAGE_NAME: ghcr.io/nvidia/driver
        # NOTE(review): Actions does not shell-expand env values, so this is the
        # literal text ${COMMIT_SHORT_SHA}; presumably it is expanded later by
        # make -- confirm.
        VERSION: ${COMMIT_SHORT_SHA}
        BASE_TARGET: jammy
      run: |
        make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
        # Always remove the helper container, whatever the outcome of the step.
        trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
        docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
        # try 3 times every 10 seconds to get the file, if success exit the loop
        for i in {1..3}; do
          docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
          sleep 10
        done
        # Without this check the loop ends successfully even when every copy
        # attempt failed, and the error only surfaces in the next step.
        if [ ! -f kernel_version.txt ]; then
          echo "Could not copy /var/kernel_version.txt from the base container"
          exit 1
        fi
    - name: Build image
      env:
        IMAGE_NAME: ghcr.io/nvidia/driver
        VERSION: ${COMMIT_SHORT_SHA}
        PRECOMPILED: "true"
        DIST: signed_ubuntu22.04
      run: |
        # kernel_version.txt was copied out of the base container by the
        # previous step; DRIVER_VERSIONS / DRIVER_VERSION are presumably
        # defined by it -- confirm.
        source kernel_version.txt && \
        make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
e2e-driver-version-compare:
  # Decides which kernel versions need an e2e run.
  # Outputs:
  #   matrix_values           - JSON array of kernel versions to test
  #   matrix_values_not_empty - "1" when at least one flavor needs testing,
  #                             otherwise "0"
  runs-on: ubuntu-latest
  needs:
    # - pre-compiled
    - set-driver-version-matrix
  outputs:
    matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
    matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Set kernel version
      id: set_kernel_version
      env:
        BASE_TARGET: "jammy"
        DIST: "ubuntu22.04"
        # The job outputs already are JSON text. Hand them to the script as
        # environment variables: wrapping them in fromJson() and interpolating
        # the result does not yield JSON, and pasting them between double
        # quotes in the script lets bash strip the inner quotes.
        DRIVER_VERSIONS_JSON: ${{ needs.set-driver-version-matrix.outputs.driver_versions }}
        KERNEL_FLAVORS_JSON: ${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}
      run: |
        # NOTE(review): presumably read by findkernelversion.sh; the login step
        # above authenticates against ghcr.io, not nvcr.io -- confirm.
        export PRIVATE_REGISTRY="nvcr.io"
        # Default; a later line with the same key overrides it.
        echo "matrix_values_not_empty=0" >> "$GITHUB_OUTPUT"

        # FIXME -- remove the azure filter once azure kernel upgrade starts working
        kernel_flavors=$(echo "$KERNEL_FLAVORS_JSON" | jq -r 'map(select(. != "azure")) | .[]')
        driver_versions=$(echo "$DRIVER_VERSIONS_JSON" | jq -r '.[]')
        echo "kernel flavors:" $kernel_flavors
        echo "driver versions:" $driver_versions

        kernel_versions=()
        for kernel_flavor in $kernel_flavors; do
          for driver_version in $driver_versions; do
            driver_branch=$(echo "${driver_version}" | cut -d '.' -f 1)
            echo "$BASE_TARGET $kernel_flavor $driver_branch"
            # Sourced so that the should_continue and KERNEL_VERSION variables
            # it sets are visible below.
            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$driver_branch" "$DIST"
            if [[ "$should_continue" == true ]]; then
              echo "matrix_values_not_empty=1" >> "$GITHUB_OUTPUT"
              break
            fi
          done
          if [[ "$should_continue" == false ]]; then
            echo "The last successful e2e-tests-nvidiadriver was on the same tag ($KERNEL_VERSION). Skipping e2e-tests-nvidiadriver."
          else
            # remove any space , newlines for json format
            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
            kernel_versions+=("$KERNEL_VERSION")
            echo "Proceeding with $kernel_flavor $KERNEL_VERSION e2e-tests-nvidiadriver."
          fi
        done

        # Convert array to JSON format and assign. An empty bash array must map
        # to [] explicitly: printf would otherwise emit one empty line, which
        # jq turns into [""].
        if [ "${#kernel_versions[@]}" -eq 0 ]; then
          matrix_values='[]'
        else
          matrix_values=$(printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -cs .)
        fi
        echo "$matrix_values" > "$GITHUB_WORKSPACE/matrix_values.json"
        echo "matrix_values=$matrix_values" >> "$GITHUB_OUTPUT"
e2e-tests-nvidiadriver:
  # For each kernel version selected by e2e-driver-version-compare: provision a
  # Holodeck instance, upgrade its kernel, then validate the precompiled driver
  # for every entry of DRIVER_VERSIONS in versions.mk.
  runs-on: ubuntu-latest
  needs: e2e-driver-version-compare
  if: ${{ needs.e2e-driver-version-compare.outputs.matrix_values_not_empty == '1' }}
  strategy:
    matrix:
      kernel_version: ${{ fromJson(needs.e2e-driver-version-compare.outputs.matrix_values) }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Set up Holodeck
      # NOTE(review): the version ref was mangled to "[email protected]" in the
      # captured source (email obfuscation). v0.2.1 is a best guess -- confirm
      # the tag against the repository before merging.
      uses: NVIDIA/holodeck@v0.2.1
      env:
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
        holodeck_config: "tests/holodeck.yaml"
    - name: Get public dns name
      id: get_public_dns_name
      # NOTE(review): tracks a moving branch; consider pinning to a release tag.
      uses: mikefarah/yq@master
      with:
        cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
    - name: Set and Calculate test vars
      env:
        # Passed through the environment rather than pasted into the script,
        # so the key material is never parsed by the shell as script text.
        AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      run: |
        echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
        echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
        echo "${AWS_SSH_KEY}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
        echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
        DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
        echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
        echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
        KERNEL_VERSION="${{ matrix.kernel_version }}"
        echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
    - name: Precompiled e2e test upgrade kernel and Validate gpu driver
      env:
        UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
        TEST_CASE: "./tests/cases/nvidia-driver.sh"
        OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
      run: |
        # rc remembers the last failure so that the remaining driver versions
        # still run and the step fails at the end.
        rc=0
        for driver_version in ${DRIVER_VERSIONS}; do
          echo "Running e2e for DRIVER_VERSION=$driver_version"
          DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
          # Use ARG3=OPERATOR_OPTIONS as KERNEL_VERSION in case of kernel upgrade
          status=0
          ./tests/ci-run-e2e.sh "${UPGRADE_KERNEL_SCRIPT}" "${DRIVER_VERSION}" "${KERNEL_VERSION}" || status=$?
          # On the target system, all scripts/test-case exit with code 1 for error handling.
          # However, since reboot-related disconnections break the SSH connection
          # and can cause the entire job to exit, we should ignore all errors except
          # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
          # other errors as code 1 will ensure proper management of reboot scenarios
          if [ $status -eq 1 ]; then
            echo "e2e validation failed for driver branch $DRIVER_VERSION and kernel version $KERNEL_VERSION with status $status"
            rc=$status
            continue
          fi
          # Reset before the next command: a tolerated non-1 status from the
          # upgrade (e.g. the SSH drop on reboot) must not be mistaken for a
          # failure of remote_retry.sh.
          status=0
          ./tests/scripts/remote_retry.sh || status=$?
          if [ $status -ne 0 ]; then
            echo "Failed to connect to aws instance"
            exit 1
          fi
          # status is 0 here, so the check below reflects the test case only.
          ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
          if [ $status -eq 1 ]; then
            echo "e2e validation failed for driver version $driver_version with status $status"
            rc=$status
          fi
        done
        # Fetch the remote logs so the archive step below has something to upload.
        ./tests/scripts/pull.sh /tmp/logs logs
        exit $rc
    - name: Archive test logs
      if: ${{ failure() }}
      uses: actions/upload-artifact@v4
      with:
        name: nvidiadriver-Precompiled-e2e-test-logs
        path: ./logs/
        retention-days: 15