Skip to content

Commit

Permalink
Pre-compiled end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Sep 2, 2024
1 parent 404a65b commit 5d8fc4e
Show file tree
Hide file tree
Showing 15 changed files with 335 additions and 23 deletions.
165 changes: 165 additions & 0 deletions .github/workflows/ci-precompiled.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow header: display name and the events that start the pre-compiled
# driver end-to-end run.
name: Pre-Compiled End-to-end tests

on:
schedule:
- cron: '00 10 * * *' # every day at 10:00 UTC; presumably chosen to follow the precompiled build job that runs at 09:00 UTC (confirm)
# NOTE(review): "SHIVA" appears to mark a temporary debug trigger -- pushes to
# the e2etestdriver branch also start this workflow. Confirm whether it should
# remain before merging.
push:
branches:
- e2etestdriver

jobs:
  # Job 1: for every kernel flavor, ask findkernelversion.sh whether the
  # current precompiled kernel version still needs an e2e run, and publish the
  # resulting kernel versions as a JSON array for the matrix of job 2.
  e2e-driver-version-compare:
    runs-on: ubuntu-latest
    outputs:
      matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
      matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set kernel version
        id: set_kernel_version
        env:
          BASE_TARGET: "jammy"
          DIST: "ubuntu22.04"
        run: |
          export PRIVATE_REGISTRY="ghcr.io"
          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
          # currently azure image upgrade is failing
          # KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
          KERNEL_FLAVORS=("aws" "generic" "nvidia" "oracle")
          # Default to "nothing to test"; a later line for the same key wins.
          echo "matrix_values_not_empty=0" >> "$GITHUB_OUTPUT"
          kernel_versions=()
          for KERNEL_FLAVOR in "${KERNEL_FLAVORS[@]}"; do
            for driver_version in ${DRIVER_VERSIONS}; do
              # Only the major part of the driver version names the branch.
              DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
              # Sourced: the script hands back KERNEL_VERSION and should_continue.
              source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${KERNEL_FLAVOR}" "$DRIVER_VERSION" "$DIST"
              if [[ "$should_continue" == true ]]; then
                echo "matrix_values_not_empty=1" >> "$GITHUB_OUTPUT"
                break
              fi
            done
            if [[ "$should_continue" == false ]]; then
              echo "The last successful e2e-tests-nvidiadriver was on the same tag ($KERNEL_VERSION). Skipping e2e-tests-nvidiadriver."
            else
              # remove any space , newlines for json format
              KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
              kernel_versions+=("$KERNEL_VERSION")
              echo "Proceeding with $KERNEL_FLAVOR $KERNEL_VERSION e2e-tests-nvidiadriver."
            fi
          done
          # Convert array to compact JSON. --args yields [] for an empty array,
          # whereas piping printf through `jq -R` would yield [""].
          jq -cn '$ARGS.positional' --args "${kernel_versions[@]}" > "$GITHUB_WORKSPACE/matrix_values.json"
          echo "matrix_values=$(cat "$GITHUB_WORKSPACE/matrix_values.json")" >> "$GITHUB_OUTPUT"

  # Job 2: one run per kernel version from job 1. Provisions an instance via
  # Holodeck, upgrades its kernel, waits for it to come back, then validates
  # the precompiled GPU driver for every driver branch.
  e2e-tests-nvidiadriver:
    runs-on: ubuntu-latest
    needs: e2e-driver-version-compare
    if: ${{ needs.e2e-driver-version-compare.outputs.matrix_values_not_empty == '1' }}
    strategy:
      matrix:
        kernel_version: ${{ fromJson(needs.e2e-driver-version-compare.outputs.matrix_values) }}
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Holodeck
        # NOTE(review): the action ref below was mangled by e-mail obfuscation
        # in this capture; restore the real pinned NVIDIA/holodeck@<tag>.
        uses: NVIDIA/[email protected]
        env:
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
          holodeck_config: "tests/holodeck.yaml"

      - name: Get public dns name
        id: get_public_dns_name
        uses: mikefarah/yq@master
        with:
          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

      - name: Set and Calculate test vars
        run: |
          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
          echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
          echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
          KERNEL_VERSION="${{ matrix.kernel_version }}"
          echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV

      - name: Precompiled e2e test upgrade kernel and Validate gpu driver
        env:
          UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
          TEST_CASE: "./tests/cases/nvidia-driver.sh"
          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
        run: |
          rc=0
          for driver_version in ${DRIVER_VERSIONS}; do
            echo "Running e2e for DRIVER_VERSION=$driver_version"
            DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
            # Use ARG3=OPERATOR_OPTIONS as KERNEL_VERSION in case of kernel upgrade
            status=0
            ./tests/ci-run-e2e.sh "${UPGRADE_KERNEL_SCRIPT}" "${DRIVER_VERSION}" "${KERNEL_VERSION}" || status=$?
            # On the target system, all scripts/test-case exit with code 1 for error handling.
            # However, since reboot-related disconnections break the SSH connection
            # and can cause the entire job to exit, we should ignore all errors except
            # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
            # other errors as code 1 will ensure proper management of reboot scenarios
            if [ $status -eq 1 ]; then
              echo "e2e validation failed for driver branch $DRIVER_VERSION and kernel version $KERNEL_VERSION with status $status"
              rc=$status
              continue
            fi
            # Reset first: a tolerated non-1 code from the reboot step above
            # must not be mistaken for a failure of the reconnect check.
            status=0
            ./tests/scripts/remote_retry.sh || status=$?
            if [ $status -ne 0 ]; then
              echo "Failed to connect to aws instance"
              exit 1
            fi
            # sleep 120
            status=0
            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
            if [ $status -eq 1 ]; then
              echo "e2e validation failed for driver version $driver_version with status $status"
              rc=$status
            fi
          done
          ./tests/scripts/pull.sh /tmp/logs logs
          # Non-zero if any driver branch failed, so the log archive step runs.
          exit $rc

      - name: Archive test logs
        if: ${{ failure() }}
        uses: actions/upload-artifact@v4
        with:
          name: nvidiadriver-Precompiled-e2e-test-logs
          path: ./logs/
          retention-days: 15
14 changes: 8 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,33 @@ jobs:
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

- name: Set and Calculate test vars
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
run: |
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
status=0
./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
Expand All @@ -80,4 +82,4 @@ jobs:
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
retention-days: 15
11 changes: 6 additions & 5 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

set -xe

if [[ $# -ne 2 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION are required"
if [[ $# -ne 3 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required"
echo "Use OPERATOR_OPTIONS as KERNEL_FLAVOR in case of kernel upgrade"
exit 1
fi

export TEST_CASE=${1}
export TARGET_DRIVER_VERSION=${2}

export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"
export OPERATOR_OPTIONS="${3}"

TEST_DIR="$(pwd)/tests"

Expand Down
1 change: 1 addition & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" \
${TEST_CASE}
7 changes: 5 additions & 2 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${TEST_NAMESPACE:="test-operator"}

: ${PRIVATE_REGISTRY:="ghcr.io"}

: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}
Expand All @@ -24,3 +22,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}

: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"}
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}
4 changes: 4 additions & 0 deletions tests/scripts/.local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
# Run the given command through ${SCRIPT_DIR}/remote.sh, prefixed with
# "cd ${PROJECT} && ".
#
# Quoting note: "$@" sits *outside* the double-quoted prefix, so the first
# argument is glued onto the "cd ${PROJECT} && " word and every further
# argument is passed to remote.sh as a separate, word-split argument.
# NOTE(review): presumably remote.sh joins its arguments into a single remote
# command line -- confirm before changing the quoting here.
function remote() {
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
}

# Run ${SCRIPT_DIR}/remote_retry.sh with no arguments and return its exit
# status. The path is quoted so a SCRIPT_DIR containing spaces is not split.
# NOTE(review): presumably the script waits until the remote instance is
# reachable again -- confirm against remote_retry.sh.
function remote_retry() {
    "${SCRIPT_DIR}/remote_retry.sh"
}
4 changes: 0 additions & 4 deletions tests/scripts/.rsync-excludes

This file was deleted.

2 changes: 2 additions & 0 deletions tests/scripts/.rsync-includes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
tests/
tests/***
2 changes: 1 addition & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

Expand Down
32 changes: 32 additions & 0 deletions tests/scripts/findkernelversion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#
# Work out whether the precompiled-driver e2e test has to run for one
# (kernel flavor, driver branch) pair.
#
# Meant to be sourced, so results are handed back through exported variables:
#   KERNEL_VERSION   value of KERNEL_VERSION= found in /var/kernel_version.txt
#                    of ghcr.io/nvidia/driver:base-<target>-<flavor>-<branch>
#   should_continue  "false" when ${PRIVATE_REGISTRY}/nvidia/driver already has
#                    a tag <DRIVER_BRANCH>-<KERNEL_VERSION>-<DIST>, else "true"
#
# Positional arguments (all required, in this order):
#   $1  BASE_TARGET
#   $2  KERNEL_FLAVOR
#   $3  DRIVER_BRANCH
#   $4  DIST
#
# Environment:
#   PRIVATE_REGISTRY  registry host whose nvidia/driver tags are listed

if [[ $# -ne 4 ]]; then
    echo " BASE_TARGET KERNEL_FLAVOR DRIVER_BRANCH DIST are required"
    # `return` keeps a sourcing shell alive; fall back to `exit` when executed.
    return 1 2>/dev/null || exit 1
fi

export BASE_TARGET="${1}"
export KERNEL_FLAVOR="${2}"
export DRIVER_BRANCH="${3}"
export DIST="${4}"

export REGCTL_VERSION=v0.4.7
# This file may be sourced repeatedly in one shell; download regctl only once.
# -f makes curl fail on HTTP errors instead of saving an error page as binary.
if [[ ! -x bin/regctl ]]; then
    mkdir -p bin
    curl -fsSLo bin/regctl "https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64"
    chmod a+x bin/regctl
fi
export PATH="$(pwd)/bin:${PATH}"

# calculate kernel version of latest image
regctl image get-file "ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH}" /var/kernel_version.txt ./kernel_version.txt
# Assign explicitly so a file without a match cannot leave a stale
# KERNEL_VERSION behind from an earlier invocation in the same shell.
kernel_version_assignment=$(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt | tail -n 1) || true
export KERNEL_VERSION="${kernel_version_assignment#KERNEL_VERSION=}"
if [[ -z "${KERNEL_VERSION}" ]]; then
    echo "KERNEL_VERSION not found in kernel_version.txt of base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH}" >&2
    return 1 2>/dev/null || exit 1
fi

# calculate driver tag
# -F/-x: match the whole tag literally (dots in versions are not wildcards).
status=0
regctl tag ls "${PRIVATE_REGISTRY}"/nvidia/driver | grep -Fx -- "${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}" || status=$?
if [[ $status -eq 0 ]]; then
    # Tag already published for this kernel: nothing new to validate.
    export should_continue=false
else
    export should_continue=true
fi
12 changes: 8 additions & 4 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then
exit 0
fi

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.version=${TARGET_DRIVER_VERSION}"

# add helm driver repo
helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
Expand All @@ -17,8 +21,8 @@ helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
kubectl create namespace "${TEST_NAMESPACE}"

# Run the helm install command
echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
${HELM} install gpu-operator nvidia/gpu-operator \
echo "OPERATOR_OPTIONS: ${OPERATOR_OPTIONS}"
eval ${HELM} install gpu-operator nvidia/gpu-operator \
-n "${TEST_NAMESPACE}" \
${OPERATOR_OPTIONS} \
"${OPERATOR_OPTIONS}" \
--wait
56 changes: 56 additions & 0 deletions tests/scripts/kernel-upgrade-helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
#
# Install the requested kernel on the test machine and reboot into it.
#
# Environment:
#   SKIP_INSTALL      "true" skips everything and exits 0
#   OPERATOR_OPTIONS  carries the target kernel version for this script
#                     (it is copied into KERNEL_VERSION below)
#
# Exit status: 0 when no upgrade is needed or the reboot was issued,
#              1 on invalid input or a failed package installation.

if [[ "${SKIP_INSTALL}" == "true" ]]; then
    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

export KERNEL_VERSION="${OPERATOR_OPTIONS}"

# A kernel version is a single token used in package names below; reject an
# empty value or one with whitespace (e.g. real helm options).
if [[ -z "${KERNEL_VERSION}" || "${KERNEL_VERSION}" =~ [[:space:]] ]]; then
    echo "Invalid KERNEL_VERSION: '${KERNEL_VERSION}'"
    exit 1
fi

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"

# Never let the rm -rf globs below degrade to /boot/* or /lib/modules/*.
if [[ -z "${CURRENT_KERNEL}" ]]; then
    echo "Unable to determine the current kernel version."
    exit 1
fi

if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
    echo ""
    echo ""
    echo "--------------Upgrading kernel to ${KERNEL_VERSION}--------------"

    # Set non-interactive frontend for apt and disable editor prompts
    # Perform the installation non-interactively
    export DEBIAN_FRONTEND=noninteractive
    export EDITOR=/bin/true
    echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections

    sudo apt-get update -y || true

    # The removal of the currently running kernel (apt remove linux-image-*) sometimes works and sometimes does not.
    # Occasionally, it requires two reboots, or an apt upgrade. However, removing all traces of the old/current
    # kernel from the boot directory works consistently, which is why this approach has been adopted.
    sudo rm -rf /boot/*"${CURRENT_KERNEL}"* || true
    sudo rm -rf /lib/modules/*"${CURRENT_KERNEL}"*
    sudo rm -rf /boot/*.old

    #install new kernel
    if ! sudo apt-get install --allow-downgrades -y \
            "linux-image-${KERNEL_VERSION}" \
            "linux-headers-${KERNEL_VERSION}" \
            "linux-modules-${KERNEL_VERSION}"; then
        echo "Kernel upgrade failed."
        exit 1
    fi
    echo "update grub and initramfs..."
    sudo update-grub || true
    sudo update-initramfs -u -k "${KERNEL_VERSION}" || true
    echo "Rebooting ..."
    # Run the reboot command with nohup to avoid abrupt SSH closure issues
    nohup sudo reboot &

    echo "--------------Kernel upgrade completed--------------"
else
    echo "--------------Kernel upgrade not required, current kernel version ${KERNEL_VERSION}--------------"
fi

# Exit with a success code since the reboot command was issued successfully
exit 0
Loading

0 comments on commit 5d8fc4e

Please sign in to comment.