Skip to content

Commit ed8ced1

Browse files
committed
Pre-compiled end-to-end gpu driver validation
Signed-off-by: shiva kumar <[email protected]>
1 parent b5d38ba commit ed8ced1

18 files changed

+394
-44
lines changed

.github/workflows/ci.yaml

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,31 +46,36 @@ jobs:
4646
id: get_public_dns_name
4747
uses: mikefarah/yq@master
4848
with:
49-
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
49+
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
5050

5151
- name: Set and Calculate test vars
5252
run: |
5353
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
5454
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
55+
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
5556
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
5657
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
5758
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
58-
59+
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
60+
5961
- name: Validate gpu driver
6062
env:
6163
TEST_CASE: "./tests/cases/nvidia-driver.sh"
64+
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
6265
run: |
63-
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
6466
rc=0
65-
for driver_version in ${DRIVER_VERSIONS}; do
66-
echo "Running e2e for DRIVER_VERSION=$driver_version"
67-
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
67+
for DRIVER_VERSION in ${DRIVER_VERSIONS}; do
68+
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
69+
status=0
70+
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${COMMIT_SHORT_SHA}-${DRIVER_VERSION}"
71+
# add escape character for space
72+
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
73+
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$?
6874
if [ $status -ne 0 ]; then
69-
echo "e2e validation failed for driver version $driver_version with status $status"
75+
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
7076
rc=$status
7177
fi
7278
done
73-
source ./tests/scripts/.definitions.sh
7479
./tests/scripts/pull.sh /tmp/logs logs
7580
exit $rc
7681
@@ -80,4 +85,4 @@ jobs:
8085
with:
8186
name: nvidiadriver-e2e-test-logs
8287
path: ./logs/
83-
retention-days: 15
88+
retention-days: 15

.github/workflows/precompiled.yaml

Lines changed: 179 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,34 @@ on:
2020
- cron: '00 09 * * *' # scheduled job
2121

2222
jobs:
23-
pre-compiled:
23+
set-driver-version-matrix:
  # Publishes the driver branches and kernel flavors as JSON arrays so that
  # later jobs can feed them to `fromJson` in their build matrix.
  runs-on: ubuntu-latest
  outputs:
    driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
    kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Read driver versions
      id: extract_driver_branch
      run: |
        # Serialise the given arguments as a compact JSON array of strings.
        to_json_array() {
          printf '%s\n' "$@" | jq -R . | jq -cs .
        }

        supported_branches=("535" "550")
        supported_flavors=("aws" "azure" "generic" "nvidia" "oracle")

        # Both step outputs are appended in one go.
        {
          echo "driver_branch=$(to_json_array "${supported_branches[@]}")"
          echo "kernel_flavors=$(to_json_array "${supported_flavors[@]}")"
        } >> "$GITHUB_OUTPUT"
43+
44+
precompiled-image:
45+
needs: set-driver-version-matrix
2446
runs-on: ubuntu-latest
2547
strategy:
2648
matrix:
27-
driver:
28-
- 535
29-
- 550
30-
flavor:
31-
- aws
32-
- azure
33-
- generic
34-
- nvidia
35-
- oracle
49+
driver-branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
50+
flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
3651
steps:
3752
- uses: actions/checkout@v4
3853
name: Check out code
@@ -64,10 +79,10 @@ jobs:
6479
VERSION: ${COMMIT_SHORT_SHA}
6580
BASE_TARGET: jammy
6681
run: |
67-
make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
82+
make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
6883
6984
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
70-
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
85+
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }}
7186
# try 3 times every 10 seconds to get the file, if success exit the loop
7287
for i in {1..3}; do
7388
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
@@ -81,4 +96,155 @@ jobs:
8196
DIST: signed_ubuntu22.04
8297
run: |
8398
source kernel_version.txt && \
84-
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
99+
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION}
100+
101+
determine-e2e-test-matrix:
  # Builds the list of kernel versions that the e2e job should iterate over.
  # Outputs:
  #   matrix_values            JSON array of kernel versions to test
  #   matrix_values_not_empty  "1" when at least one version was selected,
  #                            "0" otherwise (used as the e2e job's `if`)
  runs-on: ubuntu-latest
  needs:
    - precompiled-image
    - set-driver-version-matrix
  outputs:
    matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
    matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set kernel version
      id: set_kernel_version
      env:
        BASE_TARGET: "jammy"
        DIST: "ubuntu22.04"
      run: |
        # Default to "nothing to test". The loop below appends a second
        # matrix_values_not_empty=1 line once a tag qualifies; this relies on
        # the later entry taking precedence.
        echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT

        kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
        kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]')
        driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
        driver_branch=$(echo "$driver_branch_json" | jq -r '.[]')

        kernel_versions=()
        for kernel_flavor in $kernel_flavors; do
          # FIXME -- remove if condition, once azure kernel upgrade starts working
          if [[ "$kernel_flavor" == "azure" ]]; then
            echo "skipping azure kernel testing"
            continue
          fi
          for DRIVER_BRANCH in $driver_branch; do
            # Sourced (not executed) so the variables read below are visible
            # here; presumably the script sets should_continue and
            # KERNEL_VERSION -- verify against findkernelversion.sh.
            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST"
            if [[ "$should_continue" == true ]]; then
              echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
              # One qualifying driver branch is enough to test this flavor.
              break
            fi
          done
          if [[ "$should_continue" == false ]]; then
            echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          else
            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
            kernel_versions+=("$KERNEL_VERSION")
            echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          fi
        done

        # Convert array to JSON format and assign.
        # An empty array needs its own branch: printf with no arguments still
        # prints a single empty line, which jq would serialise as [""]
        # instead of [].
        if [ ${#kernel_versions[@]} -eq 0 ]; then
          echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
        else
          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
        fi
        echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
159+
160+
e2e-tests-nvidiadriver:
  # Runs the precompiled-driver e2e validation once per kernel version
  # selected by determine-e2e-test-matrix: provision an instance, upgrade its
  # kernel, wait for it to come back, then run the driver test case for every
  # driver branch.
  runs-on: ubuntu-latest
  needs:
    - determine-e2e-test-matrix
    - set-driver-version-matrix
  if: ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
  strategy:
    matrix:
      kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Set up Holodeck
      # NOTE(review): the action reference below was mangled by e-mail
      # obfuscation in this capture; restore the real `holodeck@<tag>` value.
      uses: NVIDIA/[email protected]
      env:
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
        holodeck_config: "tests/holodeck.yaml"

    - name: Get public dns name
      id: get_public_dns_name
      uses: mikefarah/yq@master
      with:
        cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
    - name: Set and Calculate test vars
      # Exports the connection details and version variables that the
      # following steps read from the environment, and writes the SSH key.
      run: |
        echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
        echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
        echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
        echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
        echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
        KERNEL_VERSION="${{ matrix.kernel_version }}"
        echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV

    - name: Upgrade the kernel for Precompiled e2e test
      env:
        UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
      run: |
        status=0
        ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
        # On the target system, all scripts/test-case exit with code 1 for error handling.
        # However, since reboot-related disconnections break the SSH connection
        # and can cause the entire job to exit, we should ignore all errors except
        # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
        # other errors as code 1 will ensure proper management of reboot scenarios
        if [ $status -eq 1 ]; then
          echo "Kernel version $KERNEL_VERSION upgrade failed"
          exit 1
        fi
        # Reset before the connectivity check. A non-1 status tolerated above
        # would otherwise survive a successful remote_retry.sh and be reported
        # below as a connection failure.
        status=0
        ./tests/scripts/remote_retry.sh || status=$?
        if [ $status -ne 0 ]; then
          echo "Failed to connect to remote instance"
          exit $status
        fi

    - name: Precompiled e2e test gpu driver validation
      env:
        TEST_CASE: "./tests/cases/nvidia-driver.sh"
        GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
      run: |
        rc=0
        # for precompiled driver we are setting driver branch as driver version
        driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
        driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
        for DRIVER_VERSION in $driver_versions; do
          echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
          status=0
          TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
          # add escape character for space
          TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
          ./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$?
          # NOTE(review): only exit code 1 is treated as a failure here, while
          # the same loop in ci.yaml uses `-ne 0`. Any other non-zero status
          # is ignored and the job can pass -- confirm this is intended.
          if [ $status -eq 1 ]; then
            echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
            rc=$status
          fi
        done
        # Logs are pulled even when a run failed; the saved rc decides the
        # step result.
        ./tests/scripts/pull.sh /tmp/logs logs
        exit $rc

    - name: Archive test logs
      if: ${{ failure() }}
      uses: actions/upload-artifact@v4
      with:
        # The kernel version keeps the name unique per matrix entry;
        # upload-artifact@v4 rejects a second upload under an existing name.
        name: nvidiadriver-Precompiled-e2e-test-logs-${{ matrix.kernel_version }}
        path: ./logs/
        retention-days: 15

tests/cases/nvidia-driver.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
#! /bin/bash
22
# This test case runs the operator installation / test case with the default options.
33

4+
if [[ $# -lt 1 ]]; then
5+
echo "Error: $0 must be called with driver options"
6+
exit 1
7+
fi
8+
9+
# export gpu-operator options
10+
export TEST_CASE_ARGS="$1"
11+
412
SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
513
source "${SCRIPTS_DIR}"/.definitions.sh
614

tests/ci-remote-exec.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
#
# CI entry point that hands a script and its arguments to
# tests/remote-exec-local.sh.
#
# Usage: ci-remote-exec.sh REMOTE_EXEC [ARGS...]
#   REMOTE_EXEC  script to run, forwarded unchanged
#   ARGS         optional arguments forwarded after it
#
# Must be started from the repository root: the helper is located relative
# to the current working directory.

set -xe

if [[ $# -lt 1 ]]; then
  echo "Error:$0 must be called with 1(REMOTE_EXEC) or more than 1 args (REMOTE_EXEC, ARGS1 ARGS2 etc)"
  exit 1
fi

TEST_DIR="$(pwd)/tests"

# Quoted so that a checkout path containing whitespace is not word-split.
"${TEST_DIR}/remote-exec-local.sh" "$@"

tests/ci-run-e2e.sh

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,10 @@
33
set -xe
44

55
if [[ $# -ne 2 ]]; then
6-
echo "TEST_CASE TARGET_DRIVER_VERSION are required"
6+
echo "TEST_CASE TEST_CASE_ARGS are required"
77
exit 1
88
fi
99

10-
export TEST_CASE=${1}
11-
export TARGET_DRIVER_VERSION=${2}
12-
13-
1410
TEST_DIR="$(pwd)/tests"
1511

16-
${TEST_DIR}/local.sh
12+
${TEST_DIR}/local.sh "$@"

tests/local.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,4 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
2323
# are forwarded to the remote shell.
2424
remote \
2525
PROJECT="${PROJECT}" \
26-
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
27-
${TEST_CASE}
26+
"$@"

tests/remote-exec-local.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#! /bin/bash
#
# Pushes the project to the remote instance and runs the given script there.
#
# Usage: remote-exec-local.sh REMOTE_EXEC [ARGS...]
#   REMOTE_EXEC  script to run; may also be supplied through the environment
#                when no argument is given
#   ARGS         optional arguments forwarded to the remote command
#
# Exit status: 1 when no script was named, otherwise that of the remote call.

if [[ $# -ge 1 ]]; then
    REMOTE_EXEC=${1}
fi

# Fail explicitly: without `set -e` a bare `test -n` would be evaluated and
# its result thrown away.
if [[ -z "${REMOTE_EXEC}" ]]; then
    echo "Error: $0 requires the script to execute (REMOTE_EXEC)" >&2
    exit 1
fi

export PROJECT="gpu-driver-container"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"
source "${SCRIPT_DIR}/.local.sh"

# The existence check runs only after the definitions are loaded, since
# PROJECT_DIR is not set in this script before that point.
# NOTE(review): assumes .definitions.sh provides PROJECT_DIR as the project
# root -- confirm. Kept as a warning so an unexpected value cannot block a run.
if [[ -n "${PROJECT_DIR}" && ! -f "${PROJECT_DIR}/${REMOTE_EXEC}" ]]; then
    echo "Warning: ${PROJECT_DIR}/${REMOTE_EXEC} not found locally" >&2
fi

# Sync the project folder to the remote
"${SCRIPT_DIR}/push.sh"

# We trigger the specified script on the remote instance.
remote \
    PROJECT="${PROJECT}" \
    "$@"

tests/scripts/.definitions.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
1414

1515
: ${TEST_NAMESPACE:="test-operator"}
1616

17-
: ${PRIVATE_REGISTRY:="ghcr.io"}
18-
1917
: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
2018

21-
: ${TARGET_DRIVER_VERSION:="550.90.07"}
22-
2319
: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
2420
: ${POD_STATUS_TIME_OUT:="2m"}
2521

2622
: ${LOG_DIR:="/tmp/logs"}
23+
24+
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}
25+
26+
: ${BASE_TARGET:="jammy"}

tests/scripts/.local.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,7 @@
33
# Invoke remote.sh with a command line of the form "cd ${PROJECT} && <args>",
# i.e. the given arguments are run after changing into the project directory.
# NOTE(review): $@ sits outside the surrounding double quotes, so the
# arguments are word-split before they reach remote.sh -- confirm that
# remote.sh (not visible here) expects them this way.
function remote() {
44
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
55
}
6+
7+
# Thin wrapper around scripts/remote_retry.sh; takes no arguments and
# returns that script's exit status.
remote_retry() {
  ${SCRIPT_DIR}/remote_retry.sh
}

tests/scripts/.rsync-excludes

Lines changed: 0 additions & 4 deletions
This file was deleted.

0 commit comments

Comments
 (0)