Skip to content

Commit cf2d998

Browse files
committed
end-to-end gpu driver validation
1 parent a019667 commit cf2d998

16 files changed

+325
-0
lines changed

.github/workflows/ci.yaml

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright 2024 NVIDIA CORPORATION
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: CI

# Runs after the "image" workflow finishes on main.
on:
  workflow_run:
    workflows: [image]
    types:
      - completed
    branches:
      - main

jobs:
  e2e-tests-nvidiadriver:
    # `workflow_run` fires for every completed run, including failed ones.
    # Only provision the GPU instance when the image workflow succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        driver:
          - 535.183.06
          - 550.90.07

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Holodeck
        uses: NVIDIA/holodeck@main
        env:
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
          AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
          holodeck_config: "tests/holodeck.yaml"

      # Reads the instance's public DNS name from the Holodeck cache file.
      - name: Get public dns name
        id: get_public_dns_name
        uses: mikefarah/yq@master
        with:
          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

      # NOTE(review): for workflow_run events GITHUB_SHA is presumably the tip
      # of the default branch rather than the commit that triggered the image
      # build (github.event.workflow_run.head_sha) - confirm the tag matches.
      - name: Set and Calculate test vars
        run: |
          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> "$GITHUB_ENV"
          echo "private_key=${{ github.workspace }}/key.pem" >> "$GITHUB_ENV"
          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"

      - name: Validate gpu driver
        env:
          TEST_CASE: "./tests/cases/nvidia-driver.sh"
        run: |
          sudo chmod 644 "${{ github.workspace }}/.cache/key"
          echo "${{ secrets.AWS_SSH_KEY }}" > "${private_key}" && chmod 400 "${private_key}"
          ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${{ matrix.driver }}"

tests/cases/nvidia-driver.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#! /bin/bash
# Test case: load the shared definitions and run the NVIDIA driver
# end-to-end script with whatever options are already set in the environment.

# Locate tests/scripts relative to this file so the case can be started from
# any working directory.
case_dir="$( dirname "${BASH_SOURCE[0]}" )"
SCRIPTS_DIR="$( cd "${case_dir}/../scripts" && pwd )"

source "${SCRIPTS_DIR}/.definitions.sh"

# Hand over to the end-to-end driver script.
"${SCRIPTS_DIR}/end-to-end-nvidia-driver.sh"

tests/ci-run-e2e.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#
# CI entry point for the end-to-end tests.
#
# Usage: ./tests/ci-run-e2e.sh TEST_CASE TARGET_DRIVER_VERSION
#
# Exports both arguments and then runs tests/local.sh. The tests directory is
# resolved from the current working directory, so this must be started from
# the directory that contains tests/.

set -xe

if [[ $# -ne 2 ]]; then
  # Diagnostics go to stderr so they do not mix with regular output.
  echo "TEST_CASE TARGET_DRIVER_VERSION are required" >&2
  exit 1
fi

export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"

TEST_DIR="$(pwd)/tests"

# Quoted so a working directory containing spaces still resolves to one path.
"${TEST_DIR}/local.sh"

tests/holodeck.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Holodeck environment used by the end-to-end tests.
# NOTE(review): HOLODECK_NAME and HOLODECK_PRIVATE_KEY look like placeholders
# that are substituted before use - confirm against the Holodeck action.
apiVersion: holodeck.nvidia.com/v1alpha1
kind: Environment
metadata:
  name: HOLODECK_NAME
  description: "end-to-end test infrastructure"
spec:
  provider: aws
  auth:
    keyName: cnt-ci
    privateKey: HOLODECK_PRIVATE_KEY
  instance:
    type: g4dn.xlarge
    region: us-west-1
    # NOTE(review): 0.0.0.0/0 allows ingress from any address - confirm this
    # is intended for the CI instance.
    ingressIpRanges:
      - 0.0.0.0/0
    image:
      architecture: amd64
      imageId: ami-0ce2cb35386fc22e9
  containerRuntime:
    install: true
    name: containerd
  kubernetes:
    install: true
    installer: kubeadm
    version: v1.28.5
    crictlVersion: v1.28.0

tests/local.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#! /bin/bash
#
# Pushes the project to the remote instance, installs the prerequisites there
# and runs one test case remotely.
#
# Usage: tests/local.sh [TEST_CASE]
#
# TEST_CASE is a path relative to the project root. It may also be supplied
# through the environment; a positional argument takes precedence.
# TARGET_DRIVER_VERSION and SKIP_PREREQUISITES are forwarded to the remote
# shell as they are.

if [[ $# -ge 1 ]]; then
  TEST_CASE=${1}
fi

export PROJECT="gpu-driver-container"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"
source "${SCRIPT_DIR}/.local.sh"

# Validate the test case only now: PROJECT_DIR is defined by .definitions.sh,
# so checking before sourcing it would look at "/${TEST_CASE}" and the result
# would be ignored anyway.
if [[ -z "${TEST_CASE}" ]]; then
  echo "TEST_CASE is required" >&2
  exit 1
fi
if [[ ! -f "${PROJECT_DIR}/${TEST_CASE}" ]]; then
  echo "test case not found: ${PROJECT_DIR}/${TEST_CASE}" >&2
  exit 1
fi

# Sync the project folder to the remote
"${SCRIPT_DIR}/push.sh"

# We trigger the installation of prerequisites on the remote instance
remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh

# We trigger the specified test case on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
remote \
  PROJECT="${PROJECT}" \
  TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
  "${TEST_CASE}"

tests/scripts/.definitions.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
# Shared definitions for the test scripts: directory layout plus defaults for
# variables the caller has not already set. Meant to be sourced.
set -e

# Turn on command tracing whenever DEBUG is non-empty.
if [[ -n "${DEBUG}" ]]; then
  set -x
fi

# Directory layout, derived from the location of this file.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
TEST_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )"
PROJECT_DIR="$( cd "${TEST_DIR}/.." && pwd )"
CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

# Defaults: a value that is already set and non-empty is kept.
HELM="${HELM:-helm}"
PROJECT="${PROJECT:-$(basename "${PROJECT_DIR}")}"

TEST_NAMESPACE="${TEST_NAMESPACE:-test-operator}"

PRIVATE_REGISTRY="${PRIVATE_REGISTRY:-ghcr.io}"

HELM_NVIDIA_REPO="${HELM_NVIDIA_REPO:-https://helm.ngc.nvidia.com/nvidia}"

TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION:-550.90.07}"

tests/scripts/.local.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env bash

#######################################
# Run a command line from inside the project directory by handing it to
# remote.sh (not shown here; presumably it executes the string on the test
# instance - confirm).
# Globals:   SCRIPT_DIR (read), PROJECT (read)
# Arguments: the command to run, optionally preceded by VAR=value assignments
#######################################
function remote() {
  # "$*" joins all arguments into the single command string. The previous
  # nested quoting left $@ unquoted, so arguments were word-split and
  # glob-expanded locally before being handed over.
  "${SCRIPT_DIR}/remote.sh" "cd ${PROJECT} && $*"
}

tests/scripts/.rsync-excludes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
vendor/
2+
.git
3+
cnt-ci
4+
key.pem

tests/scripts/checks.sh

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/bin/bash

#######################################
# Poll until the pod selected by the label app=<label> in ${TEST_NAMESPACE}
# reports the Ready condition as "True" and is not being deleted.
# Exits the shell with status 1 once the timeout has been exceeded.
# Globals:   TEST_NAMESPACE (read)
# Arguments: $1 - value of the "app" label to select
#            $2 - optional timeout in seconds (default: 2700, i.e. 45 minutes)
# Outputs:   progress messages and kubectl listings on stdout
#######################################
check_pod_ready() {
  local pod_label=$1
  local timeout_seconds=${2:-$((60 * 45))}
  local poll_seconds=5
  local current_time=0
  # Declared local so the probe results do not leak into the caller's scope.
  local is_pod_ready
  local is_pod_terminating

  while :; do
    echo "Checking $pod_label pod"
    kubectl get pods -lapp="${pod_label}" -n "${TEST_NAMESPACE}"

    echo "Checking $pod_label pod readiness"
    # One status line is printed per matching pod; a failing query is mapped
    # to the sentinel "terminated" so that it never compares equal to "True".
    is_pod_ready=$(kubectl get pods -lapp="${pod_label}" -n "${TEST_NAMESPACE}" -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")

    if [ "${is_pod_ready}" = "True" ]; then
      # Check if the pod is not in terminating state: any non-empty
      # deletionGracePeriodSeconds (or a failed query) counts as terminating.
      is_pod_terminating=$(kubectl get pods -lapp="${pod_label}" -n "${TEST_NAMESPACE}" -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
      if [ "${is_pod_terminating}" != "" ]; then
        echo "pod $pod_label is in terminating state..."
      else
        echo "Pod $pod_label is ready"
        break
      fi
    fi

    if [[ "${current_time}" -gt "${timeout_seconds}" ]]; then
      echo "timeout reached"
      exit 1
    fi

    # Echo useful information on stdout
    kubectl get pods -n "${TEST_NAMESPACE}"

    echo "Sleeping ${poll_seconds} seconds"
    current_time=$((current_time + poll_seconds))
    sleep "${poll_seconds}"
  done
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash
#
# Runs one end-to-end cycle: install the GPU Operator, then verify it.
# Both steps are delegated to sibling scripts in this directory.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

echo ""
echo ""
echo "--------------Installing the GPU Operator--------------"

# Install the operator.
# NOTE(review): the options used (e.g. precompiled driver mode) are chosen
# inside install-operator.sh, which is not shown here - confirm there.
"${SCRIPT_DIR}"/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh
echo "--------------Verification completed for GPU Operator--------------"

0 commit comments

Comments
 (0)