20
20
- cron : ' 00 09 * * *' # scheduled job
21
21
22
22
jobs :
23
- pre-compiled :
23
# Publishes the list of driver branches and kernel flavors as compact JSON
# arrays (job outputs `driver_branch` and `kernel_flavors`) so downstream jobs
# can expand them with fromJson() instead of repeating the lists.
set-driver-version-matrix:
  runs-on: ubuntu-latest
  outputs:
    driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
    kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Read driver versions
      id: extract_driver_branch
      run: |
        # get driver-branch
        # `jq -R .` turns each input line into a JSON string; `jq -cs .`
        # slurps those strings into one compact array, e.g. ["535","550"].
        DRIVER_BRANCH=("535" "550")
        driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .)
        # Quote the redirection target so the path is never word-split.
        echo "driver_branch=$driver_branch_json" >> "$GITHUB_OUTPUT"

        # get kernel flavors
        KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
        kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
        echo "kernel_flavors=$kernel_flavors_json" >> "$GITHUB_OUTPUT"
43
+
44
+ precompiled-image :
45
+ needs : set-driver-version-matrix
24
46
runs-on : ubuntu-latest
25
47
strategy :
26
48
matrix :
27
- driver :
28
- - 535
29
- - 550
30
- flavor :
31
- - aws
32
- - azure
33
- - generic
34
- - nvidia
35
- - oracle
49
+ driver-branch : ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
50
+ flavor : ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
36
51
steps :
37
52
- uses : actions/checkout@v4
38
53
name : Check out code
@@ -64,10 +79,10 @@ jobs:
64
79
VERSION : ${COMMIT_SHORT_SHA}
65
80
BASE_TARGET : jammy
66
81
run : |
67
- make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
82
+ make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
68
83
69
84
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
70
- docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
85
+ docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }}
71
86
# try 3 times every 10 seconds to get the file, if success exit the loop
72
87
for i in {1..3}; do
73
88
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
@@ -81,4 +96,155 @@ jobs:
81
96
DIST : signed_ubuntu22.04
82
97
run : |
83
98
source kernel_version.txt && \
84
- make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
99
+ make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION}
100
+
101
# Builds the kernel-version matrix for the e2e job.
# Outputs:
#   matrix_values_not_empty - "1" once at least one flavor/branch lookup set
#                             should_continue=true, otherwise "0".
#   matrix_values           - compact JSON array of kernel versions to test.
determine-e2e-test-matrix:
  runs-on: ubuntu-latest
  needs:
    - precompiled-image
    - set-driver-version-matrix
  outputs:
    matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
    matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set kernel version
      id: set_kernel_version
      env:
        # No leading whitespace inside the quotes: both values are passed
        # verbatim (quoted) to findkernelversion.sh below.
        BASE_TARGET: "jammy"
        DIST: "ubuntu22.04"
      run: |
        # Default; a later write of the same key overrides this one.
        echo "matrix_values_not_empty=0" >> "$GITHUB_OUTPUT"

        kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
        kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]')
        driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
        driver_branch=$(echo "$driver_branch_json" | jq -r '.[]')

        kernel_versions=()
        for kernel_flavor in $kernel_flavors; do
          # FIXME -- remove if condition, once azure kernel upgrade starts working
          if [[ "$kernel_flavor" == "azure" ]]; then
            echo "skipping azure kernel testing"
            continue
          fi
          for DRIVER_BRANCH in $driver_branch; do
            # NOTE(review): the sourced script is expected to set
            # should_continue and KERNEL_VERSION in this shell -- confirm
            # against tests/scripts/findkernelversion.sh.
            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST"
            if [[ "$should_continue" == true ]]; then
              echo "matrix_values_not_empty=1" >> "$GITHUB_OUTPUT"
              break
            fi
          done
          if [[ "$should_continue" == false ]]; then
            echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          else
            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
            kernel_versions+=("$KERNEL_VERSION")
            echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          fi
        done

        # Convert array to JSON format and assign.
        # printf '%s\n' with zero arguments still emits one empty line, which
        # jq would turn into [""]; keep the "[]" seed when nothing was found.
        matrix_file="$GITHUB_WORKSPACE/matrix_values.json"
        echo "[]" > "$matrix_file"
        if (( ${#kernel_versions[@]} > 0 )); then
          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > "$matrix_file"
        fi
        echo "matrix_values=$(jq -c . "$matrix_file")" >> "$GITHUB_OUTPUT"
159
+
160
+ e2e-tests-nvidiadriver :
161
+ runs-on : ubuntu-latest
162
+ needs :
163
+ - determine-e2e-test-matrix
164
+ - set-driver-version-matrix
165
+ if : ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
166
+ strategy :
167
+ matrix :
168
+ kernel_version : ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
169
+ steps :
170
# Fetch the repository so the ./tests scripts used by later steps exist.
- uses: actions/checkout@v4
  name: Check out code
172
+ - name : Set up Holodeck
173
+
174
+ env :
175
+ AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
176
+ AWS_ACCESS_KEY_ID : ${{ secrets.AWS_ACCESS_KEY_ID }}
177
+ AWS_SSH_KEY : ${{ secrets.AWS_SSH_KEY }}
178
+ with :
179
+ aws_access_key_id : ${{ secrets.AWS_ACCESS_KEY_ID }}
180
+ aws_secret_access_key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
181
+ aws_ssh_key : ${{ secrets.AWS_SSH_KEY }}
182
+ holodeck_config : " tests/holodeck.yaml"
183
+
184
# Reads the value of the "public-dns-name" property from the cached
# holodeck.yaml; the `result` output of this step is consumed by the next one.
- name: Get public dns name
  id: get_public_dns_name
  # Pinned to a release tag instead of the mutable `master` branch so the
  # job cannot change behavior without a change in this file. Bump
  # deliberately (or pin to a full commit SHA).
  uses: mikefarah/yq@v4.44.3
  with:
    cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
189
# Exports the variables used by the following steps through $GITHUB_ENV
# (instance_hostname, private_key, COMMIT_SHORT_SHA, PRIVATE_REGISTRY,
# KERNEL_VERSION) and writes the SSH private key to key.pem in the workspace.
- name: Set and Calculate test vars
  env:
    # Expression values are handed over as environment variables rather than
    # pasted into the script text, so their content is never parsed as shell.
    PUBLIC_DNS_NAME: ${{ steps.get_public_dns_name.outputs.result }}
    AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
    KERNEL_VERSION: ${{ matrix.kernel_version }}
  run: |
    key_file="${{ github.workspace }}/key.pem"
    echo "instance_hostname=ubuntu@${PUBLIC_DNS_NAME}" >> "$GITHUB_ENV"
    echo "private_key=${key_file}" >> "$GITHUB_ENV"
    # Create the key file owner-only from the start (no window with default
    # permissions), then drop it to read-only as before.
    (umask 077 && printf '%s\n' "$AWS_SSH_KEY" > "$key_file")
    chmod 400 "$key_file"
    echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"
    echo "PRIVATE_REGISTRY=ghcr.io" >> "$GITHUB_ENV"
    echo "KERNEL_VERSION=$KERNEL_VERSION" >> "$GITHUB_ENV"
198
+
199
# Runs the kernel upgrade script on the remote instance via
# ci-remote-exec.sh, then runs remote_retry.sh and fails the step if that
# retry script reports an error.
- name: Upgrade the kernel for Precompiled e2e test
  env:
    UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
  run: |
    status=0
    ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
    # On the target system, all scripts/test-case exit with code 1 for error handling.
    # However, since reboot-related disconnections break the SSH connection
    # and can cause the entire job to exit, we should ignore all errors except
    # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
    # other errors as code 1 will ensure proper management of reboot scenarios
    if [ "$status" -eq 1 ]; then
      echo "Kernel version $KERNEL_VERSION upgrade failed"
      exit 1
    fi
    # Reset before the retry: a tolerated non-1 code left over from the
    # upgrade call must not be reported as a connection failure when
    # remote_retry.sh itself succeeds.
    status=0
    ./tests/scripts/remote_retry.sh || status=$?
    if [ "$status" -ne 0 ]; then
      echo "Failed to connect to remote instance"
      exit "$status"
    fi
219
+
220
# Runs the nvidia-driver e2e test case once per driver branch, pulls the
# remote logs, and exits non-zero if any run failed.
- name: Precompiled e2e test gpu driver validation
  env:
    # No leading whitespace inside the quotes: TEST_CASE is passed as a quoted
    # path and GPU_OPERATOR_OPTIONS is shell-escaped character by character.
    TEST_CASE: "./tests/cases/nvidia-driver.sh"
    GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
  run: |
    rc=0
    # for precompiled driver we are setting driver branch as driver version
    driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
    driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
    for DRIVER_VERSION in $driver_versions; do
      echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
      status=0
      TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
      # add escape character for space
      TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
      ./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$?
      # Any non-zero exit is a failed validation. Checking only for 1 would
      # let a missing script (126/127) or a dropped connection pass as green.
      if [ "$status" -ne 0 ]; then
        echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
        rc=$status
      fi
    done
    # Logs are pulled after all versions ran so the archive step can upload
    # them when rc is non-zero.
    ./tests/scripts/pull.sh /tmp/logs logs
    exit "$rc"
243
+
244
# Uploads ./logs/ as a workflow artifact, only when an earlier step failed.
- name: Archive test logs
  if: ${{ failure() }}
  uses: actions/upload-artifact@v4
  with:
    # upload-artifact@v4 rejects a second artifact with the same name in one
    # run, so each matrix leg gets its own name.
    name: nvidiadriver-Precompiled-e2e-test-logs-${{ matrix.kernel_version }}
    path: ./logs/
    retention-days: 15
0 commit comments