# Skip to content

# Pre end-to-end gpu driver validation #67

# Pre end-to-end gpu driver validation

# Pre end-to-end gpu driver validation #67

# Workflow file for this run

# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: End-to-end tests

# Triggers:
#   * workflow_run  - after the `image` workflow completes
#   * pull_request  - when a PR is opened or receives new commits
#   * push          - on direct pushes
# All three are currently limited to the e2etestdriver branch; the
# main / release-* entries are kept below, commented out.
on:
  workflow_run:
    workflows:
      - image
    types: [completed]
    branches: [e2etestdriver]
  pull_request:
    types: [opened, synchronize]
    branches:
      # - main
      # - release-*
      - e2etestdriver
  push:
    branches:
      # - main
      # - release-*
      - e2etestdriver
jobs:
  # Sets up Holodeck, reads the resulting host's public DNS name, then runs
  # the kernel-upgrade and driver e2e cases once per entry of DRIVER_VERSIONS
  # (parsed from versions.mk). Logs are pulled after the loop and uploaded as
  # an artifact when the job fails.
  e2e-tests-nvidiadriver:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      # NOTE(review): this action and mikefarah/yq below track moving branches
      # (@main / @master); consider pinning to a release tag or commit SHA.
      - name: Set up Holodeck
        uses: NVIDIA/holodeck@main
        env:
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
          holodeck_config: "tests/holodeck.yaml"

      # Extracts the value of the "public-dns-name" property from the
      # Holodeck cache file; exposed as steps.get_public_dns_name.outputs.result.
      - name: Get public dns name
        id: get_public_dns_name
        uses: mikefarah/yq@master
        with:
          cmd: >-
            yq '.status.properties[] | select(.name == "public-dns-name") | .value'
            /github/workspace/.cache/holodeck.yaml

      # Publishes the variables used by the later steps through GITHUB_ENV.
      - name: Set and Calculate test vars
        run: |
          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> "$GITHUB_ENV"
          echo "private_key=${{ github.workspace }}/key.pem" >> "$GITHUB_ENV"
          # Derived from the commit under test; a hard-coded debug SHA
          # (5ba28fea) previously overrode this computation.
          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"
          # Everything after "DRIVER_VERSIONS ?= " in versions.mk.
          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
          echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> "$GITHUB_ENV"

      # - name: Validate gpu driver
      #   env:
      #     TEST_CASE: "./tests/cases/nvidia-driver.sh"
      #     SSH_RETRY: "0"
      #   run: |
      #     sudo chmod 644 ${{ github.workspace }}/.cache/key
      #     echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
      #     rc=0
      #     for driver_version in ${DRIVER_VERSIONS}; do
      #       echo "Running e2e for DRIVER_VERSION=$driver_version"
      #       ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${SSH_RETRY} || status=$?
      #       if [ $status -ne 0 ]; then
      #         echo "e2e validation failed for driver version $driver_version with status $status"
      #         rc=$status
      #       fi
      #     done
      #     source ./tests/scripts/.definitions.sh
      #     ./tests/scripts/pull.sh ${LOG_DIR} logs
      #     exit $rc

      # - name: Archive test logs
      #   if: ${{ failure() }}
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: nvidiadriver-e2e-test-logs
      #     path: ./logs/
      #     retention-days: 15

      # For each driver version: run the kernel-upgrade case, and only if it
      # succeeds run the driver validation case against the
      # "<branch>-<kernel>-ubuntu22.04" version. The last non-zero exit status
      # seen becomes the step's exit code after logs have been pulled.
      - name: Precompiled e2e test- upgrade kernel and Validate gpu driver
        env:
          TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh"
          TEST_CASE: "./tests/cases/nvidia-driver.sh"
          # Passed through the environment instead of being expanded into the
          # script text, so the key's content is never parsed by the shell.
          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
        run: |
          sudo chmod 644 ${{ github.workspace }}/.cache/key
          printf '%s\n' "${AWS_SSH_KEY}" > "${private_key}" && chmod 400 "${private_key}"
          rc=0
          for driver_version in ${DRIVER_VERSIONS}; do
            echo "Running e2e for DRIVER_VERSION=$driver_version"
            export SSH_RETRY="0"
            # Reset every iteration: status is only assigned when a script
            # fails, so a stale value would otherwise leak into later versions.
            status=0
            ./tests/ci-run-e2e.sh "${TEST_CASE_KERNEL_UPGRADE}" "${driver_version}" "${SSH_RETRY}" || status=$?
            if [ "${status}" -ne 0 ]; then
              echo "Kernel upgrade failed"
              rc=$status
            else
              # system rebooted enable ssh retry
              export SSH_RETRY="1"
              # Driver branch = text before the first '.' of the version
              # being tested in this iteration.
              DRIVER_BRANCH="${driver_version%%.*}"
              # NOTE(review): SCRIPT_DIR and KERNEL_VERSION are presumably
              # provided by the two sourced files - confirm.
              source ./tests/scripts/.definitions.sh
              source "${SCRIPT_DIR}"/kernel_version.txt
              DRIVER_VERSION="${DRIVER_BRANCH}-${KERNEL_VERSION}-ubuntu22.04"
              ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${SSH_RETRY}" || status=$?
              if [ "${status}" -ne 0 ]; then
                echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
                rc=$status
              fi
            fi
          done
          # Sourced again so LOG_DIR is available even if the loop never
          # reached the success branch.
          source ./tests/scripts/.definitions.sh
          ./tests/scripts/pull.sh "${LOG_DIR}" logs
          exit $rc

      - name: Archive test logs
        if: ${{ failure() }}
        uses: actions/upload-artifact@v4
        with:
          name: nvidiadriver-Precompiled-e2e-test-logs
          path: ./logs/
          retention-days: 15