From a65890f570df16999a185e068f59fcbb33c29788 Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Thu, 22 Aug 2024 13:14:06 +0530 Subject: [PATCH] Pre end-to-end gpu driver validation Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 68 +++++++++++++++++++--- .github/workflows/image.yaml | 10 ++-- tests/cases/nvidia-kernel-upgrade.sh | 8 +++ tests/ci-run-e2e.sh | 6 +- tests/local.sh | 5 ++ tests/scripts/.definitions.sh | 6 ++ tests/scripts/.local.sh | 4 ++ tests/scripts/end-to-end-nvidia-driver.sh | 2 +- tests/scripts/findkernelversion.sh | 18 ++++++ tests/scripts/install-operator.sh | 8 +++ tests/scripts/nvidia-kernel-upgrade-aws.sh | 43 ++++++++++++++ tests/scripts/prerequisites.sh | 3 + tests/scripts/remote_retry.sh | 27 +++++++++ vgpu/src/go.mod | 7 +-- vgpu/src/go.sum | 1 + 15 files changed, 196 insertions(+), 20 deletions(-) create mode 100755 tests/cases/nvidia-kernel-upgrade.sh create mode 100755 tests/scripts/findkernelversion.sh create mode 100755 tests/scripts/nvidia-kernel-upgrade-aws.sh create mode 100755 tests/scripts/remote_retry.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e44be0ce..a296bda0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,7 +20,21 @@ on: types: - completed branches: - - main + - e2etestdriver + + pull_request: + types: + - opened + - synchronize + branches: + # - main + # - release-* + - e2etestdriver + push: + branches: + # - main + # - release-* + - e2etestdriver jobs: e2e-tests-nvidiadriver: @@ -52,12 +66,43 @@ jobs: run: | echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + # TODO(SHIVA): temporary pin for branch testing - restore the GITHUB_SHA-based value below before merging + # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + echo "COMMIT_SHORT_SHA=5ba28fea" >> $GITHUB_ENV DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}') echo
"DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV - - name: Validate gpu driver + # - name: Validate gpu driver + # env: + # TEST_CASE: "./tests/cases/nvidia-driver.sh" + # USE_PRECOMPILED: "0" + # run: | + # sudo chmod 644 ${{ github.workspace }}/.cache/key + # echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} + # rc=0 + # for driver_version in ${DRIVER_VERSIONS}; do + # echo "Running e2e for DRIVER_VERSION=$driver_version" + # ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${USE_PRECOMPILED} || status=$? + # if [ $status -ne 0 ]; then + # echo "e2e validation failed for driver version $driver_version with status $status" + # rc=$status + # fi + # done + # source ./tests/scripts/.definitions.sh + # ./tests/scripts/pull.sh ${LOG_DIR} logs + # exit $rc + + # - name: Archive test logs + # if: ${{ failure() }} + # uses: actions/upload-artifact@v4 + # with: + # name: nvidiadriver-e2e-test-logs + # path: ./logs/ + # retention-days: 15 + + - name: Precompiled e2e test- upgrade kernel and Validate gpu driver env: + TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh" TEST_CASE: "./tests/cases/nvidia-driver.sh" run: | sudo chmod 644 ${{ github.workspace }}/.cache/key @@ -65,13 +110,24 @@ jobs: rc=0 for driver_version in ${DRIVER_VERSIONS}; do echo "Running e2e for DRIVER_VERSION=$driver_version" - ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$? + export USE_PRECOMPILED="0" + status=0; ./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${USE_PRECOMPILED} || status=$? if [ $status -ne 0 ]; then - echo "e2e validation failed for driver version $driver_version with status $status" + echo "Kernel upgrade failed" rc=$status + else + # The kernel upgrade rebooted the node; USE_PRECOMPILED=1 makes tests/local.sh wait for SSH before running the driver test + export USE_PRECOMPILED="1" + DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.'
-f 1) + DRIVER_VERSION="${DRIVER_BRANCH}" + ./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${USE_PRECOMPILED} || status=$? + if [ $status -ne 0 ]; then + echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" + rc=$status + fi fi done source ./tests/scripts/.definitions.sh ./tests/scripts/pull.sh ${LOG_DIR} logs exit $rc @@ -79,6 +135,6 @@ jobs: if: ${{ failure() }} uses: actions/upload-artifact@v4 with: - name: nvidiadriver-e2e-test-logs + name: nvidiadriver-Precompiled-e2e-test-logs path: ./logs/ retention-days: 15 diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index c167a324..5902b19e 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -21,12 +21,14 @@ on: - opened - synchronize branches: - - main - - release-* + # - main + # - release-* + - e2etestdriver_no push: branches: - - main - - release-* + # - main + # - release-* + - e2etestdriver_no jobs: image: diff --git a/tests/cases/nvidia-kernel-upgrade.sh b/tests/cases/nvidia-kernel-upgrade.sh new file mode 100755 index 00000000..82e43572 --- /dev/null +++ b/tests/cases/nvidia-kernel-upgrade.sh @@ -0,0 +1,8 @@ +#! /bin/bash +# This test case upgrades the node kernel to the version required by the precompiled driver image and reboots the node.
+ +SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )" +source "${SCRIPTS_DIR}"/.definitions.sh + +# Upgrade the node kernel and reboot +"${SCRIPTS_DIR}"/nvidia-kernel-upgrade-aws.sh diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh index 621a7a8e..c9543b0b 100755 --- a/tests/ci-run-e2e.sh +++ b/tests/ci-run-e2e.sh @@ -2,14 +2,14 @@ set -xe -if [[ $# -ne 2 ]]; then - echo "TEST_CASE TARGET_DRIVER_VERSION are required" +if [[ $# -ne 3 ]]; then + echo "TEST_CASE TARGET_DRIVER_VERSION USE_PRECOMPILED are required" exit 1 fi export TEST_CASE=${1} export TARGET_DRIVER_VERSION=${2} - +export USE_PRECOMPILED=${3} TEST_DIR="$(pwd)/tests" diff --git a/tests/local.sh b/tests/local.sh index 86918588..640c85ec 100755 --- a/tests/local.sh +++ b/tests/local.sh @@ -12,6 +12,10 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )" source ${SCRIPT_DIR}/.definitions.sh source ${SCRIPT_DIR}/.local.sh +if [ "${USE_PRECOMPILED}" == "1" ]; then + remote_retry +fi + # Sync the project folder to the remote ${SCRIPT_DIR}/push.sh @@ -24,4 +28,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
remote \ PROJECT="${PROJECT}" \ TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \ + USE_PRECOMPILED="${USE_PRECOMPILED}" \ ${TEST_CASE} diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index f254bc00..18d9670d 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -24,3 +24,9 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${POD_STATUS_TIME_OUT:="2m"} : ${LOG_DIR:="/tmp/logs"} + +: ${USE_PRECOMPILED:="0"} +: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"} + +: ${BASE_TARGET:="jammy"} + diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh index 7971a404..f3d98b2f 100644 --- a/tests/scripts/.local.sh +++ b/tests/scripts/.local.sh @@ -3,3 +3,7 @@ function remote() { ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@"" } + +function remote_retry() { + ${SCRIPT_DIR}/remote_retry.sh +} diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh index d272efab..ab2db9a1 100755 --- a/tests/scripts/end-to-end-nvidia-driver.sh +++ b/tests/scripts/end-to-end-nvidia-driver.sh @@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh "${SCRIPT_DIR}"/verify-operator.sh -echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------" +echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------" ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator" diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh new file mode 100755 index 00000000..4ae834b7 --- /dev/null +++ b/tests/scripts/findkernelversion.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ "${SKIP_INSTALL}" == "true" ]]; then + echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}"/.definitions.sh + +export REGCTL_VERSION=v0.4.7 +mkdir -p bin +curl -sSLo bin/regctl 
https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64 +chmod a+x bin/regctl +export PATH=$(pwd)/bin:${PATH} +DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' -f 1) +KERNEL_FLAVOR=$(uname -r | awk -F'-' '{print $3}') +regctl image get-file ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH} /var/kernel_version.txt ${LOG_DIR}/kernel_version.txt || true diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index 3acfcffb..02db4d80 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -5,9 +5,17 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then exit 0 fi +echo "Checking current kernel version..." +CURRENT_KERNEL=$(uname -r) +echo "Current kernel version: $CURRENT_KERNEL" + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh +if [ "${USE_PRECOMPILED}" == "1" ]; then + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.usePrecompiled=true" +fi + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}" # add helm driver repo diff --git a/tests/scripts/nvidia-kernel-upgrade-aws.sh b/tests/scripts/nvidia-kernel-upgrade-aws.sh new file mode 100755 index 00000000..bab146f1 --- /dev/null +++ b/tests/scripts/nvidia-kernel-upgrade-aws.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [[ "${SKIP_INSTALL}" == "true" ]]; then + echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}"/.definitions.sh + +# finding kernel version +${SCRIPT_DIR}/findkernelversion.sh +source "${LOG_DIR}"/kernel_version.txt + +echo "Checking current kernel version..." 
+CURRENT_KERNEL=$(uname -r) +echo "Current kernel version: $CURRENT_KERNEL" + +echo "" +echo "" +echo "--------------Starting the Precompiled kernel version ${KERNEL_VERSION} upgrade--------------" + +sudo apt-get update -y +sudo apt-get install --allow-downgrades linux-image-${KERNEL_VERSION} -y +if [ $? -ne 0 ]; then + echo "Kernel upgrade failed." + exit 1 +fi + +echo "update grub ..." +sudo sed -i "s/^GRUB_DEFAULT=.*/GRUB_DEFAULT=\"Advanced options for Ubuntu>Ubuntu, with Linux ${KERNEL_VERSION}\"/" /etc/default/grub +sudo cat /etc/default/grub +sudo update-grub +sudo grub-reboot "Advanced options for Ubuntu>Ubuntu, with Linux ${KERNEL_VERSION}" + +echo "Rebooting ..." +# Run the reboot command with nohup to avoid abrupt SSH closure issues +nohup sudo reboot & + +echo "--------------Installation of kernel completed --------------" + +# Exit with a success code since the reboot command was issued successfully +exit 0 diff --git a/tests/scripts/prerequisites.sh b/tests/scripts/prerequisites.sh index ee985e55..a5ccbdd3 100755 --- a/tests/scripts/prerequisites.sh +++ b/tests/scripts/prerequisites.sh @@ -8,6 +8,9 @@ fi SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source "${SCRIPT_DIR}"/.definitions.sh +echo "Create log dir ${LOG_DIR}" +mkdir -p "${LOG_DIR}" + export DEBIAN_FRONTEND=noninteractive echo "Load kernel modules i2c_core and ipmi_msghandler" diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh new file mode 100755 index 00000000..7a7b073e --- /dev/null +++ b/tests/scripts/remote_retry.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh +source ${SCRIPT_DIR}/.local.sh + +try_ssh_connection() { + ssh -o ConnectTimeout=10 -i "${private_key}" "${instance_hostname}" "exit" + return $? +} + +echo "Waiting for aws system to come back online..."
+START_TIME=$(date +%s) +while true; do + sleep 60 # wait before each attempt: the instance was just rebooted and needs time to come back + try_ssh_connection + if [ $? -eq 0 ]; then + echo "Successfully connected to aws system after reboot." + break; + fi + ELAPSED_TIME=$(($(date +%s) - START_TIME)) + if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then + echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot." + exit 1 + fi + echo "ssh retry again..." +done diff --git a/vgpu/src/go.mod b/vgpu/src/go.mod index 257a001b..0ba67fb3 100644 --- a/vgpu/src/go.mod +++ b/vgpu/src/go.mod @@ -8,9 +8,4 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -require ( - github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect - github.com/russross/blackfriday/v2 v2.1.0 // indirect - github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect - golang.org/x/sys v0.8.0 // indirect -) +require golang.org/x/sys v0.8.0 // indirect diff --git a/vgpu/src/go.sum b/vgpu/src/go.sum index d29f2058..cf85e170 100644 --- a/vgpu/src/go.sum +++ b/vgpu/src/go.sum @@ -1,3 +1,4 @@ +github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=