From 4ed43ab94dd87be44536e7eab72669c68c604ec5 Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Thu, 22 Aug 2024 16:03:04 +0530 Subject: [PATCH] Pre end-to-end gpu driver validation Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 23 ++++++++++++----------- tests/ci-run-e2e.sh | 4 ++-- tests/local.sh | 5 +++++ tests/scripts/.definitions.sh | 2 +- tests/scripts/install-operator.sh | 7 +++++++ tests/scripts/remote.sh | 4 ---- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f231fc43..3b2e76a3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -75,14 +75,14 @@ jobs: # - name: Validate gpu driver # env: # TEST_CASE: "./tests/cases/nvidia-driver.sh" - # SSH_RETRY: "0" + # USE_PRECOMPILED: "0" # run: | # sudo chmod 644 ${{ github.workspace }}/.cache/key # echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} # rc=0 # for driver_version in ${DRIVER_VERSIONS}; do # echo "Running e2e for DRIVER_VERSION=$driver_version" - # ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${SSH_RETRY} || status=$? + # ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${USE_PRECOMPILED} || status=$? # if [ $status -ne 0 ]; then # echo "e2e validation failed for driver version $driver_version with status $status" # rc=$status @@ -110,21 +110,22 @@ jobs: rc=0 for driver_version in ${DRIVER_VERSIONS}; do echo "Running e2e for DRIVER_VERSION=$driver_version" - export SSH_RETRY="1" - ./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${SSH_RETRY} || status=$? + export USE_PRECOMPILED="0" + ./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${USE_PRECOMPILED} || status=$? if [ $status -ne 0 ]; then echo "Kernel upgrade failed" rc=$status else # system rebooted enable ssh retry - export SSH_RETRY="0" + export USE_PRECOMPILED="1" DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' 
-f 1) - source ./tests/scripts/.definitions.sh - mkdir -p logs - ./tests/scripts/pull.sh ${LOG_DIR}/kernel_version.txt logs/kernel_version.txt - source logs/kernel_version.txt - DRIVER_VERSION="${DRIVER_BRANCH}-${KERNEL_VERSION}-ubuntu22.04" - ./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${SSH_RETRY} || status=$? + # TODO(shiva): kernel version lookup moved to tests/scripts/install-operator.sh (USE_PRECOMPILED=1); remove the commented-out lines below + # source ./tests/scripts/.definitions.sh + # mkdir -p logs + # ./tests/scripts/pull.sh ${LOG_DIR}/kernel_version.txt logs/kernel_version.txt + # source logs/kernel_version.txt + DRIVER_VERSION="${DRIVER_BRANCH}" + ./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${USE_PRECOMPILED} || status=$? if [ $status -ne 0 ]; then echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" rc=$status diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh index 5b6d0a80..c9543b0b 100755 --- a/tests/ci-run-e2e.sh +++ b/tests/ci-run-e2e.sh @@ -3,13 +3,13 @@ set -xe if [[ $# -ne 3 ]]; then - echo "TEST_CASE TARGET_DRIVER_VERSION SSH_RETRY are required" + echo "TEST_CASE TARGET_DRIVER_VERSION USE_PRECOMPILED are required" exit 1 fi export TEST_CASE=${1} export TARGET_DRIVER_VERSION=${2} -export SSH_RETRY=${3} +export USE_PRECOMPILED=${3} TEST_DIR="$(pwd)/tests" diff --git a/tests/local.sh b/tests/local.sh index 86918588..640c85ec 100755 --- a/tests/local.sh +++ b/tests/local.sh @@ -12,6 +12,10 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )" source ${SCRIPT_DIR}/.definitions.sh source ${SCRIPT_DIR}/.local.sh +if [ "${USE_PRECOMPILED}" == "1" ]; then + remote_retry +fi + # Sync the project folder to the remote ${SCRIPT_DIR}/push.sh @@ -24,4 +28,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites. 
remote \ PROJECT="${PROJECT}" \ TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \ + USE_PRECOMPILED="${USE_PRECOMPILED}" \ ${TEST_CASE} diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index d34acefa..18d9670d 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -25,7 +25,7 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${LOG_DIR:="/tmp/logs"} -: ${SSH_RETRY:="0"} +: ${USE_PRECOMPILED:="0"} : ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"} : ${BASE_TARGET:="jammy"} diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index 6a0eb027..64d85c26 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -12,6 +12,13 @@ echo "Current kernel version: $CURRENT_KERNEL" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh +if [ "${USE_PRECOMPILED}" == "1" ]; then + ${SCRIPT_DIR}/findkernelversion.sh + source "${LOG_DIR}"/kernel_version.txt + TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}-${KERNEL_VERSION}-ubuntu22.04" + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.usePrecompiled=true" +fi + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}" # add helm driver repo diff --git a/tests/scripts/remote.sh b/tests/scripts/remote.sh index 37b7da92..dcd8cf9c 100755 --- a/tests/scripts/remote.sh +++ b/tests/scripts/remote.sh @@ -6,7 +6,3 @@ source ${SCRIPT_DIR}/.local.sh # keep alive 60sec and timeout after 30 tries ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}" -ssh_exit_status=$? -if [ $ssh_exit_status -ne 0 ] && [ "$SSH_RETRY" == "1" ]; then - remote_retry -fi \ No newline at end of file