Skip to content

Commit

Permalink
Pre end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Aug 22, 2024
1 parent df8f5da commit 4ed43ab
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 18 deletions.
23 changes: 12 additions & 11 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,14 @@ jobs:
# - name: Validate gpu driver
# env:
# TEST_CASE: "./tests/cases/nvidia-driver.sh"
# SSH_RETRY: "0"
# USE_PRECOMPILED: "0"
# run: |
# sudo chmod 644 ${{ github.workspace }}/.cache/key
# echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
# rc=0
# for driver_version in ${DRIVER_VERSIONS}; do
# echo "Running e2e for DRIVER_VERSION=$driver_version"
# ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${SSH_RETRY} || status=$?
# ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${USE_PRECOMPILED} || status=$?
# if [ $status -ne 0 ]; then
# echo "e2e validation failed for driver version $driver_version with status $status"
# rc=$status
Expand Down Expand Up @@ -110,21 +110,22 @@ jobs:
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
export SSH_RETRY="1"
./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${SSH_RETRY} || status=$?
export USE_PRECOMPILED="0"
./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${USE_PRECOMPILED} || status=$?
if [ $status -ne 0 ]; then
echo "Kernel upgrade failed"
rc=$status
else
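# The system has rebooted after the kernel upgrade: switch to the precompiled-driver run (tests/local.sh calls remote_retry first when USE_PRECOMPILED=1)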
export SSH_RETRY="0"
export USE_PRECOMPILED="1"
DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' -f 1)
source ./tests/scripts/.definitions.sh
mkdir -p logs
./tests/scripts/pull.sh ${LOG_DIR}/kernel_version.txt logs/kernel_version.txt
source logs/kernel_version.txt
DRIVER_VERSION="${DRIVER_BRANCH}-${KERNEL_VERSION}-ubuntu22.04"
./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${SSH_RETRY} || status=$?
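# NOTE(shivakunv): the kernel-version lookup below is disabled here; tests/scripts/install-operator.sh appends the kernel version itself when USE_PRECOMPILED=1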
# source ./tests/scripts/.definitions.sh
# mkdir -p logs
# ./tests/scripts/pull.sh ${LOG_DIR}/kernel_version.txt logs/kernel_version.txt
# source logs/kernel_version.txt
DRIVER_VERSION="${DRIVER_BRANCH}"
./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${USE_PRECOMPILED} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
Expand Down
4 changes: 2 additions & 2 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
set -xe

if [[ $# -ne 3 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION SSH_RETRY are required"
echo "TEST_CASE TARGET_DRIVER_VERSION USE_PRECOMPILED are required"
exit 1
fi

export TEST_CASE=${1}
export TARGET_DRIVER_VERSION=${2}
export SSH_RETRY=${3}
export USE_PRECOMPILED=${3}

TEST_DIR="$(pwd)/tests"

Expand Down
5 changes: 5 additions & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

if [ "${USE_PRECOMPILED}" == "1" ]; then
remote_retry
fi

# Sync the project folder to the remote
${SCRIPT_DIR}/push.sh

Expand All @@ -24,4 +28,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
USE_PRECOMPILED="${USE_PRECOMPILED}" \
${TEST_CASE}
2 changes: 1 addition & 1 deletion tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${LOG_DIR:="/tmp/logs"}

: ${SSH_RETRY:="0"}
: ${USE_PRECOMPILED:="0"}
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}
Expand Down
7 changes: 7 additions & 0 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ echo "Current kernel version: $CURRENT_KERNEL"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

if [ "${USE_PRECOMPILED}" == "1" ]; then
${SCRIPT_DIR}/findkernelversion.sh
source "${LOG_DIR}"/kernel_version.txt
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}-${KERNEL_VERSION}-ubuntu22.04"
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.usePrecompiled=true"
fi

OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"

# add helm driver repo
Expand Down
4 changes: 0 additions & 4 deletions tests/scripts/remote.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,3 @@ source ${SCRIPT_DIR}/.local.sh

# Send a keep-alive probe every 60 seconds and drop the connection after 30 consecutive unanswered probes
ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}"
ssh_exit_status=$?
if [ $ssh_exit_status -ne 0 ] && [ "$SSH_RETRY" == "1" ]; then
remote_retry
fi

0 comments on commit 4ed43ab

Please sign in to comment.