diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e44be0ce..995ca3c3 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -21,6 +21,19 @@ on:
       - completed
     branches:
       - main
+  repository_dispatch:
+    # repository_dispatch accepts only `types`; a `branches` filter is not
+    # supported for this event, so none is declared here.
+    types: [trigger_ci]
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      - precompilede2e
+  push:
+    branches:
+      - precompilede2e
 
 jobs:
   e2e-tests-nvidiadriver:
@@ -30,12 +43,20 @@ jobs:
       - name: Check out code
         uses: actions/checkout@v4
 
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Set up Holodeck
         uses: NVIDIA/holodeck@main
         env:
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+          AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
         with:
           aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -52,20 +73,24 @@ jobs:
         run: |
           echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
           echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
-          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          # FIXME(review): hard-coded debugging value; restore the commit-derived line below before merging
+          # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          echo "COMMIT_SHORT_SHA=shivaku" >> $GITHUB_ENV
           DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
           echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
+          echo "DOCKER_GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}" >> $GITHUB_ENV
 
       - name: Validate gpu driver
         env:
           TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          SSH_RETRY: "0"
         run: |
           sudo chmod 644 ${{ github.workspace }}/.cache/key
           echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
           rc=0
           for driver_version in ${DRIVER_VERSIONS}; do
             echo "Running e2e for DRIVER_VERSION=$driver_version"
-            ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
+            ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${SSH_RETRY} "${DOCKER_GITHUB_TOKEN}" && status=0 || status=$?
             if [ $status -ne 0 ]; then
               echo "e2e validation failed for driver version $driver_version with status $status"
               rc=$status
@@ -82,3 +107,45 @@ jobs:
           name: nvidiadriver-e2e-test-logs
           path: ./logs/
           retention-days: 15
+
+      - name: Precompiled e2e test- upgrade kernel and Validate gpu driver
+        env:
+          TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh"
+          TEST_CASE: "./tests/cases/nvidia-driver.sh"
+        run: |
+          sudo chmod 644 ${{ github.workspace }}/.cache/key
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
+          rc=0
+          for driver_version in ${DRIVER_VERSIONS}; do
+            echo "Running e2e for DRIVER_VERSION=$driver_version"
+            export SSH_RETRY="0"
+            # `&& status=0` resets status on success so a failure from an earlier iteration is not re-reported
+            ./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${SSH_RETRY} "${DOCKER_GITHUB_TOKEN}" && status=0 || status=$?
+            if [ $status -ne 0 ]; then
+              echo "Kernel upgrade failed"
+              rc=$status
+            else
+              # system rebooted enable ssh retry
+              export SSH_RETRY="1"
+              DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
+              source ./tests/scripts/.definitions.sh
+              source "${SCRIPT_DIR}"/kernel_version.txt
+              DRIVER_VERSION="${DRIVER_BRANCH}-${KERNEL_VERSION}-ubuntu22.04"
+              ./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${SSH_RETRY} "${DOCKER_GITHUB_TOKEN}" && status=0 || status=$?
+              if [ $status -ne 0 ]; then
+                echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
+                rc=$status
+              fi
+            fi
+          done
+          source ./tests/scripts/.definitions.sh
+          ./tests/scripts/pull.sh ${LOG_DIR} logs
+          exit $rc
+
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: nvidiadriver-Precompiled-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
index c167a324..4a14d323 100644
--- a/.github/workflows/image.yaml
+++ b/.github/workflows/image.yaml
@@ -22,11 +22,9 @@ on:
       - synchronize
     branches:
       - main
-      - release-*
   push:
     branches:
       - main
-      - release-*
 
 jobs:
   image:
@@ -62,7 +60,9 @@ jobs:
       - name: Calculate build vars
         id: vars
         run: |
-          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          # FIXME(review): hard-coded debugging value; restore the commit-derived line below before merging
+          # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          echo "COMMIT_SHORT_SHA=shivaku" >> $GITHUB_ENV
           echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
           REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
           echo "${REPO_FULL_NAME}"
@@ -125,7 +125,9 @@ jobs:
       - name: Calculate build vars
         id: vars
         run: |
-          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          # FIXME(review): hard-coded debugging value; restore the commit-derived line below before merging
+          # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          echo "COMMIT_SHORT_SHA=shivaku" >> $GITHUB_ENV
           echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
           REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
           echo "${REPO_FULL_NAME}"
diff --git a/tests/cases/nvidia-kernel-upgrade.sh b/tests/cases/nvidia-kernel-upgrade.sh
new file mode 100755
index 00000000..82e43572
--- /dev/null
+++ b/tests/cases/nvidia-kernel-upgrade.sh
@@ -0,0 +1,8 @@
+#! /bin/bash
+# This test case runs the kernel upgrade script (nvidia-kernel-upgrade-aws.sh).
+
+SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
+source "${SCRIPTS_DIR}"/.definitions.sh
+
+# Run the kernel upgrade script
+"${SCRIPTS_DIR}"/nvidia-kernel-upgrade-aws.sh
diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh
index 621a7a8e..ddd5c2ae 100755
--- a/tests/ci-run-e2e.sh
+++ b/tests/ci-run-e2e.sh
@@ -2,14 +2,15 @@
 
 set -xe
 
-if [[ $# -ne 2 ]]; then
-    echo "TEST_CASE TARGET_DRIVER_VERSION are required"
+if [[ $# -lt 2 || $# -gt 4 ]]; then
+    echo "TEST_CASE TARGET_DRIVER_VERSION are required; SSH_RETRY DOCKER_GITHUB_TOKEN are optional"
     exit 1
 fi
 
 export TEST_CASE=${1}
 export TARGET_DRIVER_VERSION=${2}
-
+export SSH_RETRY=${3:-0}            # optional; defaults to no SSH retry
+export DOCKER_GITHUB_TOKEN=${4:-}   # optional; may be empty
 
 TEST_DIR="$(pwd)/tests"
 
diff --git a/tests/local.sh b/tests/local.sh
index 86918588..c70b4498 100755
--- a/tests/local.sh
+++ b/tests/local.sh
@@ -12,16 +12,22 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
 source ${SCRIPT_DIR}/.definitions.sh
 source ${SCRIPT_DIR}/.local.sh
 
+if [ "${SSH_RETRY}" == "1" ]; then
+    remote_retry || exit 1   # stop if the instance never came back online
+fi
+
 # Sync the project folder to the remote
 ${SCRIPT_DIR}/push.sh
 
 # We trigger the installation of prerequisites on the remote instance
 remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh
 
+
 # We trigger the specified test case on the remote instance.
 # Note: We need to ensure that the required environment variables
 # are forwarded to the remote shell.
remote \ PROJECT="${PROJECT}" \ TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \ + DOCKER_GITHUB_TOKEN="${DOCKER_GITHUB_TOKEN}" \ ${TEST_CASE} diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index f254bc00..2c016bbd 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -24,3 +24,11 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${POD_STATUS_TIME_OUT:="2m"} : ${LOG_DIR:="/tmp/logs"} + +: ${SSH_RETRY:="0"} +: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"} + +: ${BASE_TARGET:="jammy"} +: ${KERNEL_FLAVOR:="generic"} + +: ${DOCKER_GITHUB_TOKEN:=""} diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh index 7971a404..f3d98b2f 100644 --- a/tests/scripts/.local.sh +++ b/tests/scripts/.local.sh @@ -3,3 +3,7 @@ function remote() { ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@"" } + +function remote_retry() { + ${SCRIPT_DIR}/remote_retry.sh +} diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh new file mode 100755 index 00000000..ad8e47ae --- /dev/null +++ b/tests/scripts/findkernelversion.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +if [[ "${SKIP_INSTALL}" == "true" ]]; then + echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}"/.definitions.sh + +export REGCTL_VERSION=v0.4.7 +mkdir -p bin +curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64 +chmod a+x bin/regctl +export PATH=$(pwd)/bin:${PATH} +DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' 
-f 1) +regctl image get-file ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH} /var/kernel_version.txt ${SCRIPT_DIR}/kernel_version.txt || true diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index 3acfcffb..9d0cf477 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -5,16 +5,25 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then exit 0 fi +echo "Checking current kernel version..." +CURRENT_KERNEL=$(uname -r) +echo "Current kernel version: $CURRENT_KERNEL" + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}" +# add helm driver repo +# SHIVA +#docker login ${HELM_NVIDIA_REPO} -u x-access-token --password $DOCKER_GITHUB_TOKEN -# add helm driver repo helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update # Create the test namespace kubectl create namespace "${TEST_NAMESPACE}" +kubectl create secret docker-registry ngc-secret --docker-server=${PRIVATE_REGISTRY}/nvidia --docker-username='$oauthtoken' --docker-password=${DOCKER_GITHUB_TOKEN} -n ${TEST_NAMESPACE} +# SHIVA add for precompiled +# --set driver.usePrecompiled=true +OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION} --set imagePullSecrets=ngc-secret --set driver.imagePullSecrets={ngc-secret}" # Run the helm install command echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS" diff --git a/tests/scripts/nvidia-kernel-upgrade-aws.sh b/tests/scripts/nvidia-kernel-upgrade-aws.sh new file mode 100755 index 00000000..ab6f7e78 --- /dev/null +++ b/tests/scripts/nvidia-kernel-upgrade-aws.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [[ "${SKIP_INSTALL}" == "true" ]]; then + echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname 
"${BASH_SOURCE[0]}" )" && pwd )" +source "${SCRIPT_DIR}"/.definitions.sh + +# finding kernel version +${SCRIPT_DIR}/findkernelversion.sh +source "${SCRIPT_DIR}"/kernel_version.txt + +echo "Checking current kernel version..." +CURRENT_KERNEL=$(uname -r) +echo "Current kernel version: $CURRENT_KERNEL" + +echo "" +echo "" +echo "--------------Starting the Precompiled kernel version ${KERNEL_VERSION} upgrade--------------" + +sudo apt-get update -y +sudo apt-get install linux-image-${KERNEL_VERSION} -y +if [ $? -ne 0 ]; then + echo "Kernel upgrade failed." + exit 1 +fi + +echo "Checking the upgraded kernel version ${KERNEL_VERSION}..." +CURRENT_KERNEL=$(uname -r) +echo "Upgraded kernel version: $CURRENT_KERNEL" + +echo "update grub ..." +sudo update-grub +echo "Rebooting ..." +# Run the reboot command with nohup to avoid abrupt SSH closure issues +nohup sudo reboot & + +echo "--------------Installation of kernel completed --------------" + +# Exit with a success code since the reboot command was issued successfully +exit 0 diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh new file mode 100755 index 00000000..7a7b073e --- /dev/null +++ b/tests/scripts/remote_retry.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh +source ${SCRIPT_DIR}/.local.sh + +try_ssh_connection() { + ssh -o ConnectTimeout=10 -i ${private_key} ${instance_hostname} "exit" + return $? +} + +echo "Waiting for aws system to come back online..." +START_TIME=$(date +%s) +while true; do + sleep 60 # sleep before as system restarted earlier + try_ssh_connection + if [ $? -eq 0 ]; then + echo "Successfully connected to aws system after reboot." + break; + fi + ELAPSED_TIME=$(($(date +%s) - START_TIME)) + if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then + echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} minutes after reboot." 
+ exit 1 + fi + echo "ssh retry again..." +done