diff --git a/.github/workflows/ci-precompiled.yaml b/.github/workflows/ci-precompiled.yaml
new file mode 100644
index 00000000..d838c563
--- /dev/null
+++ b/.github/workflows/ci-precompiled.yaml
@@ -0,0 +1,165 @@
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Pre-Compiled End-to-end tests
+
+on:
+  schedule:
+    - cron: '00 10 * * *'  # runs daily at 10:00 UTC, one hour after the precompiled image build job (09:00 UTC)
+
+jobs:
+  e2e-driver-version-compare:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
+      matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
+    steps:
+
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set kernel version
+        id: set_kernel_version
+        env:
+          BASE_TARGET: "jammy"
+          DIST: "ubuntu22.04"
+        run: |
+          export PRIVATE_REGISTRY="ghcr.io"
+          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
+          # the azure kernel image upgrade is currently failing, so the azure flavor is excluded
+          # KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
+          KERNEL_FLAVORS=("aws" "generic" "nvidia" "oracle")
+          echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
+          kernel_versions=()
+          for KERNEL_FLAVOR in "${KERNEL_FLAVORS[@]}"; do
+            for driver_version in ${DRIVER_VERSIONS}; do
+              DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
+              source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${KERNEL_FLAVOR}" "$DRIVER_VERSION" "$DIST"
+              if [[ "$should_continue" == true ]]; then
+                echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
+                break
+              fi
+            done
+            if [[ "$should_continue" == false ]]; then
+              echo "The last successful e2e-tests-nvidiadriver ran against the same tag ($KERNEL_VERSION). Skipping e2e-tests-nvidiadriver for the $KERNEL_FLAVOR flavor."
+            else
+              # strip spaces and newlines so the value is valid JSON
+              KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
+              kernel_versions+=("$KERNEL_VERSION")
+              echo "Proceeding with $KERNEL_FLAVOR $KERNEL_VERSION e2e-tests-nvidiadriver."
+            fi
+          done
+          # Convert the array to JSON and expose it as a job output
+          echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
+          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
+          echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
+
+  e2e-tests-nvidiadriver:
+    runs-on: ubuntu-latest
+    needs: e2e-driver-version-compare
+    if: ${{ needs.e2e-driver-version-compare.outputs.matrix_values_not_empty == '1' }}
+    strategy:
+      matrix:
+        kernel_version: ${{ fromJson(needs.e2e-driver-version-compare.outputs.matrix_values) }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Holodeck
+        uses: NVIDIA/holodeck@v0.2.1
+        env:
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
+          holodeck_config: "tests/holodeck.yaml"
+
+      - name: Get public dns name
+        id: get_public_dns_name
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+
+      - name: Set and Calculate test vars
+        run: |
+          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
+          echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
+          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
+          echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+          KERNEL_VERSION="${{ matrix.kernel_version }}"
+          echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
+
+      - name: Precompiled e2e test upgrade kernel and Validate gpu driver
+        env:
+          UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
+          TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
+        run: |
+          rc=0
+          for driver_version in ${DRIVER_VERSIONS}; do
+            echo "Running e2e for DRIVER_VERSION=$driver_version"
+            DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
+            # The third argument (normally OPERATOR_OPTIONS) carries KERNEL_VERSION when the kernel-upgrade script is run
+            status=0
+            ./tests/ci-run-e2e.sh "${UPGRADE_KERNEL_SCRIPT}" "${DRIVER_VERSION}" "${KERNEL_VERSION}" || status=$?
+            # On the target system, every script/test case exits with code 1 on a real failure.
+            # The kernel-upgrade reboot drops the SSH connection, which surfaces here as a
+            # different non-zero status and would otherwise abort the whole job. Only exit
+            # code 1 is therefore treated as a test failure; any other non-zero status is
+            # assumed to be a reboot-related disconnect and is ignored.
+            if [ $status -eq 1 ]; then
+              echo "e2e validation failed for driver branch $DRIVER_VERSION and kernel version $KERNEL_VERSION with status $status"
+              rc=$status
+              continue
+            fi
+            ./tests/scripts/remote_retry.sh || status=$?
+            if [ $status -ne 0 ]; then
+              echo "Failed to reconnect to the AWS instance"
+              rc=$status
+              exit 1
+            fi
+            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
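+            # As above, only exit code 1 from the validation run counts as a failure for this driver version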
+            if [ $status -eq 1 ]; then
+              echo "e2e validation failed for driver version $driver_version with status $status"
+              rc=$status
+            fi
+          done
+          ./tests/scripts/pull.sh /tmp/logs logs
+          exit $rc
+
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: nvidiadriver-Precompiled-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 3c6e8690..264bc62a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -46,31 +46,33 @@ jobs:
         id: get_public_dns_name
         uses: mikefarah/yq@master
         with:
-          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
 
       - name: Set and Calculate test vars
         run: |
           echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
           echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
           echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
           DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
           echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
-
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+
       - name: Validate gpu driver
         env:
           TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
         run: |
-          echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
           rc=0
           for driver_version in ${DRIVER_VERSIONS}; do
             echo "Running e2e for DRIVER_VERSION=$driver_version"
-            ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
+            status=0
+            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
             if [ $status -ne 0 ]; then
               echo "e2e validation failed for driver version $driver_version with status $status"
               rc=$status
             fi
           done
-          source ./tests/scripts/.definitions.sh
           ./tests/scripts/pull.sh /tmp/logs logs
           exit $rc
@@ -80,4 +82,4 @@ jobs:
       with:
         name: nvidiadriver-e2e-test-logs
         path: ./logs/
-        retention-days: 15
+        retention-days: 15
diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh
index 621a7a8e..c57f3360 100755
--- a/tests/ci-run-e2e.sh
+++ b/tests/ci-run-e2e.sh
@@ -2,14 +2,15 @@
 
 set -xe
 
-if [[ $# -ne 2 ]]; then
-  echo "TEST_CASE TARGET_DRIVER_VERSION are required"
+if [[ $# -ne 3 ]]; then
+  echo "TEST_CASE TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required"
+  echo "Use OPERATOR_OPTIONS as KERNEL_VERSION in case of kernel upgrade"
   exit 1
 fi
 
-export TEST_CASE=${1}
-export TARGET_DRIVER_VERSION=${2}
-
+export TEST_CASE="${1}"
+export TARGET_DRIVER_VERSION="${2}"
+export OPERATOR_OPTIONS="${3}"
 
 TEST_DIR="$(pwd)/tests"
diff --git a/tests/local.sh b/tests/local.sh
index 86918588..67167522 100755
--- a/tests/local.sh
+++ b/tests/local.sh
@@ -24,4 +24,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh
 remote \
     PROJECT="${PROJECT}" \
     TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
+    OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" \
     ${TEST_CASE}
diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh
index f254bc00..e895680f 100644
--- a/tests/scripts/.definitions.sh
+++ b/tests/scripts/.definitions.sh
@@ -14,8 +14,6 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 
 : ${TEST_NAMESPACE:="test-operator"}
 
-: ${PRIVATE_REGISTRY:="ghcr.io"}
-
 : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
 
 : ${TARGET_DRIVER_VERSION:="550.90.07"}
@@ -24,3 +22,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 : ${POD_STATUS_TIME_OUT:="2m"}
 
 : ${LOG_DIR:="/tmp/logs"}
+
+: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"}
+: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}
+
+: ${BASE_TARGET:="jammy"}
diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh
index 7971a404..f3d98b2f 100644
--- a/tests/scripts/.local.sh
+++ b/tests/scripts/.local.sh
@@ -3,3 +3,7 @@
 function remote() {
     ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
 }
+
+function remote_retry() {
+    ${SCRIPT_DIR}/remote_retry.sh
+}
diff --git a/tests/scripts/.rsync-excludes b/tests/scripts/.rsync-excludes
deleted file mode 100644
index 06c2f6ef..00000000
--- a/tests/scripts/.rsync-excludes
+++ /dev/null
@@ -1,4 +0,0 @@
-vendor/
-.git
-cnt-ci
-key.pem
diff --git a/tests/scripts/.rsync-includes b/tests/scripts/.rsync-includes
new file mode 100644
index 00000000..f91de959
--- /dev/null
+++ b/tests/scripts/.rsync-includes
@@ -0,0 +1,2 @@
+tests/
+tests/***
diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh
index d272efab..ab2db9a1 100755
--- a/tests/scripts/end-to-end-nvidia-driver.sh
+++ b/tests/scripts/end-to-end-nvidia-driver.sh
@@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh
 
 "${SCRIPT_DIR}"/verify-operator.sh
 
-echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
+echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------"
 
 ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh
new file mode 100755
index 00000000..eae85a74
--- /dev/null
+++ b/tests/scripts/findkernelversion.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+if [[ $# -ne 4 ]]; then
+    echo "BASE_TARGET KERNEL_FLAVOR DRIVER_BRANCH DIST are required"
+    exit 1
+fi
+
+export BASE_TARGET="${1}"
+export KERNEL_FLAVOR="${2}"
+export DRIVER_BRANCH="${3}"
+export DIST="${4}"
+
+export REGCTL_VERSION=v0.4.7
+mkdir -p bin
+curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
+chmod a+x bin/regctl
+export PATH=$(pwd)/bin:${PATH}
+
+# calculate the kernel version of the latest base image
+regctl image get-file ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH} /var/kernel_version.txt ./kernel_version.txt
+export $(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt)
+
+# calculate driver tag
+status=0
+regctl tag ls "${PRIVATE_REGISTRY}"/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status=$?
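+# A tag that already exists in the registry means this driver/kernel combination was already built and tested, so the e2e run for this flavor can be skipped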
+if [[ $status -eq 0 ]]; then
+    export should_continue=false
+else
+    export should_continue=true
+fi
diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh
index 3acfcffb..b6785512 100755
--- a/tests/scripts/install-operator.sh
+++ b/tests/scripts/install-operator.sh
@@ -5,10 +5,14 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then
     exit 0
 fi
 
+echo "Checking current kernel version..."
+CURRENT_KERNEL=$(uname -r)
+echo "Current kernel version: $CURRENT_KERNEL"
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source ${SCRIPT_DIR}/.definitions.sh
 
-OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
+OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.version=${TARGET_DRIVER_VERSION}"
 
 # add helm driver repo
 helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
@@ -17,8 +21,8 @@ helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
 kubectl create namespace "${TEST_NAMESPACE}"
 
 # Run the helm install command
-echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
-${HELM} install gpu-operator nvidia/gpu-operator \
+echo "OPERATOR_OPTIONS: ${OPERATOR_OPTIONS}"
+eval ${HELM} install gpu-operator nvidia/gpu-operator \
     -n "${TEST_NAMESPACE}" \
-    ${OPERATOR_OPTIONS} \
+    "${OPERATOR_OPTIONS}" \
     --wait
diff --git a/tests/scripts/kernel-upgrade-helper.sh b/tests/scripts/kernel-upgrade-helper.sh
new file mode 100755
index 00000000..a3d8b4a5
--- /dev/null
+++ b/tests/scripts/kernel-upgrade-helper.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+if [[ "${SKIP_INSTALL}" == "true" ]]; then
+    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
+    exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}"/.definitions.sh
+
+# OPERATOR_OPTIONS carries the target kernel version for this test case
+export KERNEL_VERSION="${OPERATOR_OPTIONS}"
+
+echo "Checking current kernel version..."
+CURRENT_KERNEL=$(uname -r)
+echo "Current kernel version: $CURRENT_KERNEL"
+
+if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
+    echo ""
+    echo ""
+    echo "--------------Upgrading kernel to ${KERNEL_VERSION}--------------"
+
+    # Set the non-interactive frontend for apt and disable editor prompts
+    # so the installation runs unattended
+    export DEBIAN_FRONTEND=noninteractive
+    export EDITOR=/bin/true
+    echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
+
+    sudo apt-get update -y || true
+
+    # Removing the running kernel with "apt remove linux-image-*" is unreliable: it sometimes
+    # needs two reboots or an additional apt upgrade. Deleting all traces of the current kernel
+    # from /boot and /lib/modules works consistently, which is why this approach has been adopted.
+    sudo rm -rf /boot/*${CURRENT_KERNEL}* || true
+    sudo rm -rf /lib/modules/*${CURRENT_KERNEL}*
+    sudo rm -rf /boot/*.old
+
+    # install the new kernel
+    sudo apt-get install --allow-downgrades linux-image-${KERNEL_VERSION} linux-headers-${KERNEL_VERSION} linux-modules-${KERNEL_VERSION} -y
+    if [ $? -ne 0 ]; then
+        echo "Kernel upgrade failed."
+        exit 1
+    fi
+    echo "update grub and initramfs..."
+    sudo update-grub || true
+    sudo update-initramfs -u -k ${KERNEL_VERSION} || true
+    echo "Rebooting ..."
+    # Run the reboot command with nohup to avoid abrupt SSH closure issues
+    nohup sudo reboot &
+
+    echo "--------------Kernel upgrade initiated, system is rebooting--------------"
+else
+    echo "--------------Kernel upgrade not required, already running kernel ${KERNEL_VERSION}--------------"
+fi
+
+# Exit with a success code since the reboot command was issued successfully
+exit 0
diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh
new file mode 100755
index 00000000..f838884c
--- /dev/null
+++ b/tests/scripts/remote_retry.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source ${SCRIPT_DIR}/.definitions.sh
+INTERVAL=30
+SECONDS_ELAPSED=0
+
+set +e
+# Function to handle timeout exit
+handle_timeout() {
+    echo "Failed to connect within the timeout period of $SYSTEM_ONLINE_CHECK_TIMEOUT seconds."
+    exit 1
+}
+
+# Set trap for timeout
+trap handle_timeout EXIT
+
+# wait before the first attempt so a pending reboot has time to take the system offline
+sleep 60;
+
+while [ $SECONDS_ELAPSED -lt $SYSTEM_ONLINE_CHECK_TIMEOUT ]; do
+    # Attempt to connect via SSH and ignore errors
+    status=0
+    (
+        ssh -o ConnectTimeout=5 -i ${private_key} ${instance_hostname} "exit"
+    ) >/dev/null 2>&1
+    status=$?
+    if [ $status -eq 0 ]; then
+        echo "Successfully connected to ${instance_hostname}."
+        trap - EXIT  # Disable the timeout trap since the connection was successful
+        exit 0
+    fi
+
+    sleep $INTERVAL
+    SECONDS_ELAPSED=$((SECONDS_ELAPSED + INTERVAL))
+    echo "ssh retry... elapsed time $SECONDS_ELAPSED"
+done
diff --git a/tests/scripts/sync.sh b/tests/scripts/sync.sh
index cb020752..555d7b86 100755
--- a/tests/scripts/sync.sh
+++ b/tests/scripts/sync.sh
@@ -12,6 +12,7 @@ source ${SCRIPT_DIR}/.local.sh
 
 rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
     -avz --delete \
-    --exclude-from="${SCRIPT_DIR}/.rsync-excludes" \
+    --include-from="${SCRIPT_DIR}/.rsync-includes" \
+    --exclude='*' \
     ${@}
diff --git a/tests/scripts/upgrade-kernel.sh b/tests/scripts/upgrade-kernel.sh
new file mode 100755
index 00000000..e7d90ec3
--- /dev/null
+++ b/tests/scripts/upgrade-kernel.sh
@@ -0,0 +1,8 @@
+#! /bin/bash
+# This test case upgrades the kernel on the remote system to the requested version.
+
+SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
+source "${SCRIPTS_DIR}"/.definitions.sh
+
+# Run the kernel upgrade helper
+"${SCRIPTS_DIR}"/kernel-upgrade-helper.sh