From b3958ad677ab9cf39e8080fb9ca01986908f7881 Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Sat, 24 Aug 2024 12:26:37 +0530 Subject: [PATCH] Pre-compiled end-to-end gpu driver validation 1.Pre-compiled end-to-end gpu driver validation Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 53 +++--------- .github/workflows/ci_precompiled.yaml | 120 ++++++++++++++++++++++++++ .github/workflows/image.yaml | 4 +- tests/ci-run-e2e.sh | 10 ++- tests/local.sh | 7 +- tests/scripts/.definitions.sh | 4 +- tests/scripts/.local.sh | 2 + tests/scripts/install-operator.sh | 14 ++- 8 files changed, 149 insertions(+), 65 deletions(-) create mode 100644 .github/workflows/ci_precompiled.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3276285a..8558fe07 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,7 +20,7 @@ on: types: - completed branches: - - e2etestdriver + - e2etestdriver_no pull_request: types: @@ -29,12 +29,12 @@ on: branches: # - main # - release-* - - e2etestdriver + - e2etestdriver_no push: branches: # - main # - release-* - - e2etestdriver + - e2etestdriver_no jobs: e2e-tests-nvidiadriver: @@ -55,12 +55,12 @@ jobs: aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} holodeck_config: "tests/holodeck.yaml" - + - name: Get public dns name id: get_public_dns_name uses: mikefarah/yq@master with: - cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml + cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml - name: Set and Calculate test vars run: | @@ -70,17 +70,19 @@ jobs: echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}') echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV - + echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV + - name: Validate gpu 
driver env: TEST_CASE: "./tests/cases/nvidia-driver.sh" - USE_PRECOMPILED: "0" + OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia" run: | rc=0 for driver_version in ${DRIVER_VERSIONS}; do echo "Running e2e for DRIVER_VERSION=$driver_version" status=0 - ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${USE_PRECOMPILED} || status=$? + echo "SHIVA==== ${OPERATOR_OPTIONS}" + ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$? if [ $status -ne 0 ]; then echo "e2e validation failed for driver version $driver_version with status $status" rc=$status @@ -96,38 +98,3 @@ jobs: name: nvidiadriver-e2e-test-logs path: ./logs/ retention-days: 15 - - - name: Precompiled e2e test- upgrade kernel and Validate gpu driver - env: - TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh" - TEST_CASE: "./tests/cases/nvidia-driver.sh" - USE_PRECOMPILED: "1" - run: | - rc=0 - for driver_version in ${DRIVER_VERSIONS}; do - echo "Running e2e for DRIVER_VERSION=$driver_version" - status=0 - ./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${USE_PRECOMPILED} || status=$? - if [ $status -ne 0 ]; then - echo "Kernel upgrade failed" - rc=$status - else - DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1) - DRIVER_VERSION="${DRIVER_BRANCH}" - ./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${USE_PRECOMPILED} || status=$? 
- if [ $status -ne 0 ]; then - echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" - rc=$status - fi - fi - done - ./tests/scripts/pull.sh /tmp/logs logs - exit $rc - - - name: Archive test logs - if: ${{ failure() }} - uses: actions/upload-artifact@v4 - with: - name: nvidiadriver-Precompiled-e2e-test-logs - path: ./logs/ - retention-days: 15 diff --git a/.github/workflows/ci_precompiled.yaml b/.github/workflows/ci_precompiled.yaml new file mode 100644 index 00000000..e37aa1df --- /dev/null +++ b/.github/workflows/ci_precompiled.yaml @@ -0,0 +1,120 @@ +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Pre-Compiled End-to-end tests + +on: + workflow_run: + workflows: [image] + types: + - completed + branches: + - e2etestdriver_no + + pull_request: + types: + - opened + - synchronize + branches: + # - main + # - release-* + - e2etestdriver + push: + branches: + # - main + # - release-* + - e2etestdriver + +jobs: + e2e-tests-nvidiadriver: + # strategy: + # matrix: + # flavor: + # - aws + # - azure + # - generic + # - nvidia + # - oracle + runs-on: ubuntu-latest + + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Set up Holodeck + uses: NVIDIA/holodeck@v0.2.1 + env: + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }} + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} + holodeck_config: "tests/holodeck.yaml" + + - name: Get public dns name + id: get_public_dns_name + uses: mikefarah/yq@master + with: + cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml + + - name: Set and Calculate test vars + run: | + echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV + echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem + echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}') + echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV + echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV + + - name: Precompiled e2e test- upgrade kernel and Validate gpu driver + env: + TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh" + TEST_CASE: "./tests/cases/nvidia-driver.sh" + OPERATOR_OPTIONS: "--set 
driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true" + run: | + rc=0 + echo "SHIVAAAAAAAAA" + for driver_version in ${DRIVER_VERSIONS}; do + echo "Running e2e for DRIVER_VERSION=$driver_version" + status=0 + echo "SHIVA==== ${OPERATOR_OPTIONS}" + ./tests/ci-run-e2e.sh "${TEST_CASE_KERNEL_UPGRADE}" "${driver_version}" "${OPERATOR_OPTIONS}" || status=$? + echo "shiva3" + if [ $status -ne 0 ]; then + echo "Kernel upgrade failed" + rc=$status + else + ./tests/scripts/remote_retry.sh + DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1) + DRIVER_VERSION="${DRIVER_BRANCH}" + ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$? + if [ $status -ne 0 ]; then + echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" + rc=$status + fi + fi + done + ./tests/scripts/pull.sh /tmp/logs logs + exit $rc + + - name: Archive test logs + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: nvidiadriver-Precompiled-e2e-test-logs + path: ./logs/ + retention-days: 15 diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index ef794ed3..3a2f1d2c 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -23,12 +23,12 @@ on: branches: # - main # - release-* - - e2etestdriver + - e2etestdriver_no push: branches: # - main # - release-* - - e2etestdriver + - e2etestdriver_no jobs: image: diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh index c9543b0b..a54ecfa1 100755 --- a/tests/ci-run-e2e.sh +++ b/tests/ci-run-e2e.sh @@ -3,13 +3,15 @@ set -xe if [[ $# -ne 3 ]]; then - echo "TEST_CASE TARGET_DRIVER_VERSION USE_PRECOMPILED are required" + echo "TEST_CASE TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required" exit 1 fi -export TEST_CASE=${1} -export TARGET_DRIVER_VERSION=${2} -export USE_PRECOMPILED=${3} +export TEST_CASE="${1}" +export TARGET_DRIVER_VERSION="${2}" +export OPERATOR_OPTIONS="${3}" + +echo "SHIVA=====
${OPERATOR_OPTIONS}" TEST_DIR="$(pwd)/tests" diff --git a/tests/local.sh b/tests/local.sh index 640c85ec..318234c3 100755 --- a/tests/local.sh +++ b/tests/local.sh @@ -12,10 +12,6 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )" source ${SCRIPT_DIR}/.definitions.sh source ${SCRIPT_DIR}/.local.sh -if [ "${USE_PRECOMPILED}" == "1" ]; then - remote_retry -fi - # Sync the project folder to the remote ${SCRIPT_DIR}/push.sh @@ -25,8 +21,9 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites. # We trigger the specified test case on the remote instance. # Note: We need to ensure that the required environment variables # are forwarded to the remote shell. +echo "SHIVA====== ${OPERATOR_OPTIONS}" remote \ PROJECT="${PROJECT}" \ TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \ - USE_PRECOMPILED="${USE_PRECOMPILED}" \ + OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" \ ${TEST_CASE} diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index f2b2ec8a..e895680f 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -14,8 +14,6 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${TEST_NAMESPACE:="test-operator"} -: ${PRIVATE_REGISTRY:="ghcr.io"} - : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"} : ${TARGET_DRIVER_VERSION:="550.90.07"} @@ -25,7 +23,7 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${LOG_DIR:="/tmp/logs"} -: ${USE_PRECOMPILED:="0"} +: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"} : ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"} : ${BASE_TARGET:="jammy"} diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh index f3d98b2f..4407f486 100644 --- a/tests/scripts/.local.sh +++ b/tests/scripts/.local.sh @@ -1,7 +1,9 @@ #!/usr/env bash function remote() { + echo "SHIVA1" ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@"" + echo "SHIVA2" } function remote_retry() { diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh 
index 02db4d80..69152956 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -12,11 +12,8 @@ echo "Current kernel version: $CURRENT_KERNEL" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh -if [ "${USE_PRECOMPILED}" == "1" ]; then - OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.usePrecompiled=true" -fi - -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}" +echo "SHIVA=== ${OPERATOR_OPTIONS}" +OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.version=${TARGET_DRIVER_VERSION}" # add helm driver repo helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update @@ -24,9 +21,10 @@ helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update # Create the test namespace kubectl create namespace "${TEST_NAMESPACE}" +echo "SHIVA==1 ${OPERATOR_OPTIONS}" # Run the helm install command -echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS" -${HELM} install gpu-operator nvidia/gpu-operator \ +eval ${HELM} install gpu-operator nvidia/gpu-operator \ -n "${TEST_NAMESPACE}" \ - ${OPERATOR_OPTIONS} \ + "${OPERATOR_OPTIONS}" \ --wait +echo "SHIVA==2 ${OPERATOR_OPTIONS}" \ No newline at end of file