From 6da9934afc2e1362c2d2fa934717f10aa9c89b4d Mon Sep 17 00:00:00 2001
From: shiva kumar
Date: Thu, 22 Aug 2024 13:14:06 +0530
Subject: [PATCH] Pre-compiled end-to-end gpu driver validation

Signed-off-by: shiva kumar
---
 .github/workflows/ci-precompiled.yaml     | 192 ++++++++++++++++++++++
 .github/workflows/ci.yaml                 |  14 +-
 .github/workflows/image.yaml              |  10 +-
 tests/ci-run-e2e.sh                       |  11 +-
 tests/local.sh                            |   1 +
 tests/scripts/.definitions.sh             |   7 +-
 tests/scripts/.local.sh                   |   4 +
 tests/scripts/.rsync-excludes             |   4 -
 tests/scripts/.rsync-includes             |   2 +
 tests/scripts/end-to-end-nvidia-driver.sh |   2 +-
 tests/scripts/findkernelversion.sh        |  31 ++++
 tests/scripts/install-operator.sh         |  12 +-
 tests/scripts/kernel-upgrade-helper.sh    |  56 +++++++
 tests/scripts/remote_retry.sh             |  67 ++++++++
 tests/scripts/sync.sh                     |   3 +-
 tests/scripts/upgrade-kernel.sh           |   8 +
 16 files changed, 397 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/ci-precompiled.yaml
 delete mode 100644 tests/scripts/.rsync-excludes
 create mode 100644 tests/scripts/.rsync-includes
 create mode 100755 tests/scripts/findkernelversion.sh
 create mode 100755 tests/scripts/kernel-upgrade-helper.sh
 create mode 100755 tests/scripts/remote_retry.sh
 create mode 100755 tests/scripts/upgrade-kernel.sh

diff --git a/.github/workflows/ci-precompiled.yaml b/.github/workflows/ci-precompiled.yaml
new file mode 100644
index 00000000..0ae2f689
--- /dev/null
+++ b/.github/workflows/ci-precompiled.yaml
@@ -0,0 +1,192 @@
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Pre-Compiled End-to-end tests
+
+on:
+  # workflow_run:
+  #   workflows: [image]
+  #   types:
+  #     - completed
+  #   branches:
+  #     - e2etestdriver_no
+  # workflow_dispatch:
+
+  # pull_request:
+  #   types:
+  #     - opened
+  #     - synchronize
+  #   branches:
+  #   # - main
+  #   # - release-*
+  #     - e2etestdriver
+  push:
+    branches:
+      # - main
+      # - release-*
+      - e2etestdriver-test
+
+jobs:
+  e2e-driver-version-compare:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
+      matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
+    steps:
+
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set kernel version
+        id: set_kernel_version
+        env:
+          BASE_TARGET: "jammy"
+          DIST: "ubuntu22.04"
+        run: |
+          export PRIVATE_REGISTRY="ghcr.io"
+          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
+          # currently azure image upgrade is failing
+          # KERNEL_FLAVORS=("aws" "azure" "generic" "nvidia" "oracle")
+          KERNEL_FLAVORS=("aws" "generic" "nvidia" "oracle")
+          echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
+          kernel_versions=()
+          for KERNEL_FLAVOR in "${KERNEL_FLAVORS[@]}"; do
+            for driver_version in ${DRIVER_VERSIONS}; do
+              DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
+              source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${KERNEL_FLAVOR}" "$DRIVER_VERSION" "$DIST"
+              if [[ "$should_continue" == true ]]; then
+                echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
+                break
+              fi
+            done
+            if [[ "$should_continue" == false ]]; then
+              echo "The last successful e2e-tests-nvidiadriver was on the same tag ($KERNEL_VERSION). Skipping e2e-tests-nvidiadriver."
+            else
+              # remove any space , newlines for json format
+              KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
+              kernel_versions+=("$KERNEL_VERSION")
+              echo "Proceeding with $KERNEL_FLAVOR $KERNEL_VERSION e2e-tests-nvidiadriver."
+            fi
+          done
+          # Convert array to JSON format and assign
+          echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
+          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
+          echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
+
+  e2e-tests-nvidiadriver:
+    runs-on: ubuntu-latest
+    needs: e2e-driver-version-compare
+    if: ${{ needs.e2e-driver-version-compare.outputs.matrix_values_not_empty == '1' }}
+    strategy:
+      matrix:
+        kernel_version: ${{ fromJson(needs.e2e-driver-version-compare.outputs.matrix_values) }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Holodeck
+        uses: NVIDIA/holodeck@v0.2.1
+        env:
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+          # AWS_REGION: "us-west-1"
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
+          holodeck_config: "tests/holodeck.yaml"
+
+      - name: Get public dns name
+        id: get_public_dns_name
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+
+      - name: Get aws instance id
+        id: get_instance_id
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "instance-id") | .value' /github/workspace/.cache/holodeck.yaml
+
+      - name: Set and Calculate test vars
+        run: |
+          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
+          echo "instance_id=${{ steps.get_instance_id.outputs.result }}" >> $GITHUB_ENV
+          echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
+          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
+          echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+          KERNEL_VERSION="${{ matrix.kernel_version }}"
+          echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
+
+      - name: Precompiled e2e test upgrade kernel and Validate gpu driver
+        env:
+          UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
+          TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
+          # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          # AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+          # AWS_REGION: "us-west-1"
+        run: |
+          rc=0
+          for driver_version in ${DRIVER_VERSIONS}; do
+            echo "Running e2e for DRIVER_VERSION=$driver_version"
+            DRIVER_VERSION=$(echo "${driver_version}" | cut -d '.' -f 1)
+            # Use ARG3=OPERATOR_OPTIONS as KERNEL_VERSION in case of kernel upgrade
+            status=0
+            ./tests/ci-run-e2e.sh "${UPGRADE_KERNEL_SCRIPT}" "${DRIVER_VERSION}" "${KERNEL_VERSION}" || status=$?
+            # On the target system, all scripts/test-case exit with code 1 for error handling.
+            # However, since reboot-related disconnections break the SSH connection
+            # and can cause the entire job to exit, we should ignore all errors except
+            # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
+            # other errors as code 1 will ensure proper management of reboot scenarios
+            if [ $status -eq 1 ]; then
+              echo "e2e validation failed for driver branch $DRIVER_VERSION and kernel version $KERNEL_VERSION with status $status"
+              rc=$status
+              continue
+            fi
+            status=0
+            ./tests/scripts/remote_retry.sh || status=$?
+            if [ $status -ne 0 ]; then
+              echo "Failed to connect to aws instance"
+              rc=$status
+              exit 1
+            fi
+            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
+            if [ $status -eq 1 ]; then
+              echo "e2e validation failed for driver version $driver_version with status $status"
+              rc=$status
+            fi
+          done
+          ./tests/scripts/pull.sh /tmp/logs logs
+          exit $rc
+
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: nvidiadriver-Precompiled-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 3c6e8690..264bc62a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -46,31 +46,33 @@ jobs:
         id: get_public_dns_name
         uses: mikefarah/yq@master
         with:
-          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
 
       - name: Set and Calculate test vars
         run: |
           echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
           echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
           echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
           DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
           echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
-
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+
       - name: Validate gpu driver
         env:
           TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
         run: |
-          echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
           rc=0
           for driver_version in ${DRIVER_VERSIONS}; do
             echo "Running e2e for DRIVER_VERSION=$driver_version"
-            ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
+            status=0
+            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
             if [ $status -ne 0 ]; then
               echo "e2e validation failed for driver version $driver_version with status $status"
               rc=$status
             fi
           done
-          source ./tests/scripts/.definitions.sh
           ./tests/scripts/pull.sh /tmp/logs logs
           exit $rc
 
@@ -80,4 +82,4 @@ jobs:
         with:
           name: nvidiadriver-e2e-test-logs
           path: ./logs/
-          retention-days: 15
+          retention-days: 15
diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
index bfeb3dbd..89fd49ce 100644
--- a/.github/workflows/image.yaml
+++ b/.github/workflows/image.yaml
@@ -21,12 +21,14 @@ on:
       - opened
       - synchronize
     branches:
-      - main
-      - release-*
+      # - main
+      # - release-*
+      - e2etestdriver-test-no
   push:
     branches:
-      - main
-      - release-*
+      # - main
+      # - release-*
+      - e2etestdriver-test-no
 
 jobs:
   image:
diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh
index 621a7a8e..c57f3360 100755
--- a/tests/ci-run-e2e.sh
+++ b/tests/ci-run-e2e.sh
@@ -2,14 +2,15 @@
 
 set -xe
 
-if [[ $# -ne 2 ]]; then
-    echo "TEST_CASE TARGET_DRIVER_VERSION are required"
+if [[ $# -ne 3 ]]; then
+    echo "TEST_CASE TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required"
+    echo "Use OPERATOR_OPTIONS as KERNEL_VERSION in case of kernel upgrade"
     exit 1
 fi
 
-export TEST_CASE=${1}
-export TARGET_DRIVER_VERSION=${2}
-
+export TEST_CASE="${1}"
+export TARGET_DRIVER_VERSION="${2}"
+export OPERATOR_OPTIONS="${3}"
 
 TEST_DIR="$(pwd)/tests"
 
diff --git a/tests/local.sh b/tests/local.sh
index 86918588..67167522 100755
--- a/tests/local.sh
+++ b/tests/local.sh
@@ -24,4 +24,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
 remote \
     PROJECT="${PROJECT}" \
     TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
+    OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" \
     ${TEST_CASE}
diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh
index f254bc00..ae46b270 100644
--- a/tests/scripts/.definitions.sh
+++ b/tests/scripts/.definitions.sh
@@ -14,8 +14,6 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 
 : ${TEST_NAMESPACE:="test-operator"}
 
-: ${PRIVATE_REGISTRY:="ghcr.io"}
-
 : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
 
 : ${TARGET_DRIVER_VERSION:="550.90.07"}
@@ -24,3 +22,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 : ${POD_STATUS_TIME_OUT:="2m"}
 
 : ${LOG_DIR:="/tmp/logs"}
+
+: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"}
+: ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}
+
+: ${BASE_TARGET:="jammy"}
diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh
index 7971a404..f3d98b2f 100644
--- a/tests/scripts/.local.sh
+++ b/tests/scripts/.local.sh
@@ -3,3 +3,7 @@
 function remote() {
     ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
 }
+
+function remote_retry() {
+    ${SCRIPT_DIR}/remote_retry.sh
+}
diff --git a/tests/scripts/.rsync-excludes b/tests/scripts/.rsync-excludes
deleted file mode 100644
index 06c2f6ef..00000000
--- a/tests/scripts/.rsync-excludes
+++ /dev/null
@@ -1,4 +0,0 @@
-vendor/
-.git
-cnt-ci
-key.pem
diff --git a/tests/scripts/.rsync-includes b/tests/scripts/.rsync-includes
new file mode 100644
index 00000000..f91de959
--- /dev/null
+++ b/tests/scripts/.rsync-includes
@@ -0,0 +1,2 @@
+tests/
+tests/***
diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh
index d272efab..ab2db9a1 100755
--- a/tests/scripts/end-to-end-nvidia-driver.sh
+++ b/tests/scripts/end-to-end-nvidia-driver.sh
@@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh
 
 "${SCRIPT_DIR}"/verify-operator.sh
 
-echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
+echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------"
 
 ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
 
diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh
new file mode 100755
index 00000000..260ec3f4
--- /dev/null
+++ b/tests/scripts/findkernelversion.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+if [[ $# -ne 4 ]]; then
+    echo " BASE_TARGET KERNEL_FLAVOR DRIVER_BRANCH DIST are required"
+    exit 1
+fi
+
+export BASE_TARGET="${1}"
+export KERNEL_FLAVOR="${2}"
+export DRIVER_BRANCH="${3}"
+export DIST="${4}"
+
+export REGCTL_VERSION=v0.4.7
+mkdir -p bin
+curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
+chmod a+x bin/regctl
+export PATH=$(pwd)/bin:${PATH}
+
+# calculate kernel version of latest image
+regctl image get-file ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH} /var/kernel_version.txt ./kernel_version.txt
+export $(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt)
+
+# calculate driver tag
+# should_continue=false when this driver tag is already listed in the registry
+status=0
+regctl tag ls "${PRIVATE_REGISTRY}"/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status=$?
+if [[ $status -eq 0 ]]; then
+    export should_continue=false
+else
+    export should_continue=true
+fi
diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh
index 3acfcffb..b6785512 100755
--- a/tests/scripts/install-operator.sh
+++ b/tests/scripts/install-operator.sh
@@ -5,10 +5,14 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then
     exit 0
 fi
 
+echo "Checking current kernel version..."
+CURRENT_KERNEL=$(uname -r)
+echo "Current kernel version: $CURRENT_KERNEL"
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source ${SCRIPT_DIR}/.definitions.sh
 
-OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
+OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.version=${TARGET_DRIVER_VERSION}"
 
 # add helm driver repo
 helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
@@ -17,8 +21,8 @@ helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
 kubectl create namespace "${TEST_NAMESPACE}"
 
 # Run the helm install command
-echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
-${HELM} install gpu-operator nvidia/gpu-operator \
+echo "OPERATOR_OPTIONS: ${OPERATOR_OPTIONS}"
+eval ${HELM} install gpu-operator nvidia/gpu-operator \
     -n "${TEST_NAMESPACE}" \
-    ${OPERATOR_OPTIONS} \
+    "${OPERATOR_OPTIONS}" \
     --wait
diff --git a/tests/scripts/kernel-upgrade-helper.sh b/tests/scripts/kernel-upgrade-helper.sh
new file mode 100755
index 00000000..a3d8b4a5
--- /dev/null
+++ b/tests/scripts/kernel-upgrade-helper.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+if [[ "${SKIP_INSTALL}" == "true" ]]; then
+    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
+    exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}"/.definitions.sh
+
+export KERNEL_VERSION="${OPERATOR_OPTIONS}"
+
+echo "Checking current kernel version..."
+CURRENT_KERNEL=$(uname -r)
+echo "Current kernel version: $CURRENT_KERNEL"
+
+if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
+    echo ""
+    echo ""
+    echo "--------------Upgrading kernel to ${KERNEL_VERSION}--------------"
+
+    # Set non-interactive frontend for apt and disable editor prompts
+    # Perform the installation non-interactively
+    export DEBIAN_FRONTEND=noninteractive
+    export EDITOR=/bin/true
+    echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
+
+    sudo apt-get update -y || true
+
+    # The removal of the currently running kernel (apt remove linux-image-*) sometimes works and sometimes does not.
+    # Occasionally, it requires two reboots, or an apt upgrade. However, removing all traces of the old/current
+    # kernel from the boot directory works consistently, which is why this approach has been adopted.
+    sudo rm -rf /boot/*${CURRENT_KERNEL}* || true
+    sudo rm -rf /lib/modules/*${CURRENT_KERNEL}*
+    sudo rm -rf /boot/*.old
+
+    #install new kernel
+    sudo apt-get install --allow-downgrades linux-image-${KERNEL_VERSION} linux-headers-${KERNEL_VERSION} linux-modules-${KERNEL_VERSION} -y
+    if [ $? -ne 0 ]; then
+        echo "Kernel upgrade failed."
+        exit 1
+    fi
+    echo "update grub and initramfs..."
+    sudo update-grub || true
+    sudo update-initramfs -u -k ${KERNEL_VERSION} || true
+    echo "Rebooting ..."
+    # Run the reboot command with nohup to avoid abrupt SSH closure issues
+    nohup sudo reboot &
+
+    echo "--------------Kernel upgrade completed--------------"
+else
+    echo "--------------Kernel upgrade not required, current kernel version ${KERNEL_VERSION}--------------"
+fi
+
+# Exit with a success code since the reboot command was issued successfully
+exit 0
diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh
new file mode 100755
index 00000000..533e1ac5
--- /dev/null
+++ b/tests/scripts/remote_retry.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source ${SCRIPT_DIR}/.definitions.sh
+
+# START_TIME=$(date +%s)
+    # without sleep 120 seconds aws not working
+# timeout "${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT}" aws ec2 wait instance-status-ok --instance-ids "${instance_id}"
+# if [ $? -ne 0 ]; then
+#     echo "AWS Instance did not become available within ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} seconds."
+#     exit 1
+# fi
+# timeout "${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT}" aws ec2 describe-instances --instance-ids "${instance_id}" --query 'Reservations[0].Instances[0].State.Name' --output text
+# STATE=$(aws ec2 describe-instances --instance-ids "${instance_id}" --query 'Reservations[0].Instances[0].State.Name' --output text)
+# if [ "$STATE" != "running" ]; then
+#     echo "AWS Instance is not in running condition within ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} seconds."
+#     exit 1
+# fi
+
+# while true; do
+#     INSTANCE_STATUS=$(aws ec2 describe-instance-status --instance-ids "${instance_id}" --query 'InstanceStatuses[0].InstanceStatus.Status' --output text)
+#     SYSTEM_STATUS=$(aws ec2 describe-instance-status --instance-ids "${instance_id}" --query 'InstanceStatuses[0].SystemStatus.Status' --output text)
+#     if [ "$INSTANCE_STATUS" == "ok" ] && [ "$SYSTEM_STATUS" == "ok" ]; then
+#         echo "AWS Instance and System Status: ok"
+#         exit 0
+#     fi
+#     ELAPSED_TIME=$(($(date +%s) - START_TIME))
+#     if [ "$ELAPSED_TIME" -ge "$AWS_SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
+#         echo "Failed to connect to aws within ${AWS_SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot."
+#         exit 1
+#     fi
+#     sleep 10
+# done
+
+# exit 1
+
+INTERVAL=30
+SECONDS_ELAPSED=0
+
+# Function to handle timeout exit
+handle_timeout() {
+    echo "Failed to connect within the timeout period of $AWS_SYSTEM_ONLINE_CHECK_TIMEOUT seconds."
+    exit 1
+}
+
+# Set trap for timeout
+trap handle_timeout EXIT
+
+sleep $INTERVAL
+while [ $SECONDS_ELAPSED -lt $AWS_SYSTEM_ONLINE_CHECK_TIMEOUT ]; do
+#    echo "Attempting to connect to ${instance_hostname}... (Elapsed time: $SECONDS_ELAPSED seconds)"
+
+    # Attempt to connect via SSH and ignore errors
+    ssh -o ConnectTimeout=5 -i ${private_key} ${instance_hostname} "echo 'AWS system Connected successfully!'" >/dev/null 2>&1
+
+    # Check if SSH command succeeded
+    if [ $? -eq 0 ]; then
+        echo "Successfully connected to ${instance_hostname}."
+        trap - EXIT  # Disable the timeout trap since the connection was successful
+        exit 0
+    fi
+
+    # If SSH fails, wait for the interval before retrying
+    sleep "${INTERVAL}"
+    SECONDS_ELAPSED=$((SECONDS_ELAPSED + INTERVAL))
+done
diff --git a/tests/scripts/sync.sh b/tests/scripts/sync.sh
index cb020752..555d7b86 100755
--- a/tests/scripts/sync.sh
+++ b/tests/scripts/sync.sh
@@ -12,6 +12,7 @@ source ${SCRIPT_DIR}/.local.sh
 
 rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
     -avz --delete \
-        --exclude-from="${SCRIPT_DIR}/.rsync-excludes" \
+        --include-from="${SCRIPT_DIR}/.rsync-includes" \
+        --exclude='*' \
     ${@}
 
diff --git a/tests/scripts/upgrade-kernel.sh b/tests/scripts/upgrade-kernel.sh
new file mode 100755
index 00000000..e7d90ec3
--- /dev/null
+++ b/tests/scripts/upgrade-kernel.sh
@@ -0,0 +1,8 @@
+#! /bin/bash
+# Entry point for the kernel upgrade step; delegates to kernel-upgrade-helper.sh.
+
+SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
+source "${SCRIPTS_DIR}"/.definitions.sh
+
+# Run the kernel upgrade helper
+"${SCRIPTS_DIR}"/kernel-upgrade-helper.sh