Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Precompilede2e #90

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 68 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,19 @@ on:
- completed
branches:
- main
repository_dispatch:
types: [trigger_ci]
branches:
- precompilede2e
pull_request:
types:
- opened
- synchronize
branches:
- precompilede2e
push:
branches:
- precompilede2e

jobs:
e2e-tests-nvidiadriver:
Expand All @@ -30,12 +43,20 @@ jobs:
- name: Check out code
uses: actions/checkout@v4

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Set up Holodeck
uses: NVIDIA/holodeck@main
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
Expand All @@ -52,20 +73,24 @@ jobs:
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# SHIVA
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=shivaku" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
echo "DOCKER_GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}" >> $GITHUB_ENV

- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
SSH_RETRY: "0"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${SSH_RETRY} ${DOCKER_GITHUB_TOKEN}|| status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
Expand All @@ -82,3 +107,44 @@ jobs:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15

# Precompiled-driver e2e: for every driver version, run the kernel-upgrade test
# case (which ends by rebooting the instance), then validate the matching
# precompiled driver image with SSH retry enabled.
- name: Precompiled e2e test- upgrade kernel and Validate gpu driver
  env:
    TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh"
    TEST_CASE: "./tests/cases/nvidia-driver.sh"
  run: |
    sudo chmod 644 ${{ github.workspace }}/.cache/key
    # The "Validate gpu driver" step already writes this key and sets it to
    # mode 400; only create it when missing so the redirect cannot fail on a
    # read-only file.
    if [ ! -s "${private_key}" ]; then
      echo "${{ secrets.AWS_SSH_KEY }}" > "${private_key}" && chmod 400 "${private_key}"
    fi
    rc=0
    for driver_version in ${DRIVER_VERSIONS}; do
      echo "Running e2e for DRIVER_VERSION=$driver_version"
      # `|| status=$?` assigns only on failure, so reset it for every
      # iteration; otherwise a failure for one driver version is re-reported
      # for all following ones.
      status=0
      export SSH_RETRY="0"
      # Arguments are quoted so an empty token still counts as the fourth
      # positional argument that ci-run-e2e.sh requires.
      ./tests/ci-run-e2e.sh "${TEST_CASE_KERNEL_UPGRADE}" "${driver_version}" "${SSH_RETRY}" "${DOCKER_GITHUB_TOKEN}" || status=$?
      if [ "${status}" -ne 0 ]; then
        echo "Kernel upgrade failed"
        rc=$status
        continue
      fi
      # system rebooted enable ssh retry
      export SSH_RETRY="1"
      # TARGET_DRIVER_VERSION is exported only inside ci-run-e2e.sh (a child
      # process), so derive the branch from the loop variable instead.
      DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
      source ./tests/scripts/.definitions.sh
      # NOTE(review): kernel_version.txt appears to be written on the remote
      # instance by findkernelversion.sh; confirm it is also available on the
      # runner and that .definitions.sh defines SCRIPT_DIR.
      if [ ! -r "${SCRIPT_DIR}/kernel_version.txt" ]; then
        # Under the default `bash -e` a failing `source` would abort the step
        # before the logs are pulled; record the failure and keep going.
        echo "kernel_version.txt not found in '${SCRIPT_DIR}' for driver version $driver_version"
        rc=1
        continue
      fi
      source "${SCRIPT_DIR}"/kernel_version.txt
      DRIVER_VERSION="${DRIVER_BRANCH}-${KERNEL_VERSION}-ubuntu22.04"
      ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${SSH_RETRY}" "${DOCKER_GITHUB_TOKEN}" || status=$?
      if [ "${status}" -ne 0 ]; then
        echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
        rc=$status
      fi
    done
    source ./tests/scripts/.definitions.sh
    ./tests/scripts/pull.sh ${LOG_DIR} logs
    exit $rc

# Upload ./logs/ as a build artifact, but only when an earlier step failed.
- name: Archive test logs
  if: failure()
  uses: actions/upload-artifact@v4
  with:
    name: nvidiadriver-Precompiled-e2e-test-logs
    path: ./logs/
    retention-days: 15
35 changes: 23 additions & 12 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,9 @@ on:
- synchronize
branches:
- main
- release-*
push:
branches:
- main
- release-*

jobs:
image:
Expand Down Expand Up @@ -62,7 +60,9 @@ jobs:
- name: Calculate build vars
id: vars
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# SHIVA
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=shivaku" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
echo "${REPO_FULL_NAME}"
Expand Down Expand Up @@ -94,6 +94,7 @@ jobs:
VERSION: ${COMMIT_SHORT_SHA}
run: |
DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
echo "SHIVA"

pre-compiled:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -125,7 +126,9 @@ jobs:
- name: Calculate build vars
id: vars
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# SHIVA
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=shivaku" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
echo "${REPO_FULL_NAME}"
Expand Down Expand Up @@ -160,19 +163,27 @@ jobs:
run: |
make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}

trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
# trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
# docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# # try 3 times every 10 seconds to get the file, if success exit the loop
# for i in {1..3}; do
# docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
# sleep 10
# done
# echo "KUMAR"
# echo "cat kernel_version.txt"
# echo "=============="
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_ubuntu22.04
run: |
source kernel_version.txt && \
echo "SHIVA"
echo "SHIVA============= ${VERSION}"
echo "cat kernel_version.txt"
echo "=============="
# source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
echo "SHIVA"
8 changes: 8 additions & 0 deletions tests/cases/nvidia-kernel-upgrade.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#! /bin/bash
# This test case upgrades the kernel on the test instance by delegating to
# scripts/nvidia-kernel-upgrade-aws.sh.

# Resolve the sibling scripts directory relative to this file and load the
# shared test definitions from it.
SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
source "${SCRIPTS_DIR}"/.definitions.sh

# Run the kernel upgrade script
"${SCRIPTS_DIR}"/nvidia-kernel-upgrade-aws.sh
7 changes: 4 additions & 3 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

set -xe

if [[ $# -ne 2 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION are required"
if [[ $# -ne 4 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION SSH_RETRY DOCKER_GITHUB_TOKEN are required"
exit 1
fi

export TEST_CASE=${1}
export TARGET_DRIVER_VERSION=${2}

export SSH_RETRY=${3}
export DOCKER_GITHUB_TOKEN=${4}

TEST_DIR="$(pwd)/tests"

Expand Down
6 changes: 6 additions & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,22 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

if [ "${SSH_RETRY}" == "1" ]; then
remote_retry
fi

# Sync the project folder to the remote
${SCRIPT_DIR}/push.sh

# We trigger the installation of prerequisites on the remote instance
remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh


# We trigger the specified test case on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
DOCKER_GITHUB_TOKEN="${DOCKER_GITHUB_TOKEN}" \
${TEST_CASE}
8 changes: 8 additions & 0 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,11 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}

: ${SSH_RETRY:="0"}
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}
: ${KERNEL_FLAVOR:="generic"}

: ${DOCKER_GITHUB_TOKEN:=""}
4 changes: 4 additions & 0 deletions tests/scripts/.local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
# Run a command through remote.sh from inside the ${PROJECT} directory.
# The caller's arguments are appended after "cd ${PROJECT} && ".
# Note: $@ sits between two quoted strings, so it is expanded unquoted; the
# first argument is glued to the "cd ... && " prefix and the remaining ones are
# passed to remote.sh as separate, word-split arguments.
function remote() {
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
}

# Invoke ${SCRIPT_DIR}/remote_retry.sh with no arguments; its exit status
# becomes the status of this function.
function remote_retry() {
${SCRIPT_DIR}/remote_retry.sh
}
17 changes: 17 additions & 0 deletions tests/scripts/findkernelversion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Fetches /var/kernel_version.txt from the precompiled base driver image
#   ghcr.io/nvidia/driver:base-<BASE_TARGET>-<KERNEL_FLAVOR>-<DRIVER_BRANCH>
# and stores it next to this script as kernel_version.txt.
#
# Environment:
#   SKIP_INSTALL               when "true", do nothing and exit 0
#   TARGET_DRIVER_VERSION      its first dot-separated field is used as the driver branch
#   BASE_TARGET, KERNEL_FLAVOR image tag components (defaulted in .definitions.sh)
#
# Exit status: 1 when regctl cannot be downloaded, 0 otherwise. Fetching the
# file itself stays best effort, but a failure is now reported on stderr.

if [[ "${SKIP_INSTALL}" == "true" ]]; then
    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

export REGCTL_VERSION=v0.4.7
mkdir -p bin
# --fail: without it an HTTP error page would be saved as bin/regctl and then
# marked executable.
if ! curl -fsSLo bin/regctl "https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64"; then
    echo "Failed to download regctl ${REGCTL_VERSION}" >&2
    exit 1
fi
chmod a+x bin/regctl
export PATH=$(pwd)/bin:${PATH}
DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' -f 1)
BASE_IMAGE="ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH}"
# Best effort as before: a failed fetch does not fail this script, but it is
# no longer silent.
if ! regctl image get-file "${BASE_IMAGE}" /var/kernel_version.txt "${SCRIPT_DIR}/kernel_version.txt"; then
    echo "WARNING: could not fetch /var/kernel_version.txt from ${BASE_IMAGE}" >&2
fi
13 changes: 11 additions & 2 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,25 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then
exit 0
fi

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
# add helm driver repo
# SHIVA
#docker login ${HELM_NVIDIA_REPO} -u x-access-token --password $DOCKER_GITHUB_TOKEN

# add helm driver repo
helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update

# Create the test namespace
kubectl create namespace "${TEST_NAMESPACE}"
kubectl create secret docker-registry ngc-secret --docker-server=${PRIVATE_REGISTRY}/nvidia --docker-username='$oauthtoken' --docker-password=${DOCKER_GITHUB_TOKEN} -n ${TEST_NAMESPACE}
# SHIVA add for precompiled
# --set driver.usePrecompiled=true
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION} --set imagePullSecrets=ngc-secret --set driver.imagePullSecrets={ngc-secret}"

# Run the helm install command
echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
Expand Down
43 changes: 43 additions & 0 deletions tests/scripts/nvidia-kernel-upgrade-aws.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# Installs the kernel named in kernel_version.txt (obtained by
# findkernelversion.sh) and reboots the machine so it becomes active.
#
# Environment:
#   SKIP_INSTALL  when "true", do nothing and exit 0
#
# Exit status: 1 when the kernel version cannot be determined or the package
# installation fails; 0 once the reboot has been issued.

if [[ "${SKIP_INSTALL}" == "true" ]]; then
    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

# finding kernel version
"${SCRIPT_DIR}"/findkernelversion.sh

# findkernelversion.sh fetches the file on a best-effort basis, so verify the
# result here instead of installing a package called "linux-image-".
if [[ ! -r "${SCRIPT_DIR}/kernel_version.txt" ]]; then
    echo "Kernel upgrade failed: ${SCRIPT_DIR}/kernel_version.txt was not found."
    exit 1
fi
source "${SCRIPT_DIR}"/kernel_version.txt
if [[ -z "${KERNEL_VERSION}" ]]; then
    echo "Kernel upgrade failed: KERNEL_VERSION is not set by kernel_version.txt."
    exit 1
fi

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"

echo ""
echo ""
echo "--------------Starting the Precompiled kernel version ${KERNEL_VERSION} upgrade--------------"

sudo apt-get update -y
if ! sudo apt-get install "linux-image-${KERNEL_VERSION}" -y; then
    echo "Kernel upgrade failed."
    exit 1
fi

# uname -r reports the kernel that is running now; the newly installed one
# only becomes active after the reboot below, so word the log accordingly.
CURRENT_KERNEL=$(uname -r)
echo "Installed kernel ${KERNEL_VERSION}; running kernel before reboot: $CURRENT_KERNEL"

echo "update grub ..."
sudo update-grub
echo "Rebooting ..."
# Run the reboot command with nohup to avoid abrupt SSH closure issues
nohup sudo reboot &

echo "--------------Installation of kernel completed --------------"

# Exit with a success code since the reboot command was issued successfully
exit 0
27 changes: 27 additions & 0 deletions tests/scripts/remote_retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Waits until the instance accepts SSH connections again after a reboot.
#
# Environment:
#   private_key                  identity file passed to ssh -i
#   instance_hostname            ssh destination
#   SYSTEM_ONLINE_CHECK_TIMEOUT  give-up threshold in seconds
#
# Exit status: 0 once an SSH connection succeeds (the loop ends and the script
# falls through), 1 when the timeout elapses first.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh
source "${SCRIPT_DIR}"/.local.sh

# Attempt a single SSH connection; returns ssh's exit status.
try_ssh_connection() {
    ssh -o ConnectTimeout=10 -i "${private_key}" "${instance_hostname}" "exit"
}

echo "Waiting for aws system to come back online..."
START_TIME=$(date +%s)
while true; do
    sleep 60 # sleep before as system restarted earlier
    if try_ssh_connection; then
        echo "Successfully connected to aws system after reboot."
        break
    fi
    ELAPSED_TIME=$(($(date +%s) - START_TIME))
    if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
        # The timeout is compared against `date +%s` deltas, so it is in seconds.
        echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot."
        exit 1
    fi
    echo "ssh retry again..."
done
Loading