end-to-end gpu driver testing enhancement

shivakunv committed Aug 16, 2024
1 parent 600b7bf commit bf4b25a
Showing 7 changed files with 175 additions and 121 deletions.
38 changes: 30 additions & 8 deletions .github/workflows/ci.yaml
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI
name: End-to-end tests

on:
workflow_run:
@@ -22,14 +22,14 @@ on:
branches:
- main

push:
branches:
- main

jobs:
e2e-tests-nvidiadriver:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535.183.06
- 550.90.07

steps:
- name: Check out code
@@ -41,7 +41,6 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,11 +58,34 @@
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "DRIVER_VERSIONS=$(make -f versions.mk DRIVER_VERSIONS)" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
rc=0
for driver_version in ${{ env.DRIVER_VERSIONS }}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
# ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version}
./tests/ci-run-e2e.sh ${TEST_CASE} ${driver_version}
status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh ${LOG_DIR} logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
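With the hard-coded driver matrix removed, the version list now comes from versions.mk at run time, and the loop keeps the last non-zero status in rc instead of exiting early, so every driver version is exercised and the logs are pulled before the job fails. A minimal local sketch of the same pattern, assuming (as the workflow does) that `make -f versions.mk DRIVER_VERSIONS` prints a space-separated version list:

#!/bin/bash
# Sketch: run the e2e case once per driver version, remember the last
# failure, and exit only after every version has been tried.
DRIVER_VERSIONS=$(make -f versions.mk DRIVER_VERSIONS)  # e.g. "535.183.06 550.90.07"
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
    echo "Running e2e for DRIVER_VERSION=${driver_version}"
    ./tests/ci-run-e2e.sh ./tests/cases/nvidia-driver.sh "${driver_version}" || rc=$?
done
exit $rc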
161 changes: 82 additions & 79 deletions .github/workflows/image.yaml
@@ -93,86 +93,89 @@ jobs:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
run: |
DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
# DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
echo "SHIVA compeleted image building"
pre-compiled:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535
- 550
flavor:
- aws
- azure
- generic
- nvidia
- oracle
ispr:
- ${{github.event_name == 'pull_request'}}
exclude:
- ispr: true
flavor: azure
- ispr: true
flavor: aws
- ispr: true
flavor: nvidia
- ispr: true
flavor: oracle
steps:
- uses: actions/checkout@v4
name: Check out code
- name: Calculate build vars
id: vars
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
echo "${REPO_FULL_NAME}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
# pre-compiled:
# runs-on: ubuntu-latest
# strategy:
# matrix:
# driver:
# - 535
# - 550
# flavor:
# - aws
# - azure
# - generic
# - nvidia
# - oracle
# ispr:
# - ${{github.event_name == 'pull_request'}}
# exclude:
# - ispr: true
# flavor: azure
# - ispr: true
# flavor: aws
# - ispr: true
# flavor: nvidia
# - ispr: true
# flavor: oracle
# steps:
# - uses: actions/checkout@v4
# name: Check out code
# - name: Calculate build vars
# id: vars
# run: |
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
# REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
# echo "${REPO_FULL_NAME}"
# echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV

GENERATE_ARTIFACTS="false"
if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
GENERATE_ARTIFACTS="false"
elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
GENERATE_ARTIFACTS="true"
elif [[ "${{ github.event_name }}" == "push" ]]; then
GENERATE_ARTIFACTS="true"
fi
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
# GENERATE_ARTIFACTS="false"
# if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
# GENERATE_ARTIFACTS="false"
# elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
# GENERATE_ARTIFACTS="true"
# elif [[ "${{ github.event_name }}" == "push" ]]; then
# GENERATE_ARTIFACTS="true"
# fi
# echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
# echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build base image and get kernel version
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: jammy
run: |
make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
# - name: Set up QEMU
# uses: docker/setup-qemu-action@v3
# - name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v3
# with:
# registry: ghcr.io
# username: ${{ github.actor }}
# password: ${{ secrets.GITHUB_TOKEN }}
# - name: Build base image and get kernel version
# env:
# IMAGE_NAME: ghcr.io/nvidia/driver
# VERSION: ${COMMIT_SHORT_SHA}
# BASE_TARGET: jammy
# run: |
# make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}

trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_ubuntu22.04
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
# trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
# docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# # try 3 times every 10 seconds to get the file, if success exit the loop
# for i in {1..3}; do
# docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
# sleep 10
# done
# - name: Build image
# env:
# IMAGE_NAME: ghcr.io/nvidia/driver
# VERSION: ${COMMIT_SHORT_SHA}
# PRECOMPILED: "true"
# DIST: signed_ubuntu22.04
# run: |
# source kernel_version.txt && \
# make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
6 changes: 6 additions & 0 deletions tests/scripts/.definitions.sh
@@ -19,3 +19,9 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="10m"}
: ${POD_STATUS_TIME_OUT:="2m"}
: ${MAX_POD_STATUS_CHECK_TOTAL_TIME:="3000"} # 50 minutes

: ${LOG_DIR:="/tmp/logs"}
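The `: ${VAR:="default"}` lines assign only when a variable is unset, so any of these knobs can be overridden per run without editing the file. A hypothetical invocation with longer timeouts:

# Unset variables keep the defaults from .definitions.sh.
DAEMON_POD_STATUS_TIME_OUT=20m POD_STATUS_TIME_OUT=5m ./tests/scripts/verify-operator.sh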
36 changes: 9 additions & 27 deletions tests/scripts/checks.sh
@@ -2,35 +2,17 @@

check_pod_ready() {
local pod_label=$1
local pod_status_time_out=$2
local current_time=0
while :; do
echo "Checking $pod_label pod"
kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod"

kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod readiness"
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
echo "Checking $pod_label pod readiness"

if [ "${is_pod_ready}" = "True" ]; then
# Check if the pod is not in terminating state
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
if [ "${is_pod_terminating}" != "" ]; then
echo "pod $pod_label is in terminating state..."
else
echo "Pod $pod_label is ready"
break;
fi
fi
kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout=${pod_status_time_out}
local wait_status=$?

if [[ "${current_time}" -gt $((60 * 45)) ]]; then
echo "timeout reached"
exit 1;
fi

# Echo useful information on stdout
kubectl get pods -n ${TEST_NAMESPACE}

echo "Sleeping 5 seconds"
current_time=$((${current_time} + 5))
sleep 5
done
# Print pod status, then propagate the kubectl wait status
kubectl get pods -n ${TEST_NAMESPACE}
return ${wait_status}
}
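Polling is now delegated to `kubectl wait`, and the timeout arrives as the function's second argument, matching the calls in verify-operator.sh below. A short usage sketch under those assumptions:

source tests/scripts/.definitions.sh
source tests/scripts/checks.sh

# Blocks until the driver daemonset pods report Ready; the kubectl wait
# status is propagated, so this fails after DAEMON_POD_STATUS_TIME_OUT (10m).
check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}"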
1 change: 0 additions & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,7 +7,6 @@ echo ""
echo ""
echo "--------------Installing the GPU Operator--------------"

# Install the operator with usePrecompiled mode set to true
${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh
12 changes: 12 additions & 0 deletions tests/scripts/pull.sh
@@ -0,0 +1,12 @@
#!/bin/bash

if [[ $# -ne 2 ]]; then
echo "Pull requires a source and destination"
exit 1
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

${SCRIPT_DIR}/sync.sh ${instance_hostname}:${1} ${2}
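pull.sh assumes that `.local.sh` exports `instance_hostname` (written by the CI job above) and that `sync.sh` copies its first argument to its second. A hypothetical scp-based sync.sh consistent with that contract — the real script may differ:

#!/bin/bash
# Hypothetical sketch: recursively copy a (possibly remote) source path
# to a local destination using the key the CI job wrote to disk.
scp -r -i "${private_key}" -o StrictHostKeyChecking=no "$1" "$2"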
42 changes: 36 additions & 6 deletions tests/scripts/verify-operator.sh
@@ -11,9 +11,39 @@ source ${SCRIPT_DIR}/.definitions.sh
# Import the check definitions
source ${SCRIPT_DIR}/checks.sh

check_pod_ready "nvidia-driver-daemonset"
check_pod_ready "nvidia-container-toolkit-daemonset"
check_pod_ready "nvidia-device-plugin-daemonset"
check_pod_ready "nvidia-dcgm-exporter"
check_pod_ready "gpu-feature-discovery"
check_pod_ready "nvidia-operator-validator"
# Wait for the nvidia-driver pod to be ready.
# If successful, wait for the validator pod to be ready (this implies the remaining pods are healthy).
# Collect logs in case of failure.
start_time=$(date +%s)
log_dir=${LOG_DIR}
while :; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))

# Exit if the total elapsed time exceeds MAX_POD_STATUS_CHECK_TOTAL_TIME
if [ $elapsed_time -gt $MAX_POD_STATUS_CHECK_TOTAL_TIME ]; then
echo "Total wait time exceeded ${MAX_POD_STATUS_CHECK_TOTAL_TIME} seconds. Exiting..."
kubectl delete pods --all -n ${TEST_NAMESPACE}
exit 1
fi

check_pod_ready "nvidia-driver-daemonset" ${DAEMON_POD_STATUS_TIME_OUT} && \
check_pod_ready "nvidia-operator-validator" ${POD_STATUS_TIME_OUT}

not_ready_pod_status=$(kubectl get pods -n ${TEST_NAMESPACE} --field-selector=status.phase!=Running -o jsonpath='{.items[*].metadata.name}')
if [ -n "$not_ready_pod_status" ]; then
for pod in $not_ready_pod_status; do
echo "Collecting logs for pod: $pod"
echo "------------------------------------------------" >> "${log_dir}/${pod}.describe"
kubectl -n "${TEST_NAMESPACE}" describe pods "${pod}" >> "${log_dir}/${pod}.describe"
kubectl logs $pod -n ${TEST_NAMESPACE} --all-containers=true >> "${log_dir}/${pod}_logs.txt" || true
echo "Logs saved to ${log_dir}/${pod}_logs.txt"
done
else
echo "All gpu-operator pods are ready."
kubectl delete pods --all -n ${TEST_NAMESPACE}
break;
fi

sleep 10
done
