Skip to content

Commit

Permalink
end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
  • Loading branch information
shivakunv committed Aug 19, 2024
1 parent a019667 commit 6c5a1e9
Show file tree
Hide file tree
Showing 17 changed files with 450 additions and 83 deletions.
107 changes: 107 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# End-to-end GPU driver validation workflow.
#
# Runs after the "image" workflow finishes, and on pull requests / pushes
# that target the draftawsvalidation branch.
name: CI

permissions:
  id-token: write
  contents: read

on:
  workflow_run:
    workflows: [image]
    # workflow_run only accepts the activity types "completed", "requested"
    # and "in_progress". There is no "success" type; the success filter is
    # applied through the job-level `if` below.
    types:
      - completed
    branches:
      - draftawsvalidation

  pull_request:
    types:
      - opened
      - synchronize
    branches:
      # - main
      # - release-*
      # - drivervalidation
      - draftawsvalidation
  push:
    branches:
      # - main
      # - release-*
      # - drivervalidation
      - draftawsvalidation

jobs:
  e2e-tests-nvidiadriver:
    # For workflow_run events, only continue when the triggering run
    # concluded successfully. pull_request and push events always run.
    if: ${{ github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so the versions are always read as strings.
        driver:
          - "535.183.06"
          - "550.90.07"

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      # Static access keys are passed together with role-to-assume, so the
      # action assumes the role for a 1800-second session in us-west-1.
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          role-session-name: github-actions-session
          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
          aws-region: "us-west-1"
          role-duration-seconds: 1800

      # Smoke check that the credentials work: list the EC2 key pair names.
      - name: check aws env
        run: |
          echo "SHIVA kumar"
          aws ec2 describe-key-pairs --query 'KeyPairs[*].KeyName' --output text
          echo "SHIVA kumar"
# - name: Set up Holodeck
# uses: NVIDIA/holodeck@main
# env:
# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
# AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
# AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
# with:
# aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
# aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
# holodeck_config: "tests/holodeck.yaml"

# - name: Get public dns name
# id: get_public_dns_name
# uses: mikefarah/yq@master
# with:
# cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

# - name: Set and Calculate test vars
# run: |
# echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
# echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
# # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# echo "COMMIT_SHORT_SHA=f0d936d3" >> $GITHUB_ENV

# - name: Validate gpu driver
# env:
# TEST_CASE: "./tests/cases/nvidia-driver.sh"
# run: |
# sudo chmod 644 ${{ github.workspace }}/.cache/key
# echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
# ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
169 changes: 86 additions & 83 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ on:
- opened
- synchronize
branches:
- main
- release-*
# - main
# - release-*
- draftawsvalidation
push:
branches:
- main
- release-*
# - main
# - release-*
- draftawsvalidation

jobs:
image:
Expand Down Expand Up @@ -98,86 +100,87 @@ jobs:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
run: |
DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
# DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
echo "completed non compiled image"
pre-compiled:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535
- 550
flavor:
- aws
- azure
- generic
- nvidia
- oracle
ispr:
- ${{github.event_name == 'pull_request'}}
exclude:
- ispr: true
flavor: azure
- ispr: true
flavor: aws
- ispr: true
flavor: nvidia
- ispr: true
flavor: oracle
steps:
- uses: actions/checkout@v4
name: Check out code
- name: Calculate build vars
id: vars
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
echo "${REPO_FULL_NAME}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
# pre-compiled:
# runs-on: ubuntu-latest
# strategy:
# matrix:
# driver:
# - 535
# - 550
# flavor:
# - aws
# - azure
# - generic
# - nvidia
# - oracle
# ispr:
# - ${{github.event_name == 'pull_request'}}
# exclude:
# - ispr: true
# flavor: azure
# - ispr: true
# flavor: aws
# - ispr: true
# flavor: nvidia
# - ispr: true
# flavor: oracle
# steps:
# - uses: actions/checkout@v4
# name: Check out code
# - name: Calculate build vars
# id: vars
# run: |
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
# REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
# echo "${REPO_FULL_NAME}"
# echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV

GENERATE_ARTIFACTS="false"
if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
GENERATE_ARTIFACTS="false"
elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
GENERATE_ARTIFACTS="true"
elif [[ "${{ github.event_name }}" == "push" ]]; then
GENERATE_ARTIFACTS="true"
fi
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
# GENERATE_ARTIFACTS="false"
# if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
# GENERATE_ARTIFACTS="false"
# elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
# GENERATE_ARTIFACTS="true"
# elif [[ "${{ github.event_name }}" == "push" ]]; then
# GENERATE_ARTIFACTS="true"
# fi
# echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
# echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build base image and get kernel version
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: jammy
run: |
make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
# - name: Set up QEMU
# uses: docker/setup-qemu-action@v3
# - name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v3
# with:
# registry: ghcr.io
# username: ${{ github.actor }}
# password: ${{ secrets.GITHUB_TOKEN }}
# - name: Build base image and get kernel version
# env:
# IMAGE_NAME: ghcr.io/nvidia/driver
# VERSION: ${COMMIT_SHORT_SHA}
# BASE_TARGET: jammy
# run: |
# make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}

trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_ubuntu22.04
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
# trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
# docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# # try 3 times every 10 seconds to get the file, if success exit the loop
# for i in {1..3}; do
# docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
# sleep 10
# done
# - name: Build image
# env:
# IMAGE_NAME: ghcr.io/nvidia/driver
# VERSION: ${COMMIT_SHORT_SHA}
# PRECOMPILED: "true"
# DIST: signed_ubuntu22.04
# run: |
# source kernel_version.txt && \
# make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
8 changes: 8 additions & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#! /bin/bash
# Test case: load the shared test definitions, then hand over to the
# end-to-end NVIDIA driver script that lives in tests/scripts.

# Absolute path of the scripts directory, resolved relative to this file.
SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../scripts" && pwd)"

# Pull the shared definitions into this shell.
. "${SCRIPTS_DIR}/.definitions.sh"

# Run the end-to-end cycle; its exit status becomes this script's status.
"${SCRIPTS_DIR}/end-to-end-nvidia-driver.sh"
17 changes: 17 additions & 0 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# CI entry point for the end-to-end tests.
#
# Usage: ci-run-e2e.sh TEST_CASE TARGET_DRIVER_VERSION
#
# Exports both arguments into the environment and then runs tests/local.sh,
# located relative to the current working directory.

# -x traces every command, -e aborts on the first failing command.
set -xe

if [[ $# -ne 2 ]]; then
    # Usage errors go to stderr so they are not mixed into regular output.
    echo "TEST_CASE TARGET_DRIVER_VERSION are required" >&2
    exit 1
fi

# Quoted so values containing whitespace or glob characters stay intact.
export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"

echo "SHIVA $TARGET_DRIVER_VERSION"

TEST_DIR="$(pwd)/tests"

# Quoted so a working directory containing spaces does not split the path.
"${TEST_DIR}/local.sh"
26 changes: 26 additions & 0 deletions tests/holodeck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Holodeck environment used for the end-to-end tests.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file - confirm it against the Holodeck v1alpha1 Environment schema.
apiVersion: holodeck.nvidia.com/v1alpha1
kind: Environment
metadata:
  # NOTE(review): HOLODECK_NAME looks like a placeholder substituted before
  # use - confirm.
  name: HOLODECK_NAME
  description: 'end-to-end test infrastructure'
spec:
  provider: aws
  auth:
    keyName: cnt-ci
    # NOTE(review): HOLODECK_PRIVATE_KEY looks like a placeholder - confirm.
    privateKey: HOLODECK_PRIVATE_KEY
  instance:
    type: g4dn.xlarge
    region: us-west-1
    # 0.0.0.0/0 covers every IPv4 address.
    ingressIpRanges:
      - 0.0.0.0/0
    image:
      architecture: amd64
      imageId: ami-0ce2cb35386fc22e9
  containerRuntime:
    install: true
    name: containerd
  kubernetes:
    install: true
    installer: kubeadm
    version: v1.28.5
    crictlVersion: v1.28.0
27 changes: 27 additions & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#! /bin/bash
#
# Runs a test case on the remote instance.
#
# Usage: local.sh [TEST_CASE]
#
# TEST_CASE is a path relative to the project root. It is taken from the
# first argument when one is given, otherwise from the TEST_CASE environment
# variable.

if [[ $# -ge 1 ]]; then
    TEST_CASE=${1}
fi

# Exported before .definitions.sh is sourced, because that file only assigns
# a default to PROJECT when it is not already set.
export PROJECT="gpu-driver-container"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# Validate the test case only now: PROJECT_DIR is defined by .definitions.sh,
# so checking earlier would test a path built from an unset variable and,
# with errexit not yet enabled, a failed check would go unnoticed.
if [[ -z "${TEST_CASE}" ]]; then
    echo "TEST_CASE is required (argument or environment variable)" >&2
    exit 1
fi
if [[ ! -f "${PROJECT_DIR}/${TEST_CASE}" ]]; then
    echo "test case not found: ${PROJECT_DIR}/${TEST_CASE}" >&2
    exit 1
fi

# NOTE(review): `remote` is presumably defined by .local.sh - confirm.
source "${SCRIPT_DIR}/.local.sh"

# Sync the project folder to the remote
"${SCRIPT_DIR}/push.sh"

# We trigger the installation of prerequisites on the remote instance
remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh

# We trigger the specified test case on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
remote \
    PROJECT="${PROJECT}" \
    TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
    ${TEST_CASE}
21 changes: 21 additions & 0 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Shared definitions for the test scripts: enables errexit, resolves the
# directory layout and fills in defaults for variables that are unset or empty.
set -e

# Turn on command tracing when DEBUG is non-empty.
if [[ -n "${DEBUG}" ]]; then
    set -x
fi

# Directory layout, resolved to absolute paths starting from this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
PROJECT_DIR="$(cd "${TEST_DIR}/.." && pwd)"
CASES_DIR="$(cd "${TEST_DIR}/cases" && pwd)"

# Defaults: a value already present in the environment is kept.
: "${HELM:=helm}"
# The project name falls back to the name of the project directory.
: "${PROJECT:=$(basename "${PROJECT_DIR}")}"

: "${TEST_NAMESPACE:=test-operator}"

: "${PRIVATE_REGISTRY:=ghcr.io}"

: "${HELM_NVIDIA_REPO:=https://helm.ngc.nvidia.com/nvidia}"

: "${TARGET_DRIVER_VERSION:=550.90.07}"
Loading

0 comments on commit 6c5a1e9

Please sign in to comment.