Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft aws driver kernel validation #85

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI-draft

# Token permissions for this workflow: read access to repository contents,
# and id-token: write (the permission GitHub requires before a job can
# request an OIDC token).
permissions:
  contents: read
  id-token: write

on:
  # Fires when a run of the workflow named "image" completes for main.
  workflow_run:
    workflows:
      - image
    types: [completed]
    branches: [main]
  # Only the draft validation branch is enabled while this workflow is
  # being developed; the regular branches are left commented out.
  pull_request:
    types: [opened, synchronize]
    branches:
      # - main
      # - release-*
      # - drivervalidation
      - draftawsvalidation
  push:
    branches:
      # - main
      # - release-*
      # - drivervalidation
      - draftawsvalidation

jobs:
  # Runs once per driver version in the matrix. Only the AWS credential
  # check is active; the Holodeck-based end-to-end steps are still
  # commented out.
  e2e-tests-nvidiadriver:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so the versions are always read as strings.
        driver:
          - "535.183.06"
          - "550.90.07"

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          # This must be the IAM secret access key. AWS_SSH_KEY is the SSH
          # private key (see the commented-out Holodeck steps below) and is
          # not a valid AWS credential.
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          role-session-name: github-actions-session
          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
          aws-region: "us-west-1"
          role-duration-seconds: 1800

      # Smoke test: lists the EC2 key pair names visible to the configured
      # credentials. The echo lines only delimit the output in the log.
      - name: check aws env
        run: |
          echo "SHIVA kumar"
          aws ec2 describe-key-pairs --query 'KeyPairs[*].KeyName' --output text
          echo "SHIVA kumar"

      # - name: Set up Holodeck
      #   uses: NVIDIA/holodeck@main
      #   env:
      #     AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      #     AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      #     AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      #     AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
      #   with:
      #     aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
      #     aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      #     aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
      #     holodeck_config: "tests/holodeck.yaml"

      # - name: Get public dns name
      #   id: get_public_dns_name
      #   uses: mikefarah/yq@master
      #   with:
      #     cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

      # - name: Set and Calculate test vars
      #   run: |
      #     echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
      #     echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
      #     # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
      #     echo "COMMIT_SHORT_SHA=f0d936d3" >> $GITHUB_ENV

      # - name: Validate gpu driver
      #   env:
      #     TEST_CASE: "./tests/cases/nvidia-driver.sh"
      #   run: |
      #     sudo chmod 644 ${{ github.workspace }}/.cache/key
      #     echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
      #     ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
169 changes: 86 additions & 83 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ on:
- opened
- synchronize
branches:
- main
- release-*
# - main
# - release-*
- draftawsvalidation
push:
branches:
- main
- release-*
# - main
# - release-*
- draftawsvalidation

jobs:
image:
Expand Down Expand Up @@ -98,86 +100,87 @@ jobs:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
run: |
DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
# DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }}
echo "completed non compiled image"

pre-compiled:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535
- 550
flavor:
- aws
- azure
- generic
- nvidia
- oracle
ispr:
- ${{github.event_name == 'pull_request'}}
exclude:
- ispr: true
flavor: azure
- ispr: true
flavor: aws
- ispr: true
flavor: nvidia
- ispr: true
flavor: oracle
steps:
- uses: actions/checkout@v4
name: Check out code
- name: Calculate build vars
id: vars
run: |
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
echo "${REPO_FULL_NAME}"
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
# pre-compiled:
# runs-on: ubuntu-latest
# strategy:
# matrix:
# driver:
# - 535
# - 550
# flavor:
# - aws
# - azure
# - generic
# - nvidia
# - oracle
# ispr:
# - ${{github.event_name == 'pull_request'}}
# exclude:
# - ispr: true
# flavor: azure
# - ispr: true
# flavor: aws
# - ispr: true
# flavor: nvidia
# - ispr: true
# flavor: oracle
# steps:
# - uses: actions/checkout@v4
# name: Check out code
# - name: Calculate build vars
# id: vars
# run: |
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
# REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
# echo "${REPO_FULL_NAME}"
# echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV

GENERATE_ARTIFACTS="false"
if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
GENERATE_ARTIFACTS="false"
elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
GENERATE_ARTIFACTS="true"
elif [[ "${{ github.event_name }}" == "push" ]]; then
GENERATE_ARTIFACTS="true"
fi
echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
# GENERATE_ARTIFACTS="false"
# if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then
# GENERATE_ARTIFACTS="false"
# elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
# GENERATE_ARTIFACTS="true"
# elif [[ "${{ github.event_name }}" == "push" ]]; then
# GENERATE_ARTIFACTS="true"
# fi
# echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
# echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build base image and get kernel version
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: jammy
run: |
make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
# - name: Set up QEMU
# uses: docker/setup-qemu-action@v3
# - name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# - name: Login to GitHub Container Registry
# uses: docker/login-action@v3
# with:
# registry: ghcr.io
# username: ${{ github.actor }}
# password: ${{ secrets.GITHUB_TOKEN }}
# - name: Build base image and get kernel version
# env:
# IMAGE_NAME: ghcr.io/nvidia/driver
# VERSION: ${COMMIT_SHORT_SHA}
# BASE_TARGET: jammy
# run: |
# make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}

trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
sleep 10
done
- name: Build image
env:
IMAGE_NAME: ghcr.io/nvidia/driver
VERSION: ${COMMIT_SHORT_SHA}
PRECOMPILED: "true"
DIST: signed_ubuntu22.04
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
# trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
# docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
# # try 3 times every 10 seconds to get the file, if success exit the loop
# for i in {1..3}; do
# docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
# sleep 10
# done
# - name: Build image
# env:
# IMAGE_NAME: ghcr.io/nvidia/driver
# VERSION: ${COMMIT_SHORT_SHA}
# PRECOMPILED: "true"
# DIST: signed_ubuntu22.04
# run: |
# source kernel_version.txt && \
# make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
8 changes: 8 additions & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#! /bin/bash
# Test case: loads the shared test definitions and then runs the
# end-to-end NVIDIA driver script.

# The helper scripts live in ../scripts relative to this file.
case_dir="$( dirname "${BASH_SOURCE[0]}" )"
SCRIPTS_DIR="$( cd "${case_dir}/../scripts" && pwd )"

. "${SCRIPTS_DIR}/.definitions.sh"

# Run an end-to-end test cycle
"${SCRIPTS_DIR}/end-to-end-nvidia-driver.sh"
17 changes: 17 additions & 0 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# CI entry point for the end-to-end tests.
#
# Usage: ci-run-e2e.sh TEST_CASE TARGET_DRIVER_VERSION
#
# Exports both arguments into the environment and then runs local.sh,
# which sits next to this script.

set -xe

if [[ $# -ne 2 ]]; then
    # Usage errors belong on stderr so they are not mistaken for output.
    echo "TEST_CASE TARGET_DRIVER_VERSION are required" >&2
    exit 1
fi

export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"

echo "Target driver version: ${TARGET_DRIVER_VERSION}"

# Resolve the tests directory from this script's own location rather than
# the caller's working directory, so the script also works when it is not
# started from the project root.
TEST_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

"${TEST_DIR}/local.sh"
26 changes: 26 additions & 0 deletions tests/holodeck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Holodeck environment used by the end-to-end tests.
# NOTE(review): HOLODECK_NAME and HOLODECK_PRIVATE_KEY look like
# placeholders substituted at run time - confirm against the Holodeck action.
apiVersion: holodeck.nvidia.com/v1alpha1
kind: Environment
metadata:
  name: HOLODECK_NAME
  description: "end-to-end test infrastructure"
spec:
  provider: aws
  auth:
    keyName: cnt-ci
    privateKey: HOLODECK_PRIVATE_KEY
  instance:
    type: g4dn.xlarge
    region: us-west-1
    # NOTE(review): 0.0.0.0/0 allows ingress from any address - confirm
    # this is intended for the CI instance.
    ingressIpRanges:
      - 0.0.0.0/0
    image:
      architecture: amd64
      imageId: ami-0ce2cb35386fc22e9
  containerRuntime:
    install: true
    name: containerd
  kubernetes:
    install: true
    installer: kubeadm
    version: v1.28.5
    crictlVersion: v1.28.0
27 changes: 27 additions & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#! /bin/bash
# Pushes the project to the remote instance, installs the prerequisites
# there and runs one test case remotely.
#
# Usage: local.sh [TEST_CASE]
#   TEST_CASE  Test case path relative to the project root. When the
#              argument is omitted, the TEST_CASE environment variable is
#              used instead.

if [[ $# -ge 1 ]]; then
    TEST_CASE=${1}
fi

export PROJECT="gpu-driver-container"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
# .definitions.sh enables `set -e` and defines PROJECT_DIR, so it has to be
# sourced before the test case path can be validated.
source "${SCRIPT_DIR}/.definitions.sh"

# Fail early with a clear message instead of silently continuing with a
# missing or mistyped test case.
if [[ -z "${TEST_CASE}" ]]; then
    echo "TEST_CASE must be passed as the first argument or set in the environment" >&2
    exit 1
fi
if [[ ! -f "${PROJECT_DIR}/${TEST_CASE}" ]]; then
    echo "Test case not found: ${PROJECT_DIR}/${TEST_CASE}" >&2
    exit 1
fi

source "${SCRIPT_DIR}/.local.sh"

# Sync the project folder to the remote
"${SCRIPT_DIR}/push.sh"

# We trigger the installation of prerequisites on the remote instance
remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh

# We trigger the specified test case on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
remote \
    PROJECT="${PROJECT}" \
    TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
    "${TEST_CASE}"
21 changes: 21 additions & 0 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Shared definitions for the test scripts: enables exit-on-error, resolves
# the script/test/project/cases directories from this file's location and
# fills in defaults for variables the caller has not set.
set -e

# Turn on command tracing whenever DEBUG is non-empty.
if [[ -n "${DEBUG}" ]]; then
    set -x
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
PROJECT_DIR="$(cd "${TEST_DIR}/.." && pwd)"
CASES_DIR="$(cd "${TEST_DIR}/cases" && pwd)"

# Defaults: each variable is assigned only when it is unset or empty.
: "${HELM:=helm}"
: "${PROJECT:=$(basename "${PROJECT_DIR}")}"
: "${TEST_NAMESPACE:=test-operator}"
: "${PRIVATE_REGISTRY:=ghcr.io}"
: "${HELM_NVIDIA_REPO:=https://helm.ngc.nvidia.com/nvidia}"
: "${TARGET_DRIVER_VERSION:=550.90.07}"
Loading