From 18e9cd8e7834aec1606f9f1d5a960689cf171733 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Mon, 16 Sep 2024 10:29:25 -0700 Subject: [PATCH] add ubuntu24.04 driver container Signed-off-by: Tariq Ibrahim --- .common-ci.yml | 19 + .github/workflows/image.yaml | 4 + .gitlab-ci.yml | 14 + .nvidia-ci.yml | 28 + Makefile | 22 +- multi-arch.mk | 1 + ubuntu24.04/Dockerfile | 109 ++++ ubuntu24.04/README.md | 3 + ubuntu24.04/drivers/README.md | 1 + ubuntu24.04/empty | 0 ubuntu24.04/install.sh | 65 +++ ubuntu24.04/nvidia-driver | 732 ++++++++++++++++++++++++++ ubuntu24.04/precompiled/Dockerfile | 78 +++ ubuntu24.04/precompiled/nvidia-driver | 350 ++++++++++++ 14 files changed, 1425 insertions(+), 1 deletion(-) create mode 100644 ubuntu24.04/Dockerfile create mode 100644 ubuntu24.04/README.md create mode 100644 ubuntu24.04/drivers/README.md create mode 100644 ubuntu24.04/empty create mode 100755 ubuntu24.04/install.sh create mode 100755 ubuntu24.04/nvidia-driver create mode 100644 ubuntu24.04/precompiled/Dockerfile create mode 100755 ubuntu24.04/precompiled/nvidia-driver diff --git a/.common-ci.yml b/.common-ci.yml index c37b30c2..86f91be2 100644 --- a/.common-ci.yml +++ b/.common-ci.yml @@ -75,6 +75,12 @@ trigger-pipeline: matrix: - DRIVER_VERSION: [535.216.03, 550.127.08] +# Define the driver versions for jobs that can be run in parallel +.driver-versions-ubuntu24.04: + parallel: + matrix: + - DRIVER_VERSION: [550.127.08] + # Define the matrix of precompiled jobs that can be run in parallel for ubuntu22.04 .driver-versions-precompiled-ubuntu22.04: parallel: @@ -93,6 +99,10 @@ trigger-pipeline: DIST: ubuntu22.04 CVE_UPDATES: "openssl" +.dist-ubuntu24.04: + variables: + DIST: ubuntu24.04 + .dist-rhel8: variables: DIST: rhel8 @@ -199,6 +209,15 @@ trigger-pipeline: OUT_REGISTRY: "${CI_REGISTRY}" OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/driver" +.release:staging-ubuntu24.04: + extends: + - .release-ubuntu24.04 + variables: + OUT_REGISTRY_USER: "${CI_REGISTRY_USER}" + OUT_REGISTRY_TOKEN: "${CI_REGISTRY_PASSWORD}" + OUT_REGISTRY: "${CI_REGISTRY}" + OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/driver" + .release:staging-rhel9: extends: - .release-rhel9 diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index 1ac083b7..b9ed5c7b 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -39,6 +39,7 @@ jobs: dist: - ubuntu20.04 - ubuntu22.04 + - ubuntu24.04 - rhel8 ispr: - ${{github.event_name == 'pull_request'}} @@ -49,6 +50,9 @@ jobs: - ispr: true dist: ubuntu20.04 driver: 550.127.08 + - ispr: true + dist: ubuntu24.04 + driver: 535.216.03 fail-fast: false steps: - uses: actions/checkout@v4 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6cc89a41..d05de9e0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -49,6 +49,15 @@ include: rules: - if: $CI_PIPELINE_SOURCE != "schedule" +# Define the image build targets +.image-build-ubuntu24.04: + # Perform for each DRIVER_VERSION + extends: + - .driver-versions-ubuntu24.04 + - .image-build-generic + rules: + - if: $CI_PIPELINE_SOURCE != "schedule" + # Define the image build targets .image-build-rhel9: # Perform for each DRIVER_VERSION @@ -69,6 +78,11 @@ image-ubuntu22.04: - .image-build-ubuntu22.04 - .dist-ubuntu22.04 +image-ubuntu24.04: + extends: + - .image-build-ubuntu24.04 + - .dist-ubuntu24.04 + image-rhel8: extends: - .image-build diff --git a/.nvidia-ci.yml b/.nvidia-ci.yml index 2e0fa02c..e1094d9d 100644 --- a/.nvidia-ci.yml +++ b/.nvidia-ci.yml @@ -184,6 +184,18 @@ image-rhel8: - if: $CI_PIPELINE_SOURCE == "merge_request_event" - !reference [.pipeline-trigger-rules, rules] +.scan-ubuntu24.04: + # Repeat for each DRIVER_VERSION + extends: + - .driver-versions-ubuntu24.04 + - .scan-generic + rules: + - !reference [.scan-rules-common, rules] + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + - !reference [.pipeline-trigger-rules, rules] + .scan-precompiled-ubuntu22.04: variables: DIST: signed_ubuntu22.04 @@ -229,6 +241,22 @@ scan-ubuntu22.04-arm64: needs: - image-ubuntu22.04 +scan-ubuntu24.04-amd64: + extends: + - .scan-ubuntu24.04 + - .dist-ubuntu24.04 + - .platform-amd64 + needs: + - image-ubuntu24.04 + +scan-ubuntu24.04-arm64: + extends: + - .scan-ubuntu24.04 + - .dist-ubuntu24.04 + - .platform-arm64 + needs: + - image-ubuntu24.04 + scan-precompiled-ubuntu22.04-amd64: variables: PLATFORM: linux/amd64 diff --git a/Makefile b/Makefile index f145be68..7b338960 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ OUT_IMAGE_TAG = $(OUT_IMAGE_VERSION)-$(OUT_DIST) OUT_IMAGE = $(OUT_IMAGE_NAME):$(OUT_IMAGE_TAG) ##### Public rules ##### -DISTRIBUTIONS := ubuntu18.04 ubuntu20.04 ubuntu22.04 signed_ubuntu20.04 signed_ubuntu22.04 rhel8 rhel9 flatcar fedora36 sles15.3 precompiled_rhcos +DISTRIBUTIONS := ubuntu18.04 ubuntu20.04 ubuntu22.04 ubuntu24.04 signed_ubuntu20.04 signed_ubuntu22.04 signed_ubuntu24.04 rhel8 rhel9 flatcar fedora36 sles15.3 precompiled_rhcos PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) BASE_FROM := jammy focal PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) @@ -92,6 +92,10 @@ pull-signed_ubuntu22.04%: DIST = ubuntu22.04 pull-signed_ubuntu22.04%: DRIVER_TAG = $(DRIVER_BRANCH) pull-signed_ubuntu22.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +pull-signed_ubuntu24.04%: DIST = ubuntu24.04 +pull-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) +pull-signed_ubuntu24.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + PLATFORM ?= linux/amd64 $(DRIVER_PULL_TARGETS): pull-%: $(DOCKER) pull "--platform=$(PLATFORM)" "$(IMAGE)" @@ -109,6 +113,10 @@ archive-signed_ubuntu22.04%: DIST = ubuntu22.04 archive-signed_ubuntu22.04%: DRIVER_TAG = $(DRIVER_BRANCH) archive-signed_ubuntu22.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +archive-signed_ubuntu24.04%: DIST = ubuntu24.04 +archive-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) +archive-signed_ubuntu24.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + $(DRIVER_ARCHIVE_TARGETS): archive-%: $(DOCKER) save "$(IMAGE)" -o "archive.tar" @@ -130,6 +138,11 @@ push-signed_ubuntu22.04%: DRIVER_TAG = $(DRIVER_BRANCH) push-signed_ubuntu22.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) push-signed_ubuntu22.04%: OUT_IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +push-signed_ubuntu24.04%: DIST = ubuntu24.04 +push-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) +push-signed_ubuntu24.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +push-signed_ubuntu24.04%: OUT_IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + # $(DRIVER_BUILD_TARGETS) is in the form of build-$(DIST)-$(DRIVER_VERSION) # Parse the target to set the required variables. build-%: DIST = $(word 2,$(subst -, ,$@)) @@ -176,6 +189,13 @@ build-signed_ubuntu22.04%: DRIVER_TAG = $(DRIVER_BRANCH) build-signed_ubuntu22.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) build-signed_ubuntu22.04%: DOCKER_BUILD_ARGS = --build-arg KERNEL_VERSION="$(KERNEL_VERSION)" +# ubuntu24.04 Precompiled Driver +build-signed_ubuntu24.04%: DIST = ubuntu24.04 +build-signed_ubuntu24.04%: SUBDIR = ubuntu24.04/precompiled +build-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) +build-signed_ubuntu24.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +build-signed_ubuntu24.04%: DOCKER_BUILD_ARGS = --build-arg KERNEL_VERSION="$(KERNEL_VERSION)" + # base is an image used to poll Canonical for the latest kernel version build-base-%: DOCKERFILE = $(CURDIR)/base/Dockerfile build-base-%: TARGET = $(word 3,$(subst -, ,$@)) diff --git a/multi-arch.mk b/multi-arch.mk index 72923ac5..62668d33 100644 --- a/multi-arch.mk +++ b/multi-arch.mk @@ -27,4 +27,5 @@ $(DRIVER_PUSH_TARGETS): push-%: build-ubuntu18.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-signed_ubuntu20.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-signed_ubuntu22.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 +build-signed_ubuntu24.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-sles%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 diff --git a/ubuntu24.04/Dockerfile b/ubuntu24.04/Dockerfile new file mode 100644 index 00000000..36f209a0 --- /dev/null +++ b/ubuntu24.04/Dockerfile @@ -0,0 +1,109 @@ +FROM nvcr.io/nvidia/cuda:12.6.2-base-ubuntu24.04 as build + +ARG TARGETARCH +ARG GOLANG_VERSION + +SHELL ["/bin/bash", "-c"] + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +# Remove cuda repository to avoid GPG errors +RUN rm -f /etc/apt/sources.list.d/cuda* + +RUN apt-get update && apt-get install -y --no-install-recommends \ + apt-utils \ + build-essential \ + ca-certificates \ + curl \ + git && \ + rm -rf /var/lib/apt/lists/* + + + +# download appropriate binary based on the target architecture for multi-arch builds +RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \ + curl https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \ + | tar -C /usr/local -xz + +ENV PATH /usr/local/go/bin:$PATH + +WORKDIR /work + +RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \ + cd driver/vgpu/src && \ + go build -o vgpu-util && \ + mv vgpu-util /work + +FROM nvcr.io/nvidia/cuda:12.6.2-base-ubuntu24.04 + +SHELL ["/bin/bash", "-c"] + +ARG BASE_URL=https://us.download.nvidia.com/tesla +ARG TARGETARCH +ENV TARGETARCH=$TARGETARCH +ARG DRIVER_VERSION +ENV DRIVER_VERSION=$DRIVER_VERSION +ENV DEBIAN_FRONTEND=noninteractive + +# Arg to indicate if driver type is either of passthrough(baremetal) or vgpu +ARG DRIVER_TYPE=passthrough +ENV DRIVER_TYPE=$DRIVER_TYPE +ARG DRIVER_BRANCH=550 +ENV DRIVER_BRANCH=$DRIVER_BRANCH +ARG VGPU_LICENSE_SERVER_TYPE=NLS +ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE +# Enable vGPU version compability check by default +ARG DISABLE_VGPU_VERSION_CHECK=true +ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK +ENV NVIDIA_VISIBLE_DEVICES=void + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN echo "TARGETARCH=$TARGETARCH" + +ADD install.sh /tmp + +# Fetch GPG keys for CUDA repo +RUN apt-key del 7fa2af80 && OS_ARCH=${TARGETARCH/amd64/x86_64} && OS_ARCH=${OS_ARCH/arm64/sbsa} && \ + apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${OS_ARCH}/3bf863cc.pub" + +RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \ + curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \ + chmod +x /usr/local/bin/donkey + +COPY nvidia-driver /usr/local/bin + +COPY --from=build /work/vgpu-util /usr/local/bin + +ADD drivers drivers/ + +# Fetch the installer automatically for passthrough/baremetal types +RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \ + cd drivers && \ + /tmp/install.sh download_installer; fi + +# Fabric manager packages are not available for arm64 +RUN if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$TARGETARCH" != "arm64" ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends nvidia-fabricmanager-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 \ + libnvidia-nscq-${DRIVER_BRANCH}=${DRIVER_VERSION}-1; fi + +WORKDIR /drivers + +ARG PUBLIC_KEY=empty +COPY ${PUBLIC_KEY} kernel/pubkey.x509 + +# Install / upgrade packages here that are required to resolve CVEs +ARG CVE_UPDATES +RUN if [ -n "${CVE_UPDATES}" ]; then \ + apt-get update && apt-get upgrade -y ${CVE_UPDATES} && \ + rm -rf /var/lib/apt/lists/*; \ + fi + +# Remove cuda repository to avoid GPG errors +RUN rm -f /etc/apt/sources.list.d/cuda* + +# Add NGC DL license from the CUDA image +RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE + +ENTRYPOINT ["nvidia-driver", "init"] diff --git a/ubuntu24.04/README.md b/ubuntu24.04/README.md new file mode 100644 index 00000000..2527f9ff --- /dev/null +++ b/ubuntu24.04/README.md @@ -0,0 +1,3 @@ +# Ubuntu 20.04 [![build status](https://gitlab.com/nvidia/driver/badges/master/build.svg)](https://gitlab.com/nvidia/driver/commits/master) + +See https://github.com/NVIDIA/nvidia-docker/wiki/Driver-containers-(Beta) diff --git a/ubuntu24.04/drivers/README.md b/ubuntu24.04/drivers/README.md new file mode 100644 index 00000000..ddc27b5c --- /dev/null +++ b/ubuntu24.04/drivers/README.md @@ -0,0 +1 @@ +# Folder for downloading vGPU drivers and dependent metadata files \ No newline at end of file diff --git a/ubuntu24.04/empty b/ubuntu24.04/empty new file mode 100644 index 00000000..e69de29b diff --git a/ubuntu24.04/install.sh b/ubuntu24.04/install.sh new file mode 100755 index 00000000..8b36d4bd --- /dev/null +++ b/ubuntu24.04/install.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +set -eu + +download_installer () { + DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} && curl -fSsl -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run && \ + chmod +x NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run; +} + +dep_install () { + if [ "$TARGETARCH" = "amd64" ]; then + dpkg --add-architecture i386 && \ + apt-get update && apt-get install -y --no-install-recommends \ + apt-utils \ + build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* + elif [ "$TARGETARCH" = "arm64" ]; then + dpkg --add-architecture arm64 && \ + apt-get update && apt-get install -y \ + build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev && \ + rm -rf /var/lib/apt/lists/* + fi +} + +repo_setup () { + if [ "$TARGETARCH" = "amd64" ]; then + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ noble main universe" > /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ noble-updates main universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ noble-security main universe" >> /etc/apt/sources.list && \ + usermod -o -u 0 -g 0 _apt + elif [ "$TARGETARCH" = "arm64" ]; then + echo "deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble main universe" > /etc/apt/sources.list && \ + echo "deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble-updates main universe" >> /etc/apt/sources.list && \ + echo "deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble-security main universe" >> /etc/apt/sources.list && \ + usermod -o -u 0 -g 0 _apt + else + echo "TARGETARCH doesn't match a known arch target" + exit 1 + fi +} + +if [ "$1" = "reposetup" ]; then + repo_setup +elif [ "$1" = "depinstall" ]; then + dep_install +elif [ "$1" = "download_installer" ]; then + download_installer +else + echo "Unknown function: $1" + exit 1 +fi + diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver new file mode 100755 index 00000000..aedeeea2 --- /dev/null +++ b/ubuntu24.04/nvidia-driver @@ -0,0 +1,732 @@ +#! /bin/bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + +set -eu + +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} +KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver +NUM_VGPU_DEVICES=0 +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +NVIDIA_MODULE_PARAMS=() +NVIDIA_UVM_MODULE_PARAMS=() +NVIDIA_MODESET_MODULE_PARAMS=() +NVIDIA_PEERMEM_MODULE_PARAMS=() +TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} + +OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false} +[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel + +export DEBIAN_FRONTEND=noninteractive + +DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} + +echo "DRIVER_ARCH is $DRIVER_ARCH" + +_update_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Updating the package cache..." + apt-get -qq update + fi +} + +_cleanup_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Cleaning up the package cache..." + rm -rf /var/lib/apt/lists/* + fi +} + +_update_ca_certificates() { + if [ ! -z "$(ls -A /usr/local/share/ca-certificates)" ]; then + update-ca-certificates + fi +} + +# Resolve the kernel version to the form major.minor.patch-revision-flavor where flavor defaults to generic. +_resolve_kernel_version() { + local version=$(apt-cache show "linux-headers-${KERNEL_VERSION}" 2> /dev/null | \ + sed -nE 's/^Version:\s+(([0-9]+\.){2}[0-9]+)[-.]([0-9]+).*/\1-\3/p' | head -1) + local flavor=$(echo ${KERNEL_VERSION} | sed 's/[^a-z]*//' | grep -Ev "^generic|virtual") + + echo "Resolving Linux kernel version..." + if [ -z "${version}" ]; then + echo "Could not resolve Linux kernel version" >&2 + return 1 + fi + + KERNEL_VERSION="${version}-${flavor:-generic}" + echo "Proceeding with Linux kernel version ${KERNEL_VERSION}" + return 0 +} + +# Install the kernel modules header/builtin/order files and generate the kernel version string. +_install_prerequisites() ( + local tmp_dir=$(mktemp -d) + + trap "rm -rf ${tmp_dir}" EXIT + cd ${tmp_dir} + + rm -rf /lib/modules/${KERNEL_VERSION} + mkdir -p /lib/modules/${KERNEL_VERSION}/proc + + echo "Installing Linux kernel headers..." + apt-get -qq install --no-install-recommends linux-headers-${KERNEL_VERSION} > /dev/null + + echo "Installing Linux kernel module files..." + apt-get -qq download linux-image-${KERNEL_VERSION} && dpkg -x linux-image*.deb . + { apt-get -qq download linux-modules-${KERNEL_VERSION} && dpkg -x linux-modules*.deb . || true; } 2> /dev/null + mv lib/modules/${KERNEL_VERSION}/modules.* /lib/modules/${KERNEL_VERSION} + mv lib/modules/${KERNEL_VERSION}/kernel /lib/modules/${KERNEL_VERSION} + depmod ${KERNEL_VERSION} + + echo "Generating Linux kernel version string..." + + ls -1 boot/vmlinuz-* | sed 's/\/boot\/vmlinuz-//g' - > version + if [ -z "$(&2 + return 1 + fi + mv version /lib/modules/${KERNEL_VERSION}/proc +) + +# Cleanup the prerequisites installed above. +_remove_prerequisites() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + apt-get -qq purge linux-headers-${KERNEL_VERSION} > /dev/null + # TODO remove module files not matching an existing driver package. + fi +} + +# Check if the kernel version requires a new precompiled driver packages. +_kernel_requires_package() { + local proc_mount_arg="" + + echo "Checking NVIDIA driver packages..." + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} + + # proc_mount_arg needs to be set, to do the module match check below + if [ -f /lib/modules/${KERNEL_VERSION}/proc/version ]; then + proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc" + fi + for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do + if ! ../mkprecompiled --match ${pkg_name} ${proc_mount_arg} > /dev/null; then + echo "Found NVIDIA driver package ${pkg_name##*/}" + return 1 + fi + done + return 0 +} + +# Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer. +_create_driver_package() ( + local pkg_name="nvidia-modules-${KERNEL_VERSION%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}" + local nvidia_sign_args="" + local nvidia_modeset_sign_args="" + local nvidia_uvm_sign_args="" + + trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT + + echo "Compiling NVIDIA driver kernel modules..." + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} + + if _gpu_direct_rdma_enabled; then + ln -s /run/mellanox/drivers/usr/src/ofa_kernel /usr/src/ + # if arch directory exists(MOFED >=5.5) then create a symlink as expected by GPU driver installer + # This is required as currently GPU driver installer doesn't expect headers in x86_64 folder, but only in either default or kernel-version folder. + # ls -ltr /usr/src/ofa_kernel/ + # lrwxrwxrwx 1 root root 36 Dec 8 20:10 default -> /etc/alternatives/ofa_kernel_headers + # drwxr-xr-x 4 root root 4096 Dec 8 20:14 x86_64 + # lrwxrwxrwx 1 root root 44 Dec 9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/ + if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/$DRIVER_ARCH/`uname -r` ]]; then + if [[ ! -e /usr/src/ofa_kernel/`uname -r` ]]; then + ln -s /run/mellanox/drivers/usr/src/ofa_kernel/$DRIVER_ARCH/`uname -r` /usr/src/ofa_kernel/ + fi + fi + fi + + export IGNORE_CC_MISMATCH=1 + make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null + + echo "Relinking NVIDIA driver kernel modules..." + rm -f nvidia.ko nvidia-modeset.ko + ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary + ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary + + if [ -n "${PRIVATE_KEY}" ]; then + echo "Signing NVIDIA driver kernel modules..." + donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/linux-headers-${KERNEL_VERSION}/scripts && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \ + sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko" + nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign" + nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign" + nvidia_uvm_sign_args="--signed" + fi + + echo "Building NVIDIA driver package ${pkg_name}..." + ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \ + --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc \ + --driver-version ${DRIVER_VERSION} \ + --kernel-interface nv-linux.o \ + --linked-module-name nvidia.ko \ + --core-object-name nvidia/nv-kernel.o_binary \ + ${nvidia_sign_args} \ + --target-directory . \ + --kernel-interface nv-modeset-linux.o \ + --linked-module-name nvidia-modeset.ko \ + --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \ + ${nvidia_modeset_sign_args} \ + --target-directory . \ + --kernel-module nvidia-uvm.ko \ + ${nvidia_uvm_sign_args} \ + --target-directory . + mkdir -p precompiled + mv ${pkg_name} precompiled +) + +_assert_nvswitch_system() { + [ -d /proc/driver/nvidia-nvswitch/devices ] || return 1 + if [ -z "$(ls -A /proc/driver/nvidia-nvswitch/devices)" ]; then + return 1 + fi + return 0 +} + +# Check if mellanox devices are present +_mellanox_devices_present() { + devices_found=0 + for dev in /sys/bus/pci/devices/*; do + read vendor < $dev/vendor + if [ "$vendor" = "0x15b3" ]; then + echo "Mellanox device found at $(basename $dev)" + return 0 + fi + done + echo "No Mellanox devices were found..." + return 1 +} + +_gpu_direct_rdma_enabled() { + if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then + # check if mellanox cards are present + if _mellanox_devices_present; then + return 0 + fi + fi + return 1 +} + +# For each kernel module configuration file mounted into the container, +# parse the file contents and extract the custom module parameters that +# are to be passed as input to 'modprobe'. +# +# Assumptions: +# - Configuration files are named .conf (i.e. nvidia.conf, nvidia-uvm.conf). +# - Configuration files are mounted inside the container at /drivers. +# - Each line in the file contains at least one parameter, where parameters on the same line +# are space delimited. It is up to the user to properly format the file to ensure +# the correct set of parameters are passed to 'modprobe'. +_get_module_params() { + local base_path="/drivers" + # nvidia + if [ -f "${base_path}/nvidia.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia.conf" + echo "Module parameters provided for nvidia: ${NVIDIA_MODULE_PARAMS[@]}" + fi + # nvidia-uvm + if [ -f "${base_path}/nvidia-uvm.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_UVM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-uvm.conf" + echo "Module parameters provided for nvidia-uvm: ${NVIDIA_UVM_MODULE_PARAMS[@]}" + fi + # nvidia-modeset + if [ -f "${base_path}/nvidia-modeset.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODESET_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-modeset.conf" + echo "Module parameters provided for nvidia-modeset: ${NVIDIA_MODESET_MODULE_PARAMS[@]}" + fi + # nvidia-peermem + if [ -f "${base_path}/nvidia-peermem.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_PEERMEM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-peermem.conf" + echo "Module parameters provided for nvidia-peermem: ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + fi +} + +# Load the kernel modules and start persistenced. +_load_driver() { + echo "Parsing kernel module parameters..." + _get_module_params + + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local set_fw_path="true" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + for param in "${NVIDIA_MODULE_PARAMS[@]}"; do + if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then + set_fw_path="false" + fi + done + + if [[ "$set_fw_path" == "true" ]]; then + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration" + else + echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure firmware search path" + fi + fi + + echo "Loading ipmi and i2c_core kernel modules..." + modprobe -a i2c_core ipmi_msghandler ipmi_devintf + + echo "Loading NVIDIA driver kernel modules..." + set -o xtrace +o nounset + modprobe nvidia "${NVIDIA_MODULE_PARAMS[@]}" + modprobe nvidia-uvm "${NVIDIA_UVM_MODULE_PARAMS[@]}" + modprobe nvidia-modeset "${NVIDIA_MODESET_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + + if _gpu_direct_rdma_enabled; then + echo "Loading NVIDIA Peer Memory kernel module..." + set -o xtrace +o nounset + modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + fi + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + echo "Copying gridd.conf..." + cp /drivers/gridd.conf /etc/nvidia/gridd.conf + if [ "${VGPU_LICENSE_SERVER_TYPE}" = "NLS" ]; then + echo "Copying ClientConfigToken..." + mkdir -p /etc/nvidia/ClientConfigToken/ + cp /drivers/ClientConfigToken/* /etc/nvidia/ClientConfigToken/ + fi + + echo "Starting nvidia-gridd.." + LD_LIBRARY_PATH=/usr/lib/$DRIVER_ARCH-linux-gnu/nvidia/gridd nvidia-gridd + + # Start virtual topology daemon + _start_vgpu_topology_daemon + fi + + if _assert_nvswitch_system; then + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi + + return 0 +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_uvm_refs=0 + local nvidia_modeset_refs=0 + local nvidia_peermem_refs=0 + + echo "Stopping NVIDIA persistence daemon..." + if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then + local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA persistence daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-gridd/nvidia-gridd.pid ]; then + echo "Stopping NVIDIA grid daemon..." + local pid=$(< /var/run/nvidia-gridd/nvidia-gridd.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 10); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 10 ]; then + echo "Could not stop NVIDIA Grid daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then + echo "Stopping NVIDIA fabric manager daemon..." + local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA fabric manager daemon" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." + if [ -f /sys/module/nvidia_modeset/refcnt ]; then + nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) + rmmod_args+=("nvidia-modeset") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_uvm/refcnt ]; then + nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) + rmmod_args+=("nvidia-uvm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt) + rmmod_args+=("nvidia-peermem") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ] || [ ${nvidia_peermem_refs} -gt 0 ]; then + # run lsmod to debug module usage + lsmod | grep nvidia + echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 + return 1 + fi + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod ${rmmod_args[@]} + fi + return 0 +} + +# Link and install the kernel modules from a precompiled package using the nvidia-installer. +_install_driver() { + local install_args=() + + echo "Installing NVIDIA driver kernel modules..." + cd /usr/src/nvidia-${DRIVER_VERSION} + if [ -d /lib/modules/${KERNEL_VERSION}/kernel/drivers/video ]; then + rm -rf /lib/modules/${KERNEL_VERSION}/kernel/drivers/video + else + rm -rf /lib/modules/${KERNEL_VERSION}/video + fi + + if [ "${ACCEPT_LICENSE}" = "yes" ]; then + install_args+=("--accept-license") + fi + nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} +} + +# Mount the driver rootfs into the run directory with the exception of sysfs. +_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." + mount --make-runbindable /sys + mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + mount --rbind / ${RUN_DIR}/driver +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + umount -l -R ${RUN_DIR}/driver + fi +} + +# Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS). +_write_kernel_update_hook() { + if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then + return + fi + + echo "Writing kernel update hook..." + cat > ${KERNEL_UPDATE_HOOK} <<'EOF' +#!/bin/bash + +set -eu +trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR + +NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid) + +export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)" +nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1" +EOF + chmod +x ${KERNEL_UPDATE_HOOK} +} + +_shutdown() { + if _unload_driver; then + _unmount_rootfs + rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK} + return 0 + fi + return 1 +} + +_find_vgpu_driver_version() { + local count="" + local version="" + + if [ "${DISABLE_VGPU_VERSION_CHECK}" = "true" ]; then + echo "vgpu version compatibility check is disabled" + return 0 + fi + # check if vgpu devices are present + count=$(vgpu-util count) + if [ $? -ne 0 ]; then + echo "cannot find vgpu devices on host, pleae check /var/log/vgpu-util.log for more details..." + return 0 + fi + NUM_VGPU_DEVICES=$(echo "$count" | awk -F= '{print $2}') + if [ $NUM_VGPU_DEVICES -eq 0 ]; then + # no vgpu devices found, treat as passthrough + return 0 + fi + echo "found $NUM_VGPU_DEVICES vgpu devices on host" + + # find compatible guest driver using drive catalog + version=$(vgpu-util match -i /drivers -c /drivers/vgpuDriverCatalog.yaml) + if [ $? -ne 0 ]; then + echo "cannot find match for compatible vgpu driver from available list, please check /var/log/vgpu-util.log for more details..." + return 1 + fi + DRIVER_VERSION=$(echo "$version" | awk -F= '{print $2}') + echo "vgpu driver version selected: ${DRIVER_VERSION}" + return 0 +} + +_start_vgpu_topology_daemon() { + type nvidia-topologyd > /dev/null 2>&1 || return 0 + echo "Starting nvidia-topologyd.." + nvidia-topologyd +} + +init() { + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + _find_vgpu_driver_version || exit 1 + fi + + # Install the userspace components and copy the kernel module sources. + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + ./nvidia-installer --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-rpms \ + --no-backup \ + --no-check-for-alternate-installs \ + --no-libglx-indirect \ + --no-install-libglvnd \ + --x-prefix=/tmp/null \ + --x-module-path=/tmp/null \ + --x-library-path=/tmp/null \ + --x-sysconfig-path=/tmp/null && \ + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest + + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + _unload_driver || exit 1 + _unmount_rootfs + + if _kernel_requires_package; then + _update_ca_certificates + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + _create_driver_package + #_remove_prerequisites + #_cleanup_package_cache + fi + + _install_driver + _load_driver || exit 1 + _mount_rootfs + _write_kernel_update_hook + + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +update() { + exec 3>&2 + if exec 2> /dev/null 4< ${PID_FILE}; then + if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then + exec > >(tee -a "/proc/${pid}/fd/1") + exec 2> >(tee -a "/proc/${pid}/fd/2" >&3) + else + exec 2>&3 + fi + exec 4>&- + fi + exec 3>&- + + # vgpu driver version is choosen dynamically during runtime, so pre-compile modules for + # only non-vgpu driver types + if [ "${DRIVER_TYPE}" != "vgpu" ]; then + # Install the userspace components and copy the kernel module sources. + if [ ! -e /usr/src/nvidia-${DRIVER_VERSION}/mkprecompiled ]; then + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + ./nvidia-installer --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-rpms \ + --no-backup \ + --no-check-for-alternate-installs \ + --no-libglx-indirect \ + --no-install-libglvnd \ + --x-prefix=/tmp/null \ + --x-module-path=/tmp/null \ + --x-library-path=/tmp/null \ + --x-sysconfig-path=/tmp/null && \ + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest + fi + fi + + echo -e "\n========== NVIDIA Software Updater ==========\n" + echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + if _kernel_requires_package; then + _create_driver_package + fi + _remove_prerequisites + _cleanup_package_cache + + echo "Done" + exit 0 +} + +# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates +reload_nvidia_peermem() { + if [ "$USE_HOST_MOFED" = "true" ]; then + until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /run/nvidia/validations/.driver-ctr-ready ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + else + # use driver readiness flag created by MOFED container + until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /run/nvidia/validations/.driver-ctr-ready ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + fi + # get any parameters provided for nvidia-peermem + _get_module_params && set +o nounset + if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + echo "successfully loaded nvidia-peermem module, now waiting for signal" + sleep inf + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + fi + fi + echo "failed to load nvidia-peermem module" + exit 1 +} + +# probe by gpu-opertor for liveness/startup checks for nvidia-peermem module to be loaded when MOFED drivers are ready +probe_nvidia_peermem() { + if lsmod | grep mlx5_core > /dev/null 2>&1; then + if [ ! -f /sys/module/nvidia_peermem/refcnt ]; then + echo "nvidia-peermem module is not loaded" + return 1 + fi + else + echo "MOFED drivers are not ready, skipping probe to avoid container restarts..." + fi + return 0 +} + +usage() { + cat >&2 < /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ noble-updates main universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ noble-security main universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://us.archive.ubuntu.com/ubuntu noble-updates main restricted" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://us.archive.ubuntu.com/ubuntu noble-security main restricted" >> /etc/apt/sources.list && \ + usermod -o -u 0 -g 0 _apt + +RUN curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \ + chmod +x /usr/local/bin/donkey + +# Install / upgrade packages here that are required to resolve CVEs +ARG CVE_UPDATES +RUN if [ -n "${CVE_UPDATES}" ]; then \ + apt-get update && apt-get upgrade -y ${CVE_UPDATES} && \ + rm -rf /var/lib/apt/lists/*; \ + fi + +# update pkg cache and install pkgs for userspace driver libs +RUN apt-get update && apt-get install -y --no-install-recommends nvidia-driver-${DRIVER_BRANCH}-server \ + nvidia-fabricmanager-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 \ + libnvidia-nscq-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \ + apt-get purge -y \ + libnvidia-egl-wayland1 \ + nvidia-dkms-${DRIVER_BRANCH}-server \ + nvidia-kernel-source-${DRIVER_BRANCH}-server \ + xserver-xorg-video-nvidia-${DRIVER_BRANCH}-server && \ + rm -rf /var/lib/apt/lists/*; + +# update pkg cache and download pkgs for driver module installation during runtime. +# this is done to avoid shipping .ko files. +# avoid cleaning the cache after this to retain these packages during runtime. +RUN apt-get update && apt-get install --download-only --no-install-recommends -y linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \ + linux-signatures-nvidia-${KERNEL_VERSION} \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \ + # add support for nvidia open source driver packages during runtime + linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + +COPY nvidia-driver /usr/local/bin + +WORKDIR /drivers + +# Remove cuda repository to avoid GPG errors +RUN rm -f /etc/apt/sources.list.d/cuda* + +ENTRYPOINT ["nvidia-driver", "init"] diff --git a/ubuntu24.04/precompiled/nvidia-driver b/ubuntu24.04/precompiled/nvidia-driver new file mode 100755 index 00000000..97369be2 --- /dev/null +++ b/ubuntu24.04/precompiled/nvidia-driver @@ -0,0 +1,350 @@ +#! /bin/bash +# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. + +set -eu + +KERNEL_VERSION=$(uname -r) +OPEN_KERNEL_MODULES_ENABLED="${OPEN_KERNEL_MODULES_ENABLED:-false}" +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing driver version"} +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +NVIDIA_MODULE_PARAMS=() +NVIDIA_UVM_MODULE_PARAMS=() +NVIDIA_MODESET_MODULE_PARAMS=() +NVIDIA_PEERMEM_MODULE_PARAMS=() + +_update_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Updating the package cache..." + apt-get -qq update + fi +} + +_assert_nvswitch_system() { + [ -d /proc/driver/nvidia-nvswitch/devices ] || return 1 + if [ -z "$(ls -A /proc/driver/nvidia-nvswitch/devices)" ]; then + return 1 + fi + return 0 +} + +# Check if mellanox devices are present +_mellanox_devices_present() { + devices_found=0 + for dev in /sys/bus/pci/devices/*; do + read vendor < $dev/vendor + if [ "$vendor" = "0x15b3" ]; then + echo "Mellanox device found at $(basename $dev)" + return 0 + fi + done + echo "No Mellanox devices were found..." + return 1 +} + +_gpu_direct_rdma_enabled() { + if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then + # check if mellanox cards are present + if _mellanox_devices_present; then + return 0 + fi + fi + return 1 +} + +# For each kernel module configuration file mounted into the container, +# parse the file contents and extract the custom module parameters that +# are to be passed as input to 'modprobe'. +# +# Assumptions: +# - Configuration files are named .conf (i.e. nvidia.conf, nvidia-uvm.conf). +# - Configuration files are mounted inside the container at /drivers. +# - Each line in the file contains at least one parameter, where parameters on the same line +# are space delimited. It is up to the user to properly format the file to ensure +# the correct set of parameters are passed to 'modprobe'. +_get_module_params() { + local base_path="/drivers" + # nvidia + if [ -f "${base_path}/nvidia.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia.conf" + echo "Module parameters provided for nvidia: ${NVIDIA_MODULE_PARAMS[@]}" + fi + # nvidia-uvm + if [ -f "${base_path}/nvidia-uvm.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_UVM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-uvm.conf" + echo "Module parameters provided for nvidia-uvm: ${NVIDIA_UVM_MODULE_PARAMS[@]}" + fi + # nvidia-modeset + if [ -f "${base_path}/nvidia-modeset.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODESET_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-modeset.conf" + echo "Module parameters provided for nvidia-modeset: ${NVIDIA_MODESET_MODULE_PARAMS[@]}" + fi + # nvidia-peermem + if [ -f "${base_path}/nvidia-peermem.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_PEERMEM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-peermem.conf" + echo "Module parameters provided for nvidia-peermem: ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + fi +} + +# Load the kernel modules and start persistenced. +_load_driver() { + echo "Parsing kernel module parameters..." + _get_module_params + + echo "Loading ipmi and i2c_core kernel modules..." + modprobe -a i2c_core ipmi_msghandler ipmi_devintf + + echo "Loading NVIDIA driver kernel modules..." + set -o xtrace +o nounset + modprobe nvidia "${NVIDIA_MODULE_PARAMS[@]}" + modprobe nvidia-uvm "${NVIDIA_UVM_MODULE_PARAMS[@]}" + modprobe nvidia-modeset "${NVIDIA_MODESET_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + + + if _gpu_direct_rdma_enabled; then + echo "Loading NVIDIA Peer Memory kernel module..." + set -o xtrace +o nounset + modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + set +o xtrace -o nounset + fi + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + DRIVER_VERSION=$(nvidia-smi -q | grep "Driver Version" | awk -F: '{print $2}' | xargs) + if _assert_nvswitch_system; then + + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi + + return 0 +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. +_unload_driver() { + local rmmod_args=() + local nvidia_deps=0 + local nvidia_refs=0 + local nvidia_uvm_refs=0 + local nvidia_modeset_refs=0 + local nvidia_peermem_refs=0 + + echo "Stopping NVIDIA persistence daemon..." + if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then + local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA persistence daemon" >&2 + return 1 + fi + fi + + if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then + echo "Stopping NVIDIA fabric manager daemon..." + local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid) + + kill -SIGTERM "${pid}" + for i in $(seq 1 50); do + kill -0 "${pid}" 2> /dev/null || break + sleep 0.1 + done + if [ $i -eq 50 ]; then + echo "Could not stop NVIDIA fabric manager daemon" >&2 + return 1 + fi + fi + + echo "Unloading NVIDIA driver kernel modules..." + if [ -f /sys/module/nvidia_drm/refcnt ]; then + nvidia_drm_refs=$(< /sys/module/nvidia_drm/refcnt) + rmmod_args+=("nvidia-drm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_modeset/refcnt ]; then + nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) + rmmod_args+=("nvidia-modeset") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_uvm/refcnt ]; then + nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) + rmmod_args+=("nvidia-uvm") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt) + rmmod_args+=("nvidia-peermem") + ((++nvidia_deps)) + fi + if [ -f /sys/module/nvidia/refcnt ]; then + nvidia_refs=$(< /sys/module/nvidia/refcnt) + rmmod_args+=("nvidia") + fi + if [ ${nvidia_refs} -gt ${nvidia_deps} ]; then + # run lsmod to debug module usage + lsmod | grep nvidia + echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 + return 1 + fi + + if [ ${#rmmod_args[@]} -gt 0 ]; then + rmmod ${rmmod_args[@]} + fi + return 0 +} + +# Link and install the kernel modules from a precompiled packages +_install_driver() { + if [ "$OPEN_KERNEL_MODULES_ENABLED" = true ]; then + echo "Installing Open NVIDIA driver kernel modules..." + apt-get install --no-install-recommends -y \ + linux-signatures-nvidia-${KERNEL_VERSION} \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + else + echo "Installing Closed NVIDIA driver kernel modules..." + apt-get install --no-install-recommends -y \ + linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \ + linux-signatures-nvidia-${KERNEL_VERSION} \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + fi +} + +# Mount the driver rootfs into the run directory with the exception of sysfs. +_mount_rootfs() { + echo "Mounting NVIDIA driver rootfs..." + mount --make-runbindable /sys + mount --make-private /sys + mkdir -p ${RUN_DIR}/driver + mount --rbind / ${RUN_DIR}/driver +} + +# Unmount the driver rootfs from the run directory. +_unmount_rootfs() { + echo "Unmounting NVIDIA driver rootfs..." + if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then + umount -l -R ${RUN_DIR}/driver + fi +} + +init() { + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver branch ${DRIVER_BRANCH} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + _unload_driver || exit 1 + _unmount_rootfs + + _install_driver + _load_driver || exit 1 + _mount_rootfs + + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 +} + +_shutdown() { + if _unload_driver; then + _unmount_rootfs + rm -f ${PID_FILE} + return 0 + fi + return 1 +} + +# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates +reload_nvidia_peermem() { + if [ "$USE_HOST_MOFED" = "true" ]; then + until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /sys/module/nvidia/refcnt ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + else + # use driver readiness flag created by MOFED container + until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /sys/module/nvidia/refcnt ]; + do + echo "waiting for mellanox ofed and nvidia drivers to be installed" + sleep 10 + done + fi + # get any parameters provided for nvidia-peermem + _get_module_params && set +o nounset + if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then + if [ -f /sys/module/nvidia_peermem/refcnt ]; then + echo "successfully loaded nvidia-peermem module, now waiting for signal" + sleep inf + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + fi + fi + echo "failed to load nvidia-peermem module" + exit 1 +} + +# probe by gpu-opertor for liveness/startup checks for nvidia-peermem module to be loaded when MOFED drivers are ready +probe_nvidia_peermem() { + if lsmod | grep mlx5_core > /dev/null 2>&1; then + if [ ! -f /sys/module/nvidia_peermem/refcnt ]; then + echo "nvidia-peermem module is not loaded" + return 1 + fi + else + echo "MOFED drivers are not ready, skipping probe to avoid container restarts..." + fi + return 0 +} + +usage() { + cat >&2 <