Skip to content

Commit

Permalink
[Testing] Fix RL9 Nvidia driver issue due to RL9 new release (#1839)
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan authored Nov 29, 2024
1 parent 703b46b commit 19ec7da
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 52 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Abort on the first failing command.
set -e
# Load the distro identification variables (ID, VERSION_ID, ...).
source /etc/os-release
# Leave VERSION_ID untouched: the Rocky vault repo URL is built from the full
# version string. The major component (everything before the first dot) is
# kept separately for the rhel<major> CUDA repo path and the final dispatch.
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -18,24 +18,32 @@ install_cuda_from_runfile() {
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
# For Rocky Linux 9: when a new OS version becomes available, the default
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
# new version's repo. This is problematic since the new OS is not available
# right away on GCE. Set up the matched repo to install the correct
# kernel-devel-$(uname -r)
# Not needed for RL8 since 8.10 is already the last RL8 release.
if [[ $ID == rocky && "${MAJOR_VERSION_ID}" == 9 ]]; then
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.6.3
local CUDA_BUNDLED_DRIVER_VERSION=560.35.05
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

Expand All @@ -44,15 +52,15 @@ setup_repo() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
sudo yum install -y yum-utils epel-release
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

#######################################
# Install the NVIDIA driver and CUDA toolkit through yum.
# Calls setup_repo and install_driver_package first, installs the
# cuda-toolkit and cuda-demo* packages, then runs verify_driver.
# Arguments: none
# Returns:   non-zero (or exits under `set -e`) if any step fails
#######################################
install_cuda_from_package_manager() {
  setup_repo
  install_driver_package
  # TODO(b/377558109): remove the temporary fix once the repo is updated
  # The glob is quoted so that yum, not the shell, expands it; unquoted, a
  # file named cuda-demo* in the working directory would replace the pattern.
  sudo yum -y install cuda-toolkit 'cuda-demo*'
  verify_driver
}

Expand All @@ -76,7 +84,7 @@ install_dcgm() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export ID MAJOR_VERSION_ID VERSION_ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -114,9 +122,10 @@ handle_common() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
sudo yum -y module remove --all nvidia-driver
}

}

# Dispatch on the major OS version: version 7 has its own handler, every
# other release goes through the common one. A single case header is used;
# two stacked headers sharing one esac do not parse.
case "$MAJOR_VERSION_ID" in
  7) handle_rhel7 ;;
  *) handle_common ;;
esac
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Abort on the first failing command.
set -e
# Load the distro identification variables (ID, VERSION_ID, ...).
source /etc/os-release
# Leave VERSION_ID untouched: the Rocky vault repo URL is built from the full
# version string. The major component (everything before the first dot) is
# kept separately for the rhel<major> CUDA repo path and the final dispatch.
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -18,24 +18,32 @@ install_cuda_from_runfile() {
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
# For Rocky Linux 9: when a new OS version becomes available, the default
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
# new version's repo. This is problematic since the new OS is not available
# right away on GCE. Set up the matched repo to install the correct
# kernel-devel-$(uname -r)
# Not needed for RL8 since 8.10 is already the last RL8 release.
if [[ $ID == rocky && "${MAJOR_VERSION_ID}" == 9 ]]; then
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.6.3
local CUDA_BUNDLED_DRIVER_VERSION=560.35.05
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

Expand All @@ -44,7 +52,7 @@ setup_repo() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
sudo yum install -y yum-utils epel-release
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

Expand Down Expand Up @@ -76,7 +84,7 @@ install_dcgm() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export ID MAJOR_VERSION_ID VERSION_ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -114,9 +122,10 @@ handle_common() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
sudo yum -y module remove --all nvidia-driver
}

}

# Dispatch on the major OS version: version 7 has its own handler, every
# other release goes through the common one. A single case header is used;
# two stacked headers sharing one esac do not parse.
case "$MAJOR_VERSION_ID" in
  7) handle_rhel7 ;;
  *) handle_common ;;
esac
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Abort on the first failing command.
set -e
# Load the distro identification variables (ID, VERSION_ID, ...).
source /etc/os-release
# Leave VERSION_ID untouched: the Rocky vault repo URL is built from the full
# version string. The major component (everything before the first dot) is
# kept separately for the rhel<major> CUDA repo path and the final dispatch.
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
# Verify NVIDIA driver:
Expand All @@ -18,24 +18,32 @@ install_cuda_from_runfile() {
# Remove existing installation before using the runfile
remove_cuda_package
remove_driver_package
# For Rocky Linux 9: when a new OS version becomes available, the default
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
# new version's repo. This is problematic since the new OS is not available
# right away on GCE. Set up the matched repo to install the correct
# kernel-devel-$(uname -r)
# Not needed for RL8 since 8.10 is already the last RL8 release.
if [[ $ID == rocky && "${MAJOR_VERSION_ID}" == 9 ]]; then
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/\$basearch/os/
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
fi
sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils

# Installing latest version of NVIDIA CUDA and driver
# Data Center/Tesla drivers and CUDA are released on different schedules;
# normally we install the matching versions of driver and CUDA
# ($DRIVER_VERSION == $CUDA_BUNDLED_DRIVER_VERSION); due to https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550
# we install a newer version of the driver
local DRIVER_VERSION=535.129.03
local CUDA_VERSION=12.2.2
local CUDA_BUNDLED_DRIVER_VERSION=535.104.05
echo "Installing NVIDIA Data Center driver $DRIVER_VERSION"
curl -fSsl -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo bash ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent
# Install the CUDA toolkit only, so that the CUDA toolkit uses the Data Center driver installed in the previous step
# See https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ for CUDA and driver compatibility
local CUDA_VERSION=12.6.3
local CUDA_BUNDLED_DRIVER_VERSION=560.35.05
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --toolkit --silent
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
verify_driver
}

Expand All @@ -44,7 +52,7 @@ setup_repo() {
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
sudo yum install -y yum-utils epel-release
sudo yum-config-manager \
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
sudo yum clean all
}

Expand All @@ -66,7 +74,7 @@ remove_cuda_package() {
try_install() {
# Export all functions for the bash subprocess
eval "$(declare -F | sed 's/ -f / -fx /')"
export VERSION_ID
export ID MAJOR_VERSION_ID VERSION_ID
for install_method in "$@"; do
echo "Installing NVIDIA driver and CUDA with $install_method..."
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
Expand Down Expand Up @@ -107,7 +115,7 @@ handle_common() {

}

# Dispatch on the major OS version: version 7 has its own handler, every
# other release goes through the common one. A single case header is used;
# two stacked headers sharing one esac do not parse.
case "$MAJOR_VERSION_ID" in
  7) handle_rhel7 ;;
  *) handle_common ;;
esac
Expand Down

0 comments on commit 19ec7da

Please sign in to comment.