Skip to content

Commit

Permalink
hotfix: use kind node version with containerd 1.8
Browse files Browse the repository at this point in the history
kind has upgraded its containerd version to 1.9, which causes
issues with gpu-operator (see issue NVIDIA/gpu-operator#432),

so we pinned the kind node image to a version that ships containerd 1.8

also fix the GPU installation scripts
  • Loading branch information
hsinhoyeh committed Dec 2, 2022
1 parent e46a174 commit 95e6ba3
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 27 deletions.
22 changes: 13 additions & 9 deletions hack/gpu/get-cudadriver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,27 @@ if (( $EUID != 0 )); then
exit
fi

ENV OS=ubuntu2204
ENV cudnn_version=8.6.0.*
ENV cuda_version=cuda11.8
OS=ubuntu2004
cudnn_version=8.6.0.*
cuda_version=cuda11.8

echo " this script runs on ${OS}, for other version please check https://developer.nvidia.com/cuda-downloads"
echo " this script runs on $OS, for other version please check https://developer.nvidia.com/cuda-downloads"

# purge previous installation
# apt-get purge -y nvidia*

apt-get update
apt-get install -y wget

wget https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin \
&& mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/3bf863cc.pub
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/ /"
wget https://developer.download.nvidia.com/compute/cuda/repos/$OS/x86_64/cuda-$OS.pin \
&& mv cuda-$OS.pin /etc/apt/preferences.d/cuda-repository-pin-600
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$OS/x86_64/3bf863cc.pub
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/$OS/x86_64/ /"

apt-get update

# install older driver
# apt-get install -y nvidia-driver-450 for k80

# or use apt-get install -y nvidia-driver-515 to install previous driver version to avoid conflict in cuda11.8
apt-get install -y cuda libcudnn8=${cudnn_version}-1+${cuda_version} libcudnn8-dev=${cudnn_version}-1+${cuda_version}
apt-get install -y cuda libcudnn8=$cudnn_version-1+$cuda_version libcudnn8-dev=$cudnn_version-1+$cuda_version
19 changes: 3 additions & 16 deletions hack/gpu/nvidia-container-runtime.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.li

apt-get update && apt-get install -y nvidia-container-toolkit nvidia-container-runtime

cat << EOF
check /etc/docker/daemon.json contains the following config
# if you are using containerd instead of docker, see: https://github.com/NVIDIA/k8s-device-plugin#configure-containerd
# overwrite /etc/docker/daemon.json with the following config (tee without -a replaces any existing content)
tee /etc/docker/daemon.json <<EOF
{
"default-runtime": "nvidia",
"runtimes": {
Expand All @@ -23,22 +24,8 @@ check /etc/docker/daemon.json contains the following config
}
}
}
if you were using containerd, please check here: https://github.com/NVIDIA/k8s-device-plugin#configure-containerd
EOF

#tee /etc/docker/daemon.json <<EOF
#{
# "default-runtime": "nvidia",
# "runtimes": {
# "nvidia": {
# "path": "/usr/bin/nvidia-container-runtime",
# "runtimeArgs": []
# }
# }
#}
#EOF

# restart dockerd
systemctl daemon-reload
systemctl restart docker
Expand Down
3 changes: 1 addition & 2 deletions pkg/template/kind_template.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,7 @@ nodes:
nodeRegistration:
kubeletExtraArgs:
node-labels: "ingress-ready=true"
# image: footprintai/kind-node:v1.21.9
image: kindest/node:v1.21.14
image: kindest/node:v1.21.14@sha256:ad5b7446dd8332439f22a1efdac73670f0da158c00f0a70b45716e7ef3fae20b
gpus: {{.UseGPU}}
{{if .ExportPorts}}extraPortMappings:{{end}}
{{- range $i, $p := .ExportPorts}}
Expand Down

0 comments on commit 95e6ba3

Please sign in to comment.