From 30f4a28c94b03f21b23504b0b31e0d75a243b39d Mon Sep 17 00:00:00 2001 From: Anurag Guda Date: Wed, 21 Aug 2024 14:12:43 -0500 Subject: [PATCH] 24.8.0 Release --- README.md | 30 +- cns.json | 90 +- install-guides/DGX-6.0_Server_v13.0.md | 2 +- install-guides/DGX-6.2_Server_v11.3.md | 722 ++++++++++ install-guides/DGX-6.2_Server_v12.2.md | 722 ++++++++++ install-guides/DGX-6.2_Server_v13.1.md | 721 ++++++++++ install-guides/Jetson_Xavier_v13.0.md | 2 +- .../RHEL-8-8_Server_x86-arm64_v11.3.md | 1178 +++++++++++++++++ .../RHEL-8-8_Server_x86-arm64_v12.2.md | 1178 +++++++++++++++++ .../RHEL-8-8_Server_x86-arm64_v13.1.md | 1177 ++++++++++++++++ ...-22-04_Server_Developer-x86-arm64_v11.0.md | 55 +- ...-22-04_Server_Developer-x86-arm64_v11.1.md | 55 +- ...-22-04_Server_Developer-x86-arm64_v11.2.md | 55 +- ...-22-04_Server_Developer-x86-arm64_v11.3.md | 938 +++++++++++++ ...-22-04_Server_Developer-x86-arm64_v12.0.md | 54 +- ...-22-04_Server_Developer-x86-arm64_v12.1.md | 54 +- ...-22-04_Server_Developer-x86-arm64_v12.2.md | 937 +++++++++++++ ...-22-04_Server_Developer-x86-arm64_v13.0.md | 56 +- ...-22-04_Server_Developer-x86-arm64_v13.1.md | 936 +++++++++++++ .../Ubuntu-22-04_Server_x86-arm64_v11.3.md | 1128 ++++++++++++++++ .../Ubuntu-22-04_Server_x86-arm64_v12.2.md | 1128 ++++++++++++++++ .../Ubuntu-22-04_Server_x86-arm64_v13.0.md | 2 +- .../Ubuntu-22-04_Server_x86-arm64_v13.1.md | 1127 ++++++++++++++++ .../DGX-6.0_Server_v10.2.md | 0 .../DGX-6.0_Server_v10.3.md | 0 .../DGX-6.0_Server_v10.4.md | 0 .../DGX-6.0_Server_v10.5.md | 0 .../Jetson_Xavier_v10.0.md | 0 .../Jetson_Xavier_v10.1.md | 0 .../Jetson_Xavier_v10.2.md | 0 .../Jetson_Xavier_v10.3.md | 0 .../Jetson_Xavier_v10.4.md | 0 .../Jetson_Xavier_v10.5.md | 0 .../RHEL-8-7_Server_x86-arm64_v10.0.md | 0 .../RHEL-8-7_Server_x86-arm64_v10.1.md | 0 .../RHEL-8-7_Server_x86-arm64_v10.2.md | 0 .../RHEL-8-7_Server_x86-arm64_v10.3.md | 0 .../RHEL-8-7_Server_x86-arm64_v10.4.md | 0 .../RHEL-8-7_Server_x86-arm64_v10.5.md | 0 ...-22-04_Server_Developer-x86-arm64_v10.0.md | 0 ...-22-04_Server_Developer-x86-arm64_v10.1.md | 0 ...-22-04_Server_Developer-x86-arm64_v10.2.md | 0 ...-22-04_Server_Developer-x86-arm64_v10.3.md | 0 ...-22-04_Server_Developer-x86-arm64_v10.4.md | 0 ...-22-04_Server_Developer-x86-arm64_v10.5.md | 0 .../Ubuntu-22-04_Server_x86-arm64_v10.0.md | 0 .../Ubuntu-22-04_Server_x86-arm64_v10.1.md | 0 .../Ubuntu-22-04_Server_x86-arm64_v10.2.md | 0 .../Ubuntu-22-04_Server_x86-arm64_v10.3.md | 0 .../Ubuntu-22-04_Server_x86-arm64_v10.4.md | 0 .../Ubuntu-22-04_Server_x86-arm64_v10.5.md | 0 install-guides/readme.md | 44 +- playbooks/cns-installation.yaml | 72 +- playbooks/cns-uninstall.yaml | 12 +- playbooks/cns-upgrade.yaml | 46 +- playbooks/cns-validation.yaml | 58 +- playbooks/cns_values_11.3.yaml | 121 ++ playbooks/cns_values_12.2.yaml | 121 ++ playbooks/cns_values_13.1.yaml | 121 ++ playbooks/gpu_operator.yaml | 13 + playbooks/guides/Cloud_Guide.md | 2 +- playbooks/hosts | 2 +- playbooks/k8s-install.yaml | 11 +- playbooks/microk8s.yaml | 27 +- playbooks/nvidia-driver.yaml | 19 +- .../{ => older_versions}/cns_values_10.0.yaml | 0 .../{ => older_versions}/cns_values_10.1.yaml | 0 .../{ => older_versions}/cns_values_10.2.yaml | 0 .../{ => older_versions}/cns_values_10.3.yaml | 0 .../{ => older_versions}/cns_values_10.4.yaml | 0 .../{ => older_versions}/cns_values_10.5.yaml | 0 playbooks/operators-upgrade.yaml | 10 + playbooks/prerequisites.yaml | 27 +- playbooks/readme.md | 4 +- 74 files changed, 12791 insertions(+), 266 deletions(-) create mode 
100644 install-guides/DGX-6.2_Server_v11.3.md create mode 100644 install-guides/DGX-6.2_Server_v12.2.md create mode 100644 install-guides/DGX-6.2_Server_v13.1.md create mode 100644 install-guides/RHEL-8-8_Server_x86-arm64_v11.3.md create mode 100644 install-guides/RHEL-8-8_Server_x86-arm64_v12.2.md create mode 100644 install-guides/RHEL-8-8_Server_x86-arm64_v13.1.md create mode 100644 install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v11.3.md create mode 100644 install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v12.2.md create mode 100644 install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v13.1.md create mode 100644 install-guides/Ubuntu-22-04_Server_x86-arm64_v11.3.md create mode 100644 install-guides/Ubuntu-22-04_Server_x86-arm64_v12.2.md create mode 100644 install-guides/Ubuntu-22-04_Server_x86-arm64_v13.1.md rename install-guides/{ => older_versions}/DGX-6.0_Server_v10.2.md (100%) rename install-guides/{ => older_versions}/DGX-6.0_Server_v10.3.md (100%) rename install-guides/{ => older_versions}/DGX-6.0_Server_v10.4.md (100%) rename install-guides/{ => older_versions}/DGX-6.0_Server_v10.5.md (100%) rename install-guides/{ => older_versions}/Jetson_Xavier_v10.0.md (100%) rename install-guides/{ => older_versions}/Jetson_Xavier_v10.1.md (100%) rename install-guides/{ => older_versions}/Jetson_Xavier_v10.2.md (100%) rename install-guides/{ => older_versions}/Jetson_Xavier_v10.3.md (100%) rename install-guides/{ => older_versions}/Jetson_Xavier_v10.4.md (100%) rename install-guides/{ => older_versions}/Jetson_Xavier_v10.5.md (100%) rename install-guides/{ => older_versions}/RHEL-8-7_Server_x86-arm64_v10.0.md (100%) rename install-guides/{ => older_versions}/RHEL-8-7_Server_x86-arm64_v10.1.md (100%) rename install-guides/{ => older_versions}/RHEL-8-7_Server_x86-arm64_v10.2.md (100%) rename install-guides/{ => older_versions}/RHEL-8-7_Server_x86-arm64_v10.3.md (100%) rename install-guides/{ => older_versions}/RHEL-8-7_Server_x86-arm64_v10.4.md (100%) rename install-guides/{ => older_versions}/RHEL-8-7_Server_x86-arm64_v10.5.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_Developer-x86-arm64_v10.0.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_Developer-x86-arm64_v10.1.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_Developer-x86-arm64_v10.2.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_Developer-x86-arm64_v10.3.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_Developer-x86-arm64_v10.4.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_Developer-x86-arm64_v10.5.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_x86-arm64_v10.0.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_x86-arm64_v10.1.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_x86-arm64_v10.2.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_x86-arm64_v10.3.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_x86-arm64_v10.4.md (100%) rename install-guides/{ => older_versions}/Ubuntu-22-04_Server_x86-arm64_v10.5.md (100%) create mode 100644 playbooks/cns_values_11.3.yaml create mode 100644 playbooks/cns_values_12.2.yaml create mode 100644 playbooks/cns_values_13.1.yaml rename playbooks/{ => older_versions}/cns_values_10.0.yaml (100%) rename playbooks/{ => older_versions}/cns_values_10.1.yaml (100%) rename playbooks/{ => older_versions}/cns_values_10.2.yaml 
(100%) rename playbooks/{ => older_versions}/cns_values_10.3.yaml (100%) rename playbooks/{ => older_versions}/cns_values_10.4.yaml (100%) rename playbooks/{ => older_versions}/cns_values_10.5.yaml (100%) diff --git a/README.md b/README.md index 7c211de..9056a88 100755 --- a/README.md +++ b/README.md @@ -54,27 +54,19 @@ For more Information about customize the values, please refer [Installation](htt | Branch/Release | Version | Initial Release Date | Platform | OS | Containerd | CRI-O | K8s | Helm | NVIDIA GPU Operator | NVIDIA Network Operator | NVIDIA Data Center Driver | | :---: | :------: | :---: | :---: | :---: | :---: | :----: | :---: | :---: | :---: | :---: | :---: | -| 24.5.0/master | 13.0 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.16 | 1.30.0 | 1.30.0 | 3.14.4 | 24.3.0 | 24.1.1(x86 only) | 550.54.15 | -| 24.5.0/master | 13.0 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.9 | 1.7.16 | 1.30.0 | 1.30.0 | 3.14.4 | 24.3.0 | N/A | 550.54.15 | -| 24.5.0/master | 13.0 | 14 May 2024 | Jetson Devices(AGX, NX, Orin) | JetPack 5.1 and JetPack 5.0 | 1.7.16 | 1.30.0 | 1.30.0 | 3.14.4 | N/A | N/A | N/A | -| 24.5.0/master | 13.0 | 14 May 2024 | DGX Server | DGX OS 6.0(Ubuntu 22.04 LTS) | 1.7.16 | 1.30.0 | 1.30.0 | 3.14.4 | 24.3.0 | N/A | N/A | +| 24.8.0/master | 13.1 | 20 August 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.20 | 1.30.2 | 1.30.2 | 3.15.3 | 24.6.1 | 24.4.1(x86 only) | 550.90.07 | +| 24.8.0/master | 13.1 | 20 August 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.8 | 1.7.20 | 1.30.2 | 1.30.2 | 3.15.3 | 24.6.1 | 24.4.1(x86 only) | 550.90.07 | +| 24.8.0/master | 13.1 | 20 August 2024 | DGX Server | DGX OS 6.2(Ubuntu 22.04 LTS) | 1.7.20 | 1.30.2 | 1.30.2 | 3.15.3 | 24.6.1 | N/A | N/A | | | | | | | | | | | | | -| 24.5.0/master | 12.1 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.16 | 1.29.4 | 1.29.4 | 3.14.4 | 24.3.0 | 24.1.1(x86 only) | 550.54.15 | -| 24.5.0/master | 12.1 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.9 | 1.7.16 | 1.29.4 | 1.29.4 | 3.14.4 | 24.3.0 | N/A | 550.54.15 | -| 24.5.0/master | 12.1 | 14 May 2024 | Jetson Devices(AGX, NX, Orin) | JetPack 5.1 and JetPack 5.0 | 1.7.16 | 1.29.4 | 1.29.4 | 3.14.4 | N/A | N/A | N/A | -| 24.5.0/master | 12.1 | 14 May 2024 | DGX Server | DGX OS 6.0(Ubuntu 22.04 LTS) | 1.7.16 | 1.29.4 | 1.29.4 | 3.14.4 | 24.3.0 | N/A | N/A | +| 24.8.0/master | 12.2 | 20 August 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.20 | 1.29.6 | 1.29.6 | 3.15.3 | 24.6.1 | 24.4.1(x86 only) | 550.90.07 | +| 24.8.0/master | 12.2 | 20 August 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.8 | 1.7.20 | 1.29.6 | 1.29.6 | 3.15.3 | 24.6.1 | 24.4.1(x86 only) | 550.90.07 | +| 24.8.0/master | 12.2 | 20 August 2024 | DGX Server | DGX OS 6.2(Ubuntu 22.04 LTS) | 1.7.20 | 1.29.6 | 1.29.6 | 3.15.3 | 24.6.1 | N/A | N/A | | | | | | | | | | | | | -| 24.5.0/masrer | 11.2 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.16 | 1.28.6 | 1.28.8 | 3.14.4 | 24.3.0 | 24.1.1(x86 only) | 550.54.15 | -| 24.5.0/master | 11.2 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.9 | 1.7.16 | 1.28.6 | 1.28.8 | 3.14.4 | 24.3.0 | N/A | 550.54.15 | -| 24.5.0/master | 11.2 | 14 May 2024 | Jetson Devices(AGX, NX, Orin) | JetPack 5.1 and JetPack 5.0 | 1.7.16 | 1.28.6 | 1.28.8 | 3.14.4 | N/A | N/A | N/A | -| 24.5.0/master | 11.2 | 14 May 2024 | DGX Server | DGX OS 6.0(Ubuntu 22.04 LTS) | 1.7.16 | 1.28.6 | 
1.28.8 | 3.14.4 | 24.3.0 | N/A | N/A |
-| | | | | | | | | | | |
-| 24.5.0/master | 10.5 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.16 | 1.27.6 | 1.27.12 | 3.14.4 | 24.3.0 | 24.1.1(x86 only) | 550.54.15 |
-| 24.5.0/master | 10.5 | 14 May 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.9 | 1.7.16 | 1.27.6 | 1.27.12 | 3.14.4 | 24.3.0 | N/A | 550.54.15 |
-| 24.5.0/master | 10.5 | 14 May 2024 | Jetson Devices(AGX, NX, Orin) | JetPack 5.1 and JetPack 5.0 | 1.7.16 | 1.27.6 | 1.27.12 | 3.14.4 | N/A | N/A | N/A |
-| 24.5.0/master | 10.5 | 14 May 2024 | DGX Server | DGX OS 6.0(Ubuntu 22.04 LTS) | 1.7.16 | 1.27.6 | 1.27.12 | 3.14.4 | 24.3.0 | N/A | N/A |
+| 24.8.0/master | 11.3 | 20 August 2024 | NVIDIA Certified Server (x86 & arm64) | Ubuntu 22.04 LTS | 1.7.20 | 1.28.8 | 1.28.12 | 3.15.3 | 24.6.1 | 24.4.1(x86 only) | 550.90.07 |
+| 24.8.0/master | 11.3 | 20 August 2024 | NVIDIA Certified Server (x86 & arm64) | RHEL 8.8 | 1.7.20 | 1.28.8 | 1.28.12 | 3.15.3 | 24.6.1 | 24.4.1(x86 only) | 550.90.07 |
+| 24.8.0/master | 11.3 | 20 August 2024 | DGX Server | DGX OS 6.2(Ubuntu 22.04 LTS) | 1.7.20 | 1.28.8 | 1.28.12 | 3.15.3 | 24.6.1 | N/A | N/A |

-To Find other CNS Release Information, please refer to [Cloud Native Stack Component Matrix](https://github.com/NVIDIA/cloud-native-stack/tree/24.3.0?tab=readme-ov-file#nvidia-cloud-native-stack-component-matrix-1)
+To Find other CNS Release Information, please refer to [Cloud Native Stack Component Matrix](https://github.com/NVIDIA/cloud-native-stack/tree/24.5.0?tab=readme-ov-file#nvidia-cloud-native-stack-component-matrix-1)

`NOTE:` The above CNS versions are available on the master branch as well, but it is recommended to use the specific branch for the respective release

@@ -98,7 +90,7 @@ To Find other CNS Release Information, please refer to [Cloud Native Stack Compo

| Branch/Release | CNS Version | Release Date | Kserve | LoadBalancer | Storage | Monitoring |
| :---: | :------: | :---: | :---: | :---: | :---: | :---: |
-| 24.5.0/master | 13.0<br>12.1<br>11.2<br>10.5 | 9 July 2024 | **0.13**<br> | MetalLB: 0.14.5 | NFS: 4.0.18<br>Local Path: 0.0.26 | Prometheus: 61.3.0<br>Elastic: 8.14.1 |
+| 24.8.0/master | 13.1<br>12.2<br>11.3 | 20 August 2024 | **0.13**<br> | MetalLB: 0.14.5 | NFS: 4.0.18<br>Local Path: 0.0.26 | Prometheus: 61.3.0<br>
Elastic: 8.14.1 | # Getting help or Providing feedback diff --git a/cns.json b/cns.json index 6a32d77..b4aa67c 100644 --- a/cns.json +++ b/cns.json @@ -1,37 +1,45 @@ { "name": "Cloud Native Stack", "latest": { - "version": "13.0", - "release_date": "14 May 2024", + "version": "13.1", + "release_date": "20 August 2024", "platforms": [{ "name": "NVIDIA Certified Server", "CPU architecture": "x86, Arm64", - "os": "Ubuntu 22.04 LTS, RedHat Linux 8.8, DGX OS 6.1", + "os": "Ubuntu 22.04 LTS, RedHat Linux 8.8, DGX OS 6.2", "components": { - "containerd": "1.7.16", - "cri-o": "1.30.0", - "k8s version": "1.30.0", - "Calico": "v3.27.3", - "helm version": "3.14.4", - "NVIDIA GPU Operator": "24.3.0", - "NVIDIA Network Operator": "24.1.1", - "NVIDIA DataCenter Driver": "550.54.15" - } - }, - { - "name": "Jetson Devices(AGX, NX, Orin)", - "os": "JetPack 5.1, JetPack 5.0", - "components": { - "containerd": "1.7.16", - "cri-o": "1.30.0", - "k8s version": "1.30.0", - "Flannel": "0.25.1", - "helm version": "3.14.4" + "containerd": "1.7.20", + "cri-o": "1.30.2", + "k8s version": "1.30.2", + "Calico": "v3.27.4", + "helm version": "3.15.3", + "NVIDIA GPU Operator": "24.6.1", + "NVIDIA Network Operator": "24.4.1", + "NVIDIA DataCenter Driver": "550.90.07" } } ] }, "versions": [{ + "13.1": { + "release_date": "20 August 2024", + "platforms": [{ + "name": "NVIDIA Certified Server", + "CPU architecture": "x86, Arm64", + "os": "Ubuntu 22.04 LTS, RedHat Linux 8.8, DGX OS 6.2", + "components": { + "containerd": "1.7.20", + "cri-o": "1.30.2", + "k8s version": "1.30.2", + "Calico": "v3.27.4", + "helm version": "3.15.3", + "NVIDIA GPU Operator": "24.6.1", + "NVIDIA Network Operator": "24.4.1", + "NVIDIA DataCenter Driver": "550.90.07" + } + } + ] + }, "13.0": { "release_date": "14 May 2024", "platforms": [{ @@ -62,6 +70,25 @@ } ] }, + "12.2": { + "release_date": "20 August 2024", + "platforms": [{ + "name": "NVIDIA Certified Server", + "CPU architecture": "x86, Arm64", + "os": "Ubuntu 22.04 LTS, RedHat Linux 8.8, DGX OS 6.2", + "components": { + "containerd": "1.7.20", + "cri-o": "1.29.6", + "k8s version": "1.29.6", + "Calico": "v3.27.4", + "helm version": "3.15.3", + "NVIDIA GPU Operator": "24.6.1", + "NVIDIA Network Operator": "24.4.1", + "NVIDIA DataCenter Driver": "550.90.07" + } + } + ] + }, "12.1": { "release_date": "14 May 2024", "platforms": [{ @@ -122,6 +149,25 @@ } ] }, + "11.3": { + "release_date": "20 August 2024", + "platforms": [{ + "name": "NVIDIA Certified Server", + "CPU architecture": "x86, Arm64", + "os": "Ubuntu 22.04 LTS, RedHat Linux 8.8, DGX OS 6.2", + "components": { + "containerd": "1.7.20", + "cri-o": "1.28.8", + "k8s version": "1.28.12", + "Calico": "v3.27.4", + "helm version": "3.15.3", + "NVIDIA GPU Operator": "24.6.1", + "NVIDIA Network Operator": "24.4.1", + "NVIDIA DataCenter Driver": "550.90.07" + } + } + ] + }, "11.2": { "release_date": "14 May 2024", "platforms": [{ diff --git a/install-guides/DGX-6.0_Server_v13.0.md b/install-guides/DGX-6.0_Server_v13.0.md index fb8226f..737923b 100644 --- a/install-guides/DGX-6.0_Server_v13.0.md +++ b/install-guides/DGX-6.0_Server_v13.0.md @@ -275,7 +275,7 @@ Now execute the below to install kubelet, kubeadm, and kubectl: sudo apt update ``` ``` - sudo apt install -y -q kubelet=1.30.0-00 kubectl=1.30.0-00 kubeadm=1.30.0-00 + sudo apt install -y -q kubelet=1.30.0-1.1 kubectl=1.30.0-1.1 kubeadm=1.30.0-1.1 ``` ``` sudo apt-mark hold kubelet kubeadm kubectl diff --git a/install-guides/DGX-6.2_Server_v11.3.md b/install-guides/DGX-6.2_Server_v11.3.md new 
file mode 100644
index 0000000..610667a
--- /dev/null
+++ b/install-guides/DGX-6.2_Server_v11.3.md
@@ -0,0 +1,722 @@
+# NVIDIA Cloud Native Stack v11.3 - Install Guide for DGX
+## Introduction
+
+NVIDIA Cloud Native Stack for DGX is focused on providing a Docker-based experience. This document describes how to set up the NVIDIA Cloud Native Stack collection on single or multiple systems. NVIDIA Cloud Native Stack can be configured to create a single-node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+
+NVIDIA Cloud Native Stack v11.3 includes:
+- Ubuntu 22.04 LTS
+- Containerd 1.7.20
+- Kubernetes version 1.28.12
+- Helm 3.15.3
+- NVIDIA GPU Driver: 550.90.07
+- NVIDIA Container Toolkit: 1.16.1
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the DGX Operating System](#Installing-the-DGX-Operating-System)
+- [Update the Docker Default Runtime](#Update-the-Docker-Default-Runtime)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Build Docker Images and Deploy on Cloud Native Stack](#Build-Docker-Images-and-Deploy-on-Cloud-Native-Stack)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+
+### Prerequisites
+
+These instructions assume the following:
+
+- You have an NVIDIA DGX system.
+- You will perform a clean install.
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+### Installing the DGX Operating System
+
+To install the DGX server, please refer to the [DGX Server Installation Guide](https://docs.nvidia.com/dgx/dgx-os-6-user-guide/).
+
+### Update the Docker Default Runtime
+
+
+Edit the docker daemon configuration to add the following line and save the file:
+
+```
+"default-runtime" : "nvidia"
+```
+
+Example:
+```
+$ sudo nano /etc/docker/daemon.json
+
+{
+   "runtimes": {
+      "nvidia": {
+          "path": "nvidia-container-runtime",
+          "runtimeArgs": []
+      }
+   },
+   "default-runtime" : "nvidia"
+}
+```
+
+Now execute the below commands to restart the docker daemon:
+```
+sudo systemctl daemon-reload && sudo systemctl restart docker
+```
+
+#### Validate docker default runtime
+
+Execute the below command to validate that the docker default runtime is NVIDIA:
+
+```
+$ sudo docker info | grep -i runtime
+```
+
+Output:
+```
+Runtimes: nvidia runc
+Default Runtime: nvidia
+```
+
+
+
+## Installing Container Runtime
+
+You need to install a container runtime into each node in the cluster so that Pods can run there. Currently, Cloud Native Stack provides the following container runtimes:
+
+- [Installing Containerd](#Installing-Containerd)
+- [Installing CRI-O](#Installing-CRI-O)
+
+`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both!
+
+These steps apply to both runtimes.
+
+Set up the repository and update the apt package index:
+
+```
+sudo apt update
+```
+
+Install packages to allow apt to use a repository over HTTPS:
+
+```
+sudo apt install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common
+```
+
+Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes:
+
+```
+cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \
+    --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34
+```
+
+
+Following the instructions in the output, execute the commands as shown below:
+
+```
+ mkdir -p $HOME/.kube
+ sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
+ sudo chown $(id -u):$(id -g) $HOME/.kube/config
+```
+
+With the following command, you install a pod-network add-on to the control plane node. We are using Calico as the pod-network add-on here:
+
+```
+ kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.1/manifests/calico.yaml
+```
+
+Update the Calico DaemonSet:
+
+```
+kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\*
+```
+
+You can execute the below commands to ensure that all pods are up and running:
+
+```
+ kubectl get pods --all-namespaces
+```
+
+Output:
+
+```
+NAMESPACE     NAME                                       READY   STATUS    RESTARTS   AGE
+kube-system   calico-kube-controllers-65b8787765-bjc8h   1/1     Running   0          2m8s
+kube-system   calico-node-c2tmk                          1/1     Running   0          2m8s
+kube-system   coredns-5c98db65d4-d4kgh                   1/1     Running   0          9m8s
+kube-system   coredns-5c98db65d4-h6x8m                   1/1     Running   0          9m8s
+kube-system   etcd-#yourhost                             1/1     Running   0          8m25s
+kube-system   kube-apiserver-#yourhost                   1/1     Running   0          8m7s
+kube-system   kube-controller-manager-#yourhost          1/1     Running   0          8m3s
+kube-system   kube-proxy-6sh42                           1/1     Running   0          9m7s
+kube-system   kube-scheduler-#yourhost                   1/1     Running   0          8m26s
+```
+
+The `kubectl get nodes` command shows that the control-plane node is up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME        STATUS   ROLES                  AGE   VERSION
+#yourhost   Ready    control-plane,master   10m   v1.28.12
+```
+
+Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command:
+
+```
+kubectl taint nodes --all node-role.kubernetes.io/control-plane-
+```
+
+Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/)
+for more information.
+
+### Installing Helm
+
+Execute the following command to download and install Helm 3.15.3:
+
+```
+ wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz && \
+ tar -zxvf helm-v3.15.3-linux-amd64.tar.gz && \
+ sudo mv linux-amd64/helm /usr/local/bin/helm && \
+ rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/
+```
+
+Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information.
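+
+Optionally, as a quick sanity check (a minimal sketch, assuming the steps above completed on this host), confirm that the Helm client and the cluster respond:
+
+```
+# Expect the v3.15.3 client
+helm version --short
+# Expect a Ready control-plane node reporting v1.28.12
+kubectl get nodes -o wide
+```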
+
+
+### Adding an Additional Node to NVIDIA Cloud Native Stack
+
+`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step, [Installing the GPU Operator](#Installing-the-GPU-Operator)
+
+Make sure to install the Containerd and Kubernetes packages on additional nodes.
+
+Prerequisites:
+- [Installing Docker and NVIDIA Container Toolkit](#Installing-Docker-and-NVIDIA-Container-Toolkit)
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+ sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker node should not have the same node name.
+
+The `kubectl get nodes` command shows that the master and worker nodes are up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME               STATUS   ROLES                  AGE   VERSION
+#yourhost          Ready    control-plane,master   10m   v1.28.12
+#yourhost-worker   Ready                           10m   v1.28.12
+```
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` Since the NVIDIA Driver and NVIDIA Container Toolkit come preinstalled on DGX OS, set `driver.enabled=false` and `toolkit.enabled=false` when installing the GPU Operator
+
+```
+ helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator --devel nvidia/gpu-operator --set driver.enabled=false,toolkit.enabled=false --wait --generate-name
+```
+
+#### Validating the State of the GPU Operator:
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE             NAME                                                              READY   STATUS      RESTARTS   AGE
+default               gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq  1/1     Running     0          2m39s
+default               gpu-operator-1622656274-node-feature-discovery-worker-wr88v      1/1     Running     0          2m39s
+default               gpu-operator-7db468cfdf-mdrdp                                    1/1     Running     0          2m39s
+nvidia-gpu-operator   gpu-feature-discovery-g425f                                      1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-cuda-validator-s6x2p                                      0/1     Completed   0          48s
+nvidia-gpu-operator   nvidia-dcgm-exporter-wtxnx                                       1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-dcgm-jbz94                                                1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-daemonset-hzzdt                             1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-validator-9nkxq                             0/1     Completed   0          17s
+nvidia-gpu-operator   nvidia-operator-validator-cw4j5                                  1/1     Running     0          2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery.
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications. For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html)
+
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < test-image.tgz; sudo ctr -n=k8s.io images import test-image.tgz
+```
+
+### Validate NVIDIA Cloud Native Stack with an Application from NGC
+Another option to validate NVIDIA Cloud Native Stack is by running a demo application hosted on NGC.
+
+NGC is NVIDIA's GPU-optimized software hub. NGC provides a curated set of GPU-optimized software for AI, HPC, and visualization. The content provided by NVIDIA and third-party ISVs simplifies building, customizing, and integrating GPU-optimized software into workflows, accelerating the time to solutions for users.
+
+Containers, pre-trained models, Helm charts for Kubernetes deployments, and industry-specific AI toolkits with software development kits (SDKs) are hosted on NGC. For more information about how to deploy an application that is hosted on NGC or the NGC Private Registry, please refer to this [NGC Registry Guide](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/NGC_Registry_Guide_v1.0.md). Visit the [public NGC documentation](https://docs.nvidia.com/ngc) for more information.
+
+The steps in this section use the publicly available DeepStream - Intelligent Video Analytics (IVA) demo application Helm Chart. The application can validate the full NVIDIA Cloud Native Stack and test the connectivity of NVIDIA Cloud Native Stack to remote sensors. DeepStream delivers real-time AI-based video and image understanding and multi-sensor processing on GPUs. For more information, please refer to the [Helm Chart](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo).
+
+There are two ways to configure the DeepStream - Intelligent Video Analytics Demo Application on your NVIDIA Cloud Native Stack:
+
+- Using a camera
+- Using the integrated video file (no camera required)
+
+#### Using a camera
+
+##### Prerequisites:
+- RTSP Camera stream
+
+Go through the below steps to install the demo application:
+```
+1. helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz --untar
+
+2. cd into the folder video-analytics-demo and update the file values.yaml
+
+3. Go to the section Cameras in the values.yaml file and add the address of your IP camera. Read the comments section on how it can be added. Single or multiple cameras can be added as shown below
+
+cameras:
+ camera1: rtsp://XXXX
+```
+
+Execute the following command to deploy the demo application:
+```
+helm install video-analytics-demo --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player. See the instructions below.
+
+#### Using the integrated video file (no camera)
+
+If you don't have a camera input, please execute the below commands to use the default video already integrated into the application:
+
+```
+$ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz
+
+$ helm install video-analytics-demo-0.1.9.tgz --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player as per the below instructions.
+For more information about the demo application, please refer to the [application NGC page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo)
+
+#### Access from WebUI
+
+Use the below WebUI URL to access the video analytics demo application from the browser:
+```
+http://IPAddress of Node:31115/
+```
+
+#### Access from VLC
+
+Download VLC Player from https://www.videolan.org/vlc/ on the machine where you intend to view the video stream.
+
+View the video stream in VLC by navigating to Media > Open Network Stream and entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output like the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` The video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME                    NAMESPACE             REVISION  UPDATED                                  STATUS    CHART                 APP VERSION
+gpu-operator-1606173805 nvidia-gpu-operator   1         2024-03-29 20:23:28.063421701 +0000 UTC  deployed  gpu-operator-24.6.0   v24.6.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+```
\ No newline at end of file
diff --git a/install-guides/DGX-6.2_Server_v12.2.md b/install-guides/DGX-6.2_Server_v12.2.md
new file mode 100644
index 0000000..542ec95
--- /dev/null
+++ b/install-guides/DGX-6.2_Server_v12.2.md
@@ -0,0 +1,722 @@
+# NVIDIA Cloud Native Stack v12.2 - Install Guide for DGX
+## Introduction
+
+NVIDIA Cloud Native Stack for DGX is focused on providing a Docker-based experience. This document describes how to set up the NVIDIA Cloud Native Stack collection on single or multiple systems. NVIDIA Cloud Native Stack can be configured to create a single-node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+
+NVIDIA Cloud Native Stack v12.2 includes:
+- Ubuntu 22.04 LTS
+- Containerd 1.7.20
+- Kubernetes version 1.29.6
+- Helm 3.15.3
+- NVIDIA GPU Driver: 550.90.07
+- NVIDIA Container Toolkit: 1.16.1
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the DGX Operating System](#Installing-the-DGX-Operating-System)
+- [Update the Docker Default Runtime](#Update-the-Docker-Default-Runtime)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Build Docker Images and Deploy on Cloud Native Stack](#Build-Docker-Images-and-Deploy-on-Cloud-Native-Stack)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+
+### Prerequisites
+
+These instructions assume the following:
+
+- You have an NVIDIA DGX system.
+- You will perform a clean install.
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+### Installing the DGX Operating System
+
+To install the DGX server, please refer to the [DGX Server Installation Guide](https://docs.nvidia.com/dgx/dgx-os-6-user-guide/).
+
+### Update the Docker Default Runtime
+
+
+Edit the docker daemon configuration to add the following line and save the file:
+
+```
+"default-runtime" : "nvidia"
+```
+
+Example:
+```
+$ sudo nano /etc/docker/daemon.json
+
+{
+   "runtimes": {
+      "nvidia": {
+          "path": "nvidia-container-runtime",
+          "runtimeArgs": []
+      }
+   },
+   "default-runtime" : "nvidia"
+}
+```
+
+Now execute the below commands to restart the docker daemon:
+```
+sudo systemctl daemon-reload && sudo systemctl restart docker
+```
+
+#### Validate docker default runtime
+
+Execute the below command to validate that the docker default runtime is NVIDIA:
+
+```
+$ sudo docker info | grep -i runtime
+```
+
+Output:
+```
+Runtimes: nvidia runc
+Default Runtime: nvidia
+```
+
+
+
+## Installing Container Runtime
+
+You need to install a container runtime into each node in the cluster so that Pods can run there. Currently, Cloud Native Stack provides the following container runtimes:
+
+- [Installing Containerd](#Installing-Containerd)
+- [Installing CRI-O](#Installing-CRI-O)
+
+`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both!
+
+These steps apply to both runtimes.
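+
+If you are unsure whether a runtime is already present on a node, a quick check like the following can help before choosing. This is only an illustration and assumes a systemd-based host:
+
+```
+# A printed path means that runtime binary is already installed
+command -v containerd crio || true
+# Check whether either runtime service is currently active
+systemctl is-active containerd crio
+```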
+
+Set up the repository and update the apt package index:
+
+```
+sudo apt update
+```
+
+Install packages to allow apt to use a repository over HTTPS:
+
+```
+sudo apt install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common
+```
+
+Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes:
+
+```
+cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \
+    --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34
+```
+
+
+Following the instructions in the output, execute the commands as shown below:
+
+```
+ mkdir -p $HOME/.kube
+ sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
+ sudo chown $(id -u):$(id -g) $HOME/.kube/config
+```
+
+With the following command, you install a pod-network add-on to the control plane node. We are using Calico as the pod-network add-on here:
+
+```
+ kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.1/manifests/calico.yaml
+```
+
+Update the Calico DaemonSet:
+
+```
+kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\*
+```
+
+You can execute the below commands to ensure that all pods are up and running:
+
+```
+ kubectl get pods --all-namespaces
+```
+
+Output:
+
+```
+NAMESPACE     NAME                                       READY   STATUS    RESTARTS   AGE
+kube-system   calico-kube-controllers-65b8787765-bjc8h   1/1     Running   0          2m8s
+kube-system   calico-node-c2tmk                          1/1     Running   0          2m8s
+kube-system   coredns-5c98db65d4-d4kgh                   1/1     Running   0          9m8s
+kube-system   coredns-5c98db65d4-h6x8m                   1/1     Running   0          9m8s
+kube-system   etcd-#yourhost                             1/1     Running   0          8m25s
+kube-system   kube-apiserver-#yourhost                   1/1     Running   0          8m7s
+kube-system   kube-controller-manager-#yourhost          1/1     Running   0          8m3s
+kube-system   kube-proxy-6sh42                           1/1     Running   0          9m7s
+kube-system   kube-scheduler-#yourhost                   1/1     Running   0          8m26s
+```
+
+The `kubectl get nodes` command shows that the control-plane node is up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME        STATUS   ROLES                  AGE   VERSION
+#yourhost   Ready    control-plane,master   10m   v1.29.6
+```
+
+Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command:
+
+```
+kubectl taint nodes --all node-role.kubernetes.io/control-plane-
+```
+
+Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/)
+for more information.
+
+### Installing Helm
+
+Execute the following command to download and install Helm 3.15.3:
+
+```
+ wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz && \
+ tar -zxvf helm-v3.15.3-linux-amd64.tar.gz && \
+ sudo mv linux-amd64/helm /usr/local/bin/helm && \
+ rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/
+```
+
+Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information.
+
+
+### Adding an Additional Node to NVIDIA Cloud Native Stack
+
+`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step, [Installing the GPU Operator](#Installing-the-GPU-Operator)
+
+Make sure to install the Containerd and Kubernetes packages on additional nodes.
+
+Prerequisites:
+- [Installing Docker and NVIDIA Container Toolkit](#Installing-Docker-and-NVIDIA-Container-Toolkit)
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+ sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker node should not have the same node name.
+
+The `kubectl get nodes` command shows that the master and worker nodes are up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME               STATUS   ROLES                  AGE   VERSION
+#yourhost          Ready    control-plane,master   10m   v1.29.6
+#yourhost-worker   Ready                           10m   v1.29.6
+```
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` Since the NVIDIA Driver and NVIDIA Container Toolkit come preinstalled on DGX OS, set `driver.enabled=false` and `toolkit.enabled=false` when installing the GPU Operator
+
+```
+ helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator --devel nvidia/gpu-operator --set driver.enabled=false,toolkit.enabled=false --wait --generate-name
+```
+
+#### Validating the State of the GPU Operator:
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE             NAME                                                              READY   STATUS      RESTARTS   AGE
+default               gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq  1/1     Running     0          2m39s
+default               gpu-operator-1622656274-node-feature-discovery-worker-wr88v      1/1     Running     0          2m39s
+default               gpu-operator-7db468cfdf-mdrdp                                    1/1     Running     0          2m39s
+nvidia-gpu-operator   gpu-feature-discovery-g425f                                      1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-cuda-validator-s6x2p                                      0/1     Completed   0          48s
+nvidia-gpu-operator   nvidia-dcgm-exporter-wtxnx                                       1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-dcgm-jbz94                                                1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-daemonset-hzzdt                             1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-validator-9nkxq                             0/1     Completed   0          17s
+nvidia-gpu-operator   nvidia-operator-validator-cw4j5                                  1/1     Running     0          2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery.
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
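+
+As an illustration only (not part of this guide's validated steps), MIG is typically enabled by selecting a MIG strategy for the GPU Operator and then labeling the node with a profile. The `all-1g.5gb` profile below is just an example; valid profiles depend on the GPU:
+
+```
+# Example: pass --set mig.strategy=single (or mixed) when installing the GPU Operator,
+# then request a MIG profile on a specific node; the operator's MIG manager applies it
+kubectl label nodes <node-name> nvidia.com/mig.config=all-1g.5gb --overwrite
+# Watch the MIG manager pods while the configuration rolls out
+kubectl get pods -n nvidia-gpu-operator -l app=nvidia-mig-manager -w
+```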
+For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html)
+
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < test-image.tgz; sudo ctr -n=k8s.io images import test-image.tgz
+```
+
+### Validate NVIDIA Cloud Native Stack with an Application from NGC
+Another option to validate NVIDIA Cloud Native Stack is by running a demo application hosted on NGC.
+
+NGC is NVIDIA's GPU-optimized software hub. NGC provides a curated set of GPU-optimized software for AI, HPC, and visualization. The content provided by NVIDIA and third-party ISVs simplifies building, customizing, and integrating GPU-optimized software into workflows, accelerating the time to solutions for users.
+
+Containers, pre-trained models, Helm charts for Kubernetes deployments, and industry-specific AI toolkits with software development kits (SDKs) are hosted on NGC. For more information about how to deploy an application that is hosted on NGC or the NGC Private Registry, please refer to this [NGC Registry Guide](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/NGC_Registry_Guide_v1.0.md). Visit the [public NGC documentation](https://docs.nvidia.com/ngc) for more information.
+
+The steps in this section use the publicly available DeepStream - Intelligent Video Analytics (IVA) demo application Helm Chart. The application can validate the full NVIDIA Cloud Native Stack and test the connectivity of NVIDIA Cloud Native Stack to remote sensors. DeepStream delivers real-time AI-based video and image understanding and multi-sensor processing on GPUs. For more information, please refer to the [Helm Chart](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo).
+
+There are two ways to configure the DeepStream - Intelligent Video Analytics Demo Application on your NVIDIA Cloud Native Stack:
+
+- Using a camera
+- Using the integrated video file (no camera required)
+
+#### Using a camera
+
+##### Prerequisites:
+- RTSP Camera stream
+
+Go through the below steps to install the demo application:
+```
+1. helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz --untar
+
+2. cd into the folder video-analytics-demo and update the file values.yaml
+
+3. Go to the section Cameras in the values.yaml file and add the address of your IP camera. Read the comments section on how it can be added. Single or multiple cameras can be added as shown below
+
+cameras:
+ camera1: rtsp://XXXX
+```
+
+Execute the following command to deploy the demo application:
+```
+helm install video-analytics-demo --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player. See the instructions below.
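+
+Optionally, before opening VLC, you can confirm that the chart came up. The release name `iva` follows from the install command above:
+
+```
+# The demo pod should reach the Running state
+kubectl get pods | grep iva
+# The service exposes the WebUI (31115) and RTSP (31113) NodePorts used below
+kubectl get svc | grep iva
+```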
+
+#### Using the integrated video file (no camera)
+
+If you don't have a camera input, please execute the below commands to use the default video already integrated into the application:
+
+```
+$ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz
+
+$ helm install video-analytics-demo-0.1.9.tgz --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player as per the below instructions.
+For more information about the demo application, please refer to the [application NGC page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo)
+
+#### Access from WebUI
+
+Use the below WebUI URL to access the video analytics demo application from the browser:
+```
+http://IPAddress of Node:31115/
+```
+
+#### Access from VLC
+
+Download VLC Player from https://www.videolan.org/vlc/ on the machine where you intend to view the video stream.
+
+View the video stream in VLC by navigating to Media > Open Network Stream and entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output like the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` The video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME                    NAMESPACE             REVISION  UPDATED                                  STATUS    CHART                 APP VERSION
+gpu-operator-1606173805 nvidia-gpu-operator   1         2024-03-20 20:23:28.063421701 +0000 UTC  deployed  gpu-operator-24.6.0   v24.6.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+```
\ No newline at end of file
diff --git a/install-guides/DGX-6.2_Server_v13.1.md b/install-guides/DGX-6.2_Server_v13.1.md
new file mode 100644
index 0000000..a606704
--- /dev/null
+++ b/install-guides/DGX-6.2_Server_v13.1.md
@@ -0,0 +1,721 @@
+# NVIDIA Cloud Native Stack v13.1 - Install Guide for DGX
+## Introduction
+
+NVIDIA Cloud Native Stack for DGX is focused on providing a Docker-based experience. This document describes how to set up the NVIDIA Cloud Native Stack collection on single or multiple systems. NVIDIA Cloud Native Stack can be configured to create a single-node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+
+NVIDIA Cloud Native Stack v13.1 includes:
+- Ubuntu 22.04 LTS
+- Containerd 1.7.20
+- Kubernetes version 1.30.2
+- Helm 3.15.3
+- NVIDIA GPU Driver: 550.90.07
+- NVIDIA Container Toolkit: 1.16.1
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the DGX Operating System](#Installing-the-DGX-Operating-System)
+- [Update the Docker Default Runtime](#Update-the-Docker-Default-Runtime)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Build Docker Images and Deploy on Cloud Native Stack](#Build-Docker-Images-and-Deploy-on-Cloud-Native-Stack)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+
+### Prerequisites
+
+These instructions assume the following:
+
+- You have an NVIDIA DGX system.
+- You will perform a clean install.
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+### Installing the DGX Operating System
+
+To install the DGX server, please refer to the [DGX Server Installation Guide](https://docs.nvidia.com/dgx/dgx-os-6-user-guide/).
+
+### Update the Docker Default Runtime
+
+
+Edit the docker daemon configuration to add the following line and save the file:
+
+```
+"default-runtime" : "nvidia"
+```
+
+Example:
+```
+$ sudo nano /etc/docker/daemon.json
+
+{
+   "runtimes": {
+      "nvidia": {
+          "path": "nvidia-container-runtime",
+          "runtimeArgs": []
+      }
+   },
+   "default-runtime" : "nvidia"
+}
+```
+
+Now execute the below commands to restart the docker daemon:
+```
+sudo systemctl daemon-reload && sudo systemctl restart docker
+```
+
+#### Validate docker default runtime
+
+Execute the below command to validate that the docker default runtime is NVIDIA:
+
+```
+$ sudo docker info | grep -i runtime
+```
+
+Output:
+```
+Runtimes: nvidia runc
+Default Runtime: nvidia
+```
+
+
+
+## Installing Container Runtime
+
+You need to install a container runtime into each node in the cluster so that Pods can run there. Currently, Cloud Native Stack provides the following container runtimes:
+
+- [Installing Containerd](#Installing-Containerd)
+- [Installing CRI-O](#Installing-CRI-O)
+
+`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both!
+
+These steps apply to both runtimes.
+
+Set up the repository and update the apt package index:
+
+```
+sudo apt update
+```
+
+Install packages to allow apt to use a repository over HTTPS:
+
+```
+sudo apt install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common
+```
+
+Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes:
+
+```
+cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \
+    --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34
+```
+
+
+Following the instructions in the output, execute the commands as shown below:
+
+```
+ mkdir -p $HOME/.kube
+ sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
+ sudo chown $(id -u):$(id -g) $HOME/.kube/config
+```
+
+With the following command, you install a pod-network add-on to the control plane node. We are using Calico as the pod-network add-on here:
+
+```
+ kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.26.1/manifests/calico.yaml
+```
+
+Update the Calico DaemonSet:
+
+```
+kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\*
+```
+
+You can execute the below commands to ensure that all pods are up and running:
+
+```
+ kubectl get pods --all-namespaces
+```
+
+Output:
+
+```
+NAMESPACE     NAME                                       READY   STATUS    RESTARTS   AGE
+kube-system   calico-kube-controllers-65b8787765-bjc8h   1/1     Running   0          2m8s
+kube-system   calico-node-c2tmk                          1/1     Running   0          2m8s
+kube-system   coredns-5c98db65d4-d4kgh                   1/1     Running   0          9m8s
+kube-system   coredns-5c98db65d4-h6x8m                   1/1     Running   0          9m8s
+kube-system   etcd-#yourhost                             1/1     Running   0          8m25s
+kube-system   kube-apiserver-#yourhost                   1/1     Running   0          8m7s
+kube-system   kube-controller-manager-#yourhost          1/1     Running   0          8m3s
+kube-system   kube-proxy-6sh42                           1/1     Running   0          9m7s
+kube-system   kube-scheduler-#yourhost                   1/1     Running   0          8m26s
+```
+
+The `kubectl get nodes` command shows that the control-plane node is up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME        STATUS   ROLES                  AGE   VERSION
+#yourhost   Ready    control-plane,master   10m   v1.30.2
+```
+
+Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command:
+
+```
+kubectl taint nodes --all node-role.kubernetes.io/control-plane-
+```
+
+Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/)
+for more information.
+
+### Installing Helm
+
+Execute the following command to download and install Helm 3.15.3:
+
+```
+ wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz && \
+ tar -zxvf helm-v3.15.3-linux-amd64.tar.gz && \
+ sudo mv linux-amd64/helm /usr/local/bin/helm && \
+ rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/
+```
+
+Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information.
+
+
+### Adding an Additional Node to NVIDIA Cloud Native Stack
+
+`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step, [Installing the GPU Operator](#Installing-the-GPU-Operator)
+
+Make sure to install the Containerd and Kubernetes packages on additional nodes.
+
+Prerequisites:
+- [Installing Docker and NVIDIA Container Toolkit](#Installing-Docker-and-NVIDIA-Container-Toolkit)
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+ sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker node should not have the same node name.
+
+The `kubectl get nodes` command shows that the master and worker nodes are up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME               STATUS   ROLES                  AGE   VERSION
+#yourhost          Ready    control-plane,master   10m   v1.30.2
+#yourhost-worker   Ready                           10m   v1.30.2
+```
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` Since the NVIDIA Driver and NVIDIA Container Toolkit come preinstalled on DGX OS, set `driver.enabled=false` and `toolkit.enabled=false` when installing the GPU Operator
+
+```
+ helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator --devel nvidia/gpu-operator --set driver.enabled=false,toolkit.enabled=false --wait --generate-name
+```
+
+#### Validating the State of the GPU Operator:
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE             NAME                                                              READY   STATUS      RESTARTS   AGE
+default               gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq  1/1     Running     0          2m39s
+default               gpu-operator-1622656274-node-feature-discovery-worker-wr88v      1/1     Running     0          2m39s
+default               gpu-operator-7db468cfdf-mdrdp                                    1/1     Running     0          2m39s
+nvidia-gpu-operator   gpu-feature-discovery-g425f                                      1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-cuda-validator-s6x2p                                      0/1     Completed   0          48s
+nvidia-gpu-operator   nvidia-dcgm-exporter-wtxnx                                       1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-dcgm-jbz94                                                1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-daemonset-hzzdt                             1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-validator-9nkxq                             0/1     Completed   0          17s
+nvidia-gpu-operator   nvidia-operator-validator-cw4j5                                  1/1     Running     0          2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery.
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
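+
+For example, once a MIG profile has been applied through the operator, the partitioned instances surface as extended resources on the node. A hypothetical check (substitute your node name):
+
+```
+# MIG instances appear as allocatable resources such as nvidia.com/mig-1g.5gb
+kubectl describe node <node-name> | grep -i mig
+# Or list the GPU and its MIG devices directly on the node
+nvidia-smi -L
+```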
+For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html)
+
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < test-image.tgz; sudo ctr -n=k8s.io images import test-image.tgz
+```
+
+### Validate NVIDIA Cloud Native Stack with an Application from NGC
+Another option to validate NVIDIA Cloud Native Stack is by running a demo application hosted on NGC.
+
+NGC is NVIDIA's GPU-optimized software hub. NGC provides a curated set of GPU-optimized software for AI, HPC, and visualization. The content provided by NVIDIA and third-party ISVs simplifies building, customizing, and integrating GPU-optimized software into workflows, accelerating the time to solutions for users.
+
+Containers, pre-trained models, Helm charts for Kubernetes deployments, and industry-specific AI toolkits with software development kits (SDKs) are hosted on NGC. For more information about how to deploy an application that is hosted on NGC or the NGC Private Registry, please refer to this [NGC Registry Guide](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/NGC_Registry_Guide_v1.0.md). Visit the [public NGC documentation](https://docs.nvidia.com/ngc) for more information.
+
+The steps in this section use the publicly available DeepStream - Intelligent Video Analytics (IVA) demo application Helm Chart. The application can validate the full NVIDIA Cloud Native Stack and test the connectivity of NVIDIA Cloud Native Stack to remote sensors. DeepStream delivers real-time AI-based video and image understanding and multi-sensor processing on GPUs. For more information, please refer to the [Helm Chart](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo).
+
+There are two ways to configure the DeepStream - Intelligent Video Analytics Demo Application on your NVIDIA Cloud Native Stack:
+
+- Using a camera
+- Using the integrated video file (no camera required)
+
+#### Using a camera
+
+##### Prerequisites:
+- RTSP Camera stream
+
+Go through the below steps to install the demo application:
+
+1. Fetch and untar the Helm chart:
+
+```
+helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz --untar
+```
+
+2. Change into the `video-analytics-demo` folder and open the file `values.yaml` for editing.
+
+3. Go to the cameras section of the `values.yaml` file and add the address of your IP camera. Read the comments in that section on how it can be added. Single or multiple cameras can be added, as shown below:
+
+```
+cameras:
+  camera1: rtsp://XXXX
+```
+
+Execute the following command to deploy the demo application:
+```
+helm install video-analytics-demo --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player. See the instructions below.
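+
+To confirm that the release deployed before opening the stream, standard Helm and kubectl checks can be used (the release name `iva` comes from the install command above):
+
+```
+# List the release and its pods; pod names include the release name
+helm ls | grep iva
+kubectl get pods | grep iva
+```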
+
+#### Using the integrated video file (no camera)
+
+If you don't have a camera input, please execute the below commands to use the default video already integrated into the application:
+
+```
+$ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz
+
+$ helm install video-analytics-demo-0.1.9.tgz --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player as per the below instructions.
+For more information about the demo application, please refer to the [application NGC page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo)
+
+#### Access from WebUI
+
+Use the below WebUI URL to access the video analytics demo application from the browser:
+```
+http://IPAddress of Node:31115/
+```
+
+#### Access from VLC
+
+Download VLC Player from https://www.videolan.org/vlc/ on the machine where you intend to view the video stream.
+
+View the video stream in VLC by navigating to Media > Open Network Stream and entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output similar to the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` The video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+gpu-operator-1606173805 nvidia-gpu-operator 1 2024-03-20 20:23:28.063421701 +0000 UTC deployed gpu-operator-24.3.0 v24.3.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+```
\ No newline at end of file
diff --git a/install-guides/Jetson_Xavier_v13.0.md b/install-guides/Jetson_Xavier_v13.0.md
index 77ee94b..3a694de 100644
--- a/install-guides/Jetson_Xavier_v13.0.md
+++ b/install-guides/Jetson_Xavier_v13.0.md
@@ -217,7 +217,7 @@ Now execute the below to install kubelet, kubeadm, and kubectl:
 
 ```
 sudo apt update
- sudo apt install -y -q kubelet=1.30.0-00 kubectl=1.30.0-00 kubeadm=1.30.0-00
+ sudo apt install -y -q kubelet=1.30.0-1.1 kubectl=1.30.0-1.1 kubeadm=1.30.0-1.1
 sudo apt-mark hold kubelet kubeadm kubectl
 ```
 
diff --git a/install-guides/RHEL-8-8_Server_x86-arm64_v11.3.md b/install-guides/RHEL-8-8_Server_x86-arm64_v11.3.md
new file mode 100644
index 0000000..aac6281
--- /dev/null
+++ b/install-guides/RHEL-8-8_Server_x86-arm64_v11.3.md
@@ -0,0 +1,1178 @@
+# NVIDIA Cloud Native Stack v11.3 - Install Guide for RHEL Server
+## Introduction
+
+This document describes how to set up the NVIDIA Cloud Native Stack collection on a single or multiple NVIDIA Certified Systems. NVIDIA Cloud Native Stack can be configured to create a single-node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+ +NVIDIA Cloud Native Stack v11.3 includes: +- RHEL 8.8 +- Containerd 1.7.20 +- Kubernetes version 1.28.12 +- Helm 3.15.3 +- NVIDIA GPU Operator 24.6.0 + - NVIDIA GPU Driver: 550.90.07 + - NVIDIA Container Toolkit: 1.16.1 + - NVIDIA K8S Device Plugin: 0.16.1 + - NVIDIA DCGM-Exporter: 3.3.7-3.5.0 + - NVIDIA DCGM: 3.3.7-1 + - NVIDIA GPU Feature Discovery: 0.16.1 + - NVIDIA K8s MIG Manager: 0.8.0 + - NVIDIA Driver Manager: 0.6.10 + - Node Feature Discovery: 0.16.3 + - NVIDIA KubeVirt GPU Device Plugin: 1.2.9 + - NVIDIA GDS Driver: 2.17.5 + - NVIDIA Kata Manager for Kubernetes: 0.2.1 + - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1 +- NVIDIA Network Operator 24.4.1 + - Mellanox MOFED Driver 24.04-0.6.6.0-0 + - RDMA Shared Device Plugin 1.4.0 + - SRIOV Device Plugin 3.6.2 + - Container Networking Plugins 1.3.0 + - Multus 3.9.3 + - Whereabouts 0.7.0 + +## Table of Contents + +- [Prerequisites](#Prerequisites) +- [Installing the RHEL Operating System](#Installing-the-RHEL-Operating-System) +- [Installing Container Runtime](#Installing-Container-Runtime) + - [Installing Containerd](#Installing-Containerd) + - [Installing CRI-O](#Installing-CRI-O) +- [Installing Kubernetes](#Installing-Kubernetes) +- [Installing Helm](#Installing-Helm) +- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack) +- [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) +- [Installing the GPU Operator](#Installing-the-GPU-Operator) +- [Validating the Network Operator with GPUDirect RDMA](#Validating-the-Network-Operator-with-GPUDirect-RDMA) +- [Validating the GPU Operator](#Validating-the-GPU-Operator) +- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC) +- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator) +- [Uninstalling the Network Operator](#Uninstalling-the-Network-Operator) + +### Prerequisites + +The following instructions assume the following: + +- You have [NVIDIA-Certified Systems](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html) with Mellanox CX NICs for x86-64 servers +- You have [NVIDIA Qualified Systems](https://www.nvidia.com/en-us/data-center/data-center-gpus/qualified-system-catalog/?start=0&count=50&pageNumber=1&filters=eyJmaWx0ZXJzIjpbXSwic3ViRmlsdGVycyI6eyJwcm9jZXNzb3JUeXBlIjpbIkFSTS1UaHVuZGVyWDIiLCJBUk0tQWx0cmEiXX0sImNlcnRpZmllZEZpbHRlcnMiOnt9LCJwYXlsb2FkIjpbXX0=) for ARM servers + `NOTE:` For ARM systems, NVIDIA Network Operator is not supported yet. +- You will perform a clean install. + +To determine if your system qualifies as an NVIDIA Certified System, review the list of NVIDIA Certified Systems [here](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html). + +Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE). + + +### Installing the RHEL 8.8 Operating System +These instructions require installing RedHat Enterprise Linux 8.8, can be downloaded [here](https://access.redhat.com/downloads/content/479/ver=/rhel---8/8.8/x86_64/product-software). + +Please reference the [RHEL Server Installation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html-single/performing_a_standard_rhel_8_installation/index). 
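+
+Once the OS is installed, a quick check confirms that you are on the expected release before continuing; it should report a RHEL 8.8 build such as `Red Hat Enterprise Linux release 8.8 (Ootpa)`:
+
+```
+cat /etc/redhat-release
+```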
+ +### Changing the SELinux State + +Open the `/etc/selinux/config` file in a text editor of your choice, for example: + +``` +sudo vi /etc/selinux/config +``` + +Configure the `SELINUX=enforcing` option: +``` +# This file controls the state of SELinux on the system. +# SELINUX= can take one of these three values: +# enforcing - SELinux security policy is enforced. +# permissive - SELinux prints warnings instead of enforcing. +# disabled - No SELinux policy is loaded. +SELINUX=enforcing +# SELINUXTYPE= can take one of these two values: +# targeted - Targeted processes are protected, +# mls - Multi Level Security protection. +SELINUXTYPE=targeted +``` + +Save the change, and restart the system: + +``` +sudo reboot +``` + +After the system rebooted, run the below command to verify the status + +``` +sestatus +``` + +Expected output: + +``` +SELinux status: enabled +SELinuxfs mount: /sys/fs/selinux +SELinux root directory: /etc/selinux +Loaded policy name: targeted +Current mode: enforcing +Mode from config file: enforcing +Policy MLS status: enabled +Policy deny_unknown status: allowed +Memory protection checking: actual (secure) +Max kernel policy version: 31 +``` + +## Installing Container Runtime + +You need to install a container runtime into each node in the cluster so that Pods can run there. Currently Cloud Native Stack provides below container runtimes: + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Install required packages: + +``` +sudo dnf install -y yum-utils device-mapper-persistent-data lvm2 +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` +mkdir -p $HOME/.kube +``` + +``` +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +``` + +``` +sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. 
We are using calico as the pod-network add-on here: + +``` +kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` +kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.28.12 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. + +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. 
+
+Prerequisites:
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker nodes must not have the same node name.
+
+The get nodes command shows that the master and worker nodes are up and ready:
+
+```
+kubectl get nodes
+```
+
+Output:
+
+```
+NAME STATUS ROLES AGE VERSION
+#yourhost Ready control-plane 10m v1.28.12
+#yourhost-worker Ready <none> 10m v1.28.12
+```
+
+### Installing NVIDIA Network Operator
+
+`NOTE:` If Mellanox NICs are not connected to your nodes, please skip this step and proceed to the next step [Installing GPU Operator](#Installing-GPU-Operator)
+
+The below instructions assume that Mellanox NICs are connected to your machines.
+
+Execute the below command to verify Mellanox NICs are enabled on your machines:
+
+```
+ lspci | grep -i "Mellanox"
+```
+
+Output:
+```
+0c:00.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+0c:00.1 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+```
+Execute the below command to determine which Mellanox device is active:
+
+`NOTE:` Use whichever device shows as `Link Detected: yes` in further steps. The below command works only if you add the NICs before installing the Operating System.
+
+```
+for device in `sudo lshw -class network -short | grep -i ConnectX | awk '{print $2}' | egrep -v 'Device|path' | sed '/^$/d'`;do echo -n $device; sudo ethtool $device | grep -i "Link detected"; done
+```
+Output:
+```
+ens160f0 Link detected: yes
+ens160f1 Link detected: no
+```
+
+Create the custom network operator values.yaml file and update the active Mellanox device from the above command:
+
+```
+nano network-operator-values.yaml
+```
+
+```
+deployCR: true
+ofedDriver:
+  deploy: true
+rdmaSharedDevicePlugin:
+  deploy: true
+  resources:
+    - name: rdma_shared_device_a
+      vendors: [15b3]
+      devices: [ens160f0]
+```
+
+For more information about the custom network operator values.yaml, please refer to [Network Operator](https://docs.mellanox.com/display/COKAN10/Network+Operator#NetworkOperator-Example2:RDMADevicePluginConfiguration)
+
+Add the Mellanox repo:
+```
+ helm repo add mellanox https://mellanox.github.io/network-operator
+```
+
+Update the Helm repo:
+```
+ helm repo update
+```
+Install Network Operator:
+```
+ kubectl label nodes --all node-role.kubernetes.io/master- --overwrite
+ helm install --version 24.1.0 -f ./network-operator-values.yaml -n network-operator --create-namespace --wait network-operator mellanox/network-operator
+```
+#### Validating the State of the Network Operator
+
+Please note that the installation of the Network Operator can take a couple of minutes. How long the installation takes depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | egrep 'network-operator|nvidia-network-operator-resources'
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+network-operator network-operator-547cb8d999-mn2h9 1/1 Running 0 17m
+network-operator network-operator-node-feature-discovery-master-596fb8b7cb-qrmvv 1/1 Running 0 17m
+network-operator network-operator-node-feature-discovery-worker-qt5xt 1/1 Running 0 17m
+nvidia-network-operator-resources cni-plugins-ds-dl5vl 1/1 Running 0 17m
+nvidia-network-operator-resources kube-multus-ds-w82rv 1/1 Running 0 17m
+nvidia-network-operator-resources mofed-ubuntu20.04-ds-xfpzl 1/1 Running 0 17m
+nvidia-network-operator-resources rdma-shared-dp-ds-2hgb6 1/1 Running 0 17m
+nvidia-network-operator-resources sriov-device-plugin-ch7bz 1/1 Running 0 10m
+nvidia-network-operator-resources whereabouts-56ngr 1/1 Running 0 10m
+```
+
+Please refer to the [Network Operator page](https://docs.mellanox.com/display/COKAN10/Network+Operator) for more information.
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` If you installed the Network Operator, please skip the below command and follow [GPU Operator with RDMA](#GPU-Operator-with-RDMA)
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --wait --generate-name
+```
+
+#### GPU Operator with RDMA
+
+- Prerequisites:
+  - Please install the [Network Operator](#Installing-NVIDIA-Network-Operator) to ensure that the MOFED drivers are installed.
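+
+Before enabling RDMA, it can help to first confirm that the MOFED driver pod is running; a quick check, reusing the resources namespace queried later in this guide:
+
+```
+kubectl get pods -n nvidia-network-operator-resources | grep mofed
+```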
+
+After the Network Operator installation is complete, execute the below command to install the GPU Operator and load the nv_peer_mem modules:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true --wait --generate-name
+```
+
+#### GPU Operator with Host MOFED Driver and RDMA
+
+If the MOFED driver is already installed on the host without the Network Operator, execute the below command to install the GPU Operator and load the nv_peer_mem module:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true --wait --generate-name
+```
+
+#### GPU Operator with GPU Direct Storage (GDS)
+
+Execute the below command to enable the GPU Direct Storage driver in the GPU Operator:
+
+```
+helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set gds.enabled=true --generate-name
+```
+For more information, refer to [GPU Direct Storage](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-rdma.html)
+
+#### Validating the State of the GPU Operator:
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation takes depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+default gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq 1/1 Running 0 2m39s
+default gpu-operator-1622656274-node-feature-discovery-worker-wr88v 1/1 Running 0 2m39s
+default gpu-operator-7db468cfdf-mdrdp 1/1 Running 0 2m39s
+nvidia-gpu-operator gpu-feature-discovery-g425f 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-container-toolkit-daemonset-mcmxj 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-cuda-validator-s6x2p 0/1 Completed 0 48s
+nvidia-gpu-operator nvidia-dcgm-exporter-wtxnx 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-dcgm-jbz94 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-daemonset-hzzdt 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-validator-9nkxq 0/1 Completed 0 17s
+nvidia-gpu-operator nvidia-driver-daemonset-kt8g5 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-operator-validator-cw4j5 1/1 Running 0 2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For clusters with multiple worker nodes, execute the below command to restart the CoreDNS and Node Feature Discovery pods:
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG.
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
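+
+As a hedged sketch of how this is typically driven once the operator is running (the node name `node1` and the `all-1g.5gb` profile below are illustrative, not part of this guide), the MIG manager watches the `nvidia.com/mig.config` node label and repartitions the GPU to match:
+
+```
+# Illustrative only: request the all-1g.5gb MIG layout on a node named node1
+kubectl label nodes node1 nvidia.com/mig.config=all-1g.5gb --overwrite
+```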
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + +### Validating the Network Operator with GPUDirect RDMA + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Create network definition for IPAM and replace the `ens192f0` with an active Mellanox device for `master`: +``` +$ nano networkdefinition.yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + annotations: + k8s.v1.cni.cncf.io/resourceName: rdma/rdma_shared_device_a + name: rdma-net-ipam + namespace: default +spec: + config: |- + { + "cniVersion": "0.3.1", + "name": "rdma-net-ipam", + "plugins": [ + { + "ipam": { + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "log_file": "/tmp/whereabouts.log", + "log_level": "debug", + "range": "192.168.111.1/24", + "type": "whereabouts" + }, + "type": "macvlan", + "master": "ens192f0" + }, + { + "mtu": 1500, + "type": "tuning" + } + ] + } +EOF +``` +`NOTE:` If you do not have VLAN-based networking on the high-performance side, please set "vlan": 0 + + +Execute the below command to install network definition on NVIDIA Cloud Native Stack from the control-plane node: + + ``` +kubectl apply -f networkdefinition.yaml + ``` + +Now create the pod YAML with the below content: + +``` +cat < ../../devices/virtual/net/eth0 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 lo -> ../../devices/virtual/net/lo +lrwxrwxrwx 1 root root 0 Jun 1 02:26 net1 -> ../../devices/virtual/net/net1 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 tunl0 -> ../../devices/virtual/net/tunl0 +``` + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Update the above Mellanox NIC, for which status is `Up` in the below command: + +``` +kubectl exec -it rdma-test-pod-1 -- bash + +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 +************************************ +* Waiting for client to connect... 
* +************************************ +``` + +In a separate terminal, print the network address of the secondary interface on the `rdma-test-pod-1` pod: + +``` +$ kubectl exec rdma-test-pod-1 -- ip addr show dev net1 +5: net1@if24: mtu 9000 qdisc noqueue state UP group default + link/ether 62:51:fb:13:88:ce brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 192.168.111.1/24 brd 192.168.111.255 scope global net1 + valid_lft forever preferred_lft forever +``` + +Execute the below command with the above inet address to verify the nv_peer_memory performance on NVIDIA Cloud Native Stack: +``` +$ kubectl exec -it rdma-test-pod-2 -- bash +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 192.168.111.2 +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 4 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 + remote address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.080755 0.073090 4.568094 + 4 5000 0.16 0.15 4.588128 + 8 5000 0.31 0.29 4.567442 + 16 5000 0.66 0.59 4.647555 + 32 5000 1.35 1.22 4.776518 + 64 5000 2.50 2.29 4.481806 + 128 5000 5.34 4.73 4.621828 + 256 5000 10.53 9.11 4.448153 + 512 5000 21.03 17.05 4.162100 + 1024 5000 38.67 34.16 4.169397 + 2048 5000 47.11 43.50 2.655219 + 4096 5000 51.29 51.02 1.557094 + 8192 5000 52.00 51.98 0.793178 + 16384 5000 52.33 52.32 0.399164 + 32768 5000 52.47 52.47 0.200143 + 65536 5000 52.51 52.50 0.100143 + 131072 5000 52.51 52.51 0.050078 + 262144 5000 52.49 52.49 0.025029 + 524288 5000 52.50 52.50 0.012517 + 1048576 5000 52.51 52.51 0.006260 + 2097152 5000 52.51 52.51 0.003130 + 4194304 5000 52.51 52.51 0.001565 + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` + +``` +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 + +************************************ +* Waiting for client to connect... * +************************************ +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 8 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02
+ remote address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01
+---------------------------------------------------------------------------------------
+ #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps]
+ 8388608 5000 52.52 52.52 0.000783
+---------------------------------------------------------------------------------------
+```
+The benchmark achieved approximately 52 Gbps throughput.
+
+Exit from the RDMA test pods and then delete them with the below command:
+
+```
+$ kubectl delete pod rdma-test-pod-1 rdma-test-pod-2
+```
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < Open Network Stream > Entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output similar to the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` The video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+gpu-operator-1606173805 nvidia-gpu-operator 1 2023-03-14 20:23:28.063421701 +0000 UTC deployed gpu-operator-24.3.0 v24.3.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+```
+
+### Uninstalling the Network Operator
+
+Execute the below commands to uninstall the Network Operator:
+
+```
+$ helm ls -n network-operator
+NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+network-operator network-operator 1 2023-04-03 17:09:04.665593336 +0000 UTC deployed network-operator-24.1.0 v24.1.0
+
+$ helm del network-operator -n network-operator
+```
diff --git a/install-guides/RHEL-8-8_Server_x86-arm64_v12.2.md b/install-guides/RHEL-8-8_Server_x86-arm64_v12.2.md
new file mode 100644
index 0000000..63d9c80
--- /dev/null
+++ b/install-guides/RHEL-8-8_Server_x86-arm64_v12.2.md
@@ -0,0 +1,1178 @@
+# NVIDIA Cloud Native Stack v12.2 - Install Guide for RHEL Server
+## Introduction
+
+This document describes how to set up the NVIDIA Cloud Native Stack collection on a single or multiple NVIDIA Certified Systems. NVIDIA Cloud Native Stack can be configured to create a single-node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+ +NVIDIA Cloud Native Stack v12.2 includes: +- RHEL 8.8 +- Containerd 1.7.20 +- Kubernetes version 1.29.6 +- Helm 3.15.3 +- NVIDIA GPU Operator 24.6.0 + - NVIDIA GPU Driver: 550.90.07 + - NVIDIA Container Toolkit: 1.16.1 + - NVIDIA K8S Device Plugin: 0.16.1 + - NVIDIA DCGM-Exporter: 3.3.7-3.5.0 + - NVIDIA DCGM: 3.3.7-1 + - NVIDIA GPU Feature Discovery: 0.16.1 + - NVIDIA K8s MIG Manager: 0.8.0 + - NVIDIA Driver Manager: 0.6.10 + - Node Feature Discovery: 0.16.3 + - NVIDIA KubeVirt GPU Device Plugin: 1.2.9 + - NVIDIA GDS Driver: 2.17.5 + - NVIDIA Kata Manager for Kubernetes: 0.2.1 + - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1 +- NVIDIA Network Operator 24.4.1 + - Mellanox MOFED Driver 24.04-0.6.6.0-0 + - RDMA Shared Device Plugin 1.4.0 + - SRIOV Device Plugin 3.6.2 + - Container Networking Plugins 1.3.0 + - Multus 3.9.3 + - Whereabouts 0.7.0 + +## Table of Contents + +- [Prerequisites](#Prerequisites) +- [Installing the RHEL Operating System](#Installing-the-RHEL-Operating-System) +- [Installing Container Runtime](#Installing-Container-Runtime) + - [Installing Containerd](#Installing-Containerd) + - [Installing CRI-O](#Installing-CRI-O) +- [Installing Kubernetes](#Installing-Kubernetes) +- [Installing Helm](#Installing-Helm) +- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack) +- [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) +- [Installing the GPU Operator](#Installing-the-GPU-Operator) +- [Validating the Network Operator with GPUDirect RDMA](#Validating-the-Network-Operator-with-GPUDirect-RDMA) +- [Validating the GPU Operator](#Validating-the-GPU-Operator) +- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC) +- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator) +- [Uninstalling the Network Operator](#Uninstalling-the-Network-Operator) + +### Prerequisites + +The following instructions assume the following: + +- You have [NVIDIA-Certified Systems](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html) with Mellanox CX NICs for x86-64 servers +- You have [NVIDIA Qualified Systems](https://www.nvidia.com/en-us/data-center/data-center-gpus/qualified-system-catalog/?start=0&count=50&pageNumber=1&filters=eyJmaWx0ZXJzIjpbXSwic3ViRmlsdGVycyI6eyJwcm9jZXNzb3JUeXBlIjpbIkFSTS1UaHVuZGVyWDIiLCJBUk0tQWx0cmEiXX0sImNlcnRpZmllZEZpbHRlcnMiOnt9LCJwYXlsb2FkIjpbXX0=) for ARM servers + `NOTE:` For ARM systems, NVIDIA Network Operator is not supported yet. +- You will perform a clean install. + +To determine if your system qualifies as an NVIDIA Certified System, review the list of NVIDIA Certified Systems [here](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html). + +Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE). + + +### Installing the RHEL 8.8 Operating System +These instructions require installing RedHat Enterprise Linux 8.8, can be downloaded [here](https://access.redhat.com/downloads/content/479/ver=/rhel---8/8.8/x86_64/product-software). + +Please reference the [RHEL Server Installation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html-single/performing_a_standard_rhel_8_installation/index). 
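+
+Once the OS is installed, a quick check confirms that you are on the expected release before continuing; it should report a RHEL 8.8 build such as `Red Hat Enterprise Linux release 8.8 (Ootpa)`:
+
+```
+cat /etc/redhat-release
+```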
+ +### Changing the SELinux State + +Open the `/etc/selinux/config` file in a text editor of your choice, for example: + +``` +sudo vi /etc/selinux/config +``` + +Configure the `SELINUX=enforcing` option: +``` +# This file controls the state of SELinux on the system. +# SELINUX= can take one of these three values: +# enforcing - SELinux security policy is enforced. +# permissive - SELinux prints warnings instead of enforcing. +# disabled - No SELinux policy is loaded. +SELINUX=enforcing +# SELINUXTYPE= can take one of these two values: +# targeted - Targeted processes are protected, +# mls - Multi Level Security protection. +SELINUXTYPE=targeted +``` + +Save the change, and restart the system: + +``` +sudo reboot +``` + +After the system rebooted, run the below command to verify the status + +``` +sestatus +``` + +Expected output: + +``` +SELinux status: enabled +SELinuxfs mount: /sys/fs/selinux +SELinux root directory: /etc/selinux +Loaded policy name: targeted +Current mode: enforcing +Mode from config file: enforcing +Policy MLS status: enabled +Policy deny_unknown status: allowed +Memory protection checking: actual (secure) +Max kernel policy version: 31 +``` + +## Installing Container Runtime + +You need to install a container runtime into each node in the cluster so that Pods can run there. Currently Cloud Native Stack provides below container runtimes: + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Install required packages: + +``` +sudo dnf install -y yum-utils device-mapper-persistent-data lvm2 +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` +mkdir -p $HOME/.kube +``` + +``` +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +``` + +``` +sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. 
We are using calico as the pod-network add-on here: + +``` +kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` +kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.29.6 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. + +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. 
+
+Prerequisites:
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker nodes must not have the same node name.
+
+The get nodes command shows that the master and worker nodes are up and ready:
+
+```
+kubectl get nodes
+```
+
+Output:
+
+```
+NAME STATUS ROLES AGE VERSION
+#yourhost Ready control-plane 10m v1.29.6
+#yourhost-worker Ready <none> 10m v1.29.6
+```
+
+### Installing NVIDIA Network Operator
+
+`NOTE:` If Mellanox NICs are not connected to your nodes, please skip this step and proceed to the next step [Installing GPU Operator](#Installing-GPU-Operator)
+
+The below instructions assume that Mellanox NICs are connected to your machines.
+
+Execute the below command to verify Mellanox NICs are enabled on your machines:
+
+```
+ lspci | grep -i "Mellanox"
+```
+
+Output:
+```
+0c:00.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+0c:00.1 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+```
+Execute the below command to determine which Mellanox device is active:
+
+`NOTE:` Use whichever device shows as `Link Detected: yes` in further steps. The below command works only if you add the NICs before installing the Operating System.
+
+```
+for device in `sudo lshw -class network -short | grep -i ConnectX | awk '{print $2}' | egrep -v 'Device|path' | sed '/^$/d'`;do echo -n $device; sudo ethtool $device | grep -i "Link detected"; done
+```
+Output:
+```
+ens160f0 Link detected: yes
+ens160f1 Link detected: no
+```
+
+Create the custom network operator values.yaml file and update the active Mellanox device from the above command:
+
+```
+nano network-operator-values.yaml
+```
+
+```
+deployCR: true
+ofedDriver:
+  deploy: true
+rdmaSharedDevicePlugin:
+  deploy: true
+  resources:
+    - name: rdma_shared_device_a
+      vendors: [15b3]
+      devices: [ens160f0]
+```
+
+For more information about the custom network operator values.yaml, please refer to [Network Operator](https://docs.mellanox.com/display/COKAN10/Network+Operator#NetworkOperator-Example2:RDMADevicePluginConfiguration)
+
+Add the Mellanox repo:
+```
+ helm repo add mellanox https://mellanox.github.io/network-operator
+```
+
+Update the Helm repo:
+```
+ helm repo update
+```
+Install Network Operator:
+```
+ kubectl label nodes --all node-role.kubernetes.io/master- --overwrite
+ helm install --version 24.1.0 -f ./network-operator-values.yaml -n network-operator --create-namespace --wait network-operator mellanox/network-operator
+```
+#### Validating the State of the Network Operator
+
+Please note that the installation of the Network Operator can take a couple of minutes. How long the installation takes depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | egrep 'network-operator|nvidia-network-operator-resources'
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+network-operator network-operator-547cb8d999-mn2h9 1/1 Running 0 17m
+network-operator network-operator-node-feature-discovery-master-596fb8b7cb-qrmvv 1/1 Running 0 17m
+network-operator network-operator-node-feature-discovery-worker-qt5xt 1/1 Running 0 17m
+nvidia-network-operator-resources cni-plugins-ds-dl5vl 1/1 Running 0 17m
+nvidia-network-operator-resources kube-multus-ds-w82rv 1/1 Running 0 17m
+nvidia-network-operator-resources mofed-ubuntu20.04-ds-xfpzl 1/1 Running 0 17m
+nvidia-network-operator-resources rdma-shared-dp-ds-2hgb6 1/1 Running 0 17m
+nvidia-network-operator-resources sriov-device-plugin-ch7bz 1/1 Running 0 10m
+nvidia-network-operator-resources whereabouts-56ngr 1/1 Running 0 10m
+```
+
+Please refer to the [Network Operator page](https://docs.mellanox.com/display/COKAN10/Network+Operator) for more information.
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` If you installed the Network Operator, please skip the below command and follow [GPU Operator with RDMA](#GPU-Operator-with-RDMA)
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --wait --generate-name
+```
+
+#### GPU Operator with RDMA
+
+- Prerequisites:
+  - Please install the [Network Operator](#Installing-NVIDIA-Network-Operator) to ensure that the MOFED drivers are installed.
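+
+Before enabling RDMA, it can help to first confirm that the MOFED driver pod is running; a quick check, reusing the resources namespace queried later in this guide:
+
+```
+kubectl get pods -n nvidia-network-operator-resources | grep mofed
+```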
+
+After the Network Operator installation is complete, execute the below command to install the GPU Operator and load the nv_peer_mem modules:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true --wait --generate-name
+```
+
+#### GPU Operator with Host MOFED Driver and RDMA
+
+If the MOFED driver is already installed on the host without the Network Operator, execute the below command to install the GPU Operator and load the nv_peer_mem module:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true --wait --generate-name
+```
+
+#### GPU Operator with GPU Direct Storage (GDS)
+
+Execute the below command to enable the GPU Direct Storage driver in the GPU Operator:
+
+```
+helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set gds.enabled=true --generate-name
+```
+For more information, refer to [GPU Direct Storage](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-rdma.html)
+
+#### Validating the State of the GPU Operator:
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation takes depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+default gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq 1/1 Running 0 2m39s
+default gpu-operator-1622656274-node-feature-discovery-worker-wr88v 1/1 Running 0 2m39s
+default gpu-operator-7db468cfdf-mdrdp 1/1 Running 0 2m39s
+nvidia-gpu-operator gpu-feature-discovery-g425f 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-container-toolkit-daemonset-mcmxj 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-cuda-validator-s6x2p 0/1 Completed 0 48s
+nvidia-gpu-operator nvidia-dcgm-exporter-wtxnx 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-dcgm-jbz94 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-daemonset-hzzdt 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-validator-9nkxq 0/1 Completed 0 17s
+nvidia-gpu-operator nvidia-driver-daemonset-kt8g5 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-operator-validator-cw4j5 1/1 Running 0 2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For clusters with multiple worker nodes, execute the below command to restart the CoreDNS and Node Feature Discovery pods:
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG.
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
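+
+As a hedged sketch of how this is typically driven once the operator is running (the node name `node1` and the `all-1g.5gb` profile below are illustrative, not part of this guide), the MIG manager watches the `nvidia.com/mig.config` node label and repartitions the GPU to match:
+
+```
+# Illustrative only: request the all-1g.5gb MIG layout on a node named node1
+kubectl label nodes node1 nvidia.com/mig.config=all-1g.5gb --overwrite
+```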
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + +### Validating the Network Operator with GPUDirect RDMA + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Create network definition for IPAM and replace the `ens192f0` with an active Mellanox device for `master`: +``` +$ nano networkdefinition.yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + annotations: + k8s.v1.cni.cncf.io/resourceName: rdma/rdma_shared_device_a + name: rdma-net-ipam + namespace: default +spec: + config: |- + { + "cniVersion": "0.3.1", + "name": "rdma-net-ipam", + "plugins": [ + { + "ipam": { + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "log_file": "/tmp/whereabouts.log", + "log_level": "debug", + "range": "192.168.112.0/24", + "type": "whereabouts" + }, + "type": "macvlan", + "master": "ens192f0" + }, + { + "mtu": 1500, + "type": "tuning" + } + ] + } +EOF +``` +`NOTE:` If you do not have VLAN-based networking on the high-performance side, please set "vlan": 0 + + +Execute the below command to install network definition on NVIDIA Cloud Native Stack from the control-plane node: + + ``` +kubectl apply -f networkdefinition.yaml + ``` + +Now create the pod YAML with the below content: + +``` +cat < ../../devices/virtual/net/eth0 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 lo -> ../../devices/virtual/net/lo +lrwxrwxrwx 1 root root 0 Jun 1 02:26 net1 -> ../../devices/virtual/net/net1 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 tunl0 -> ../../devices/virtual/net/tunl0 +``` + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Update the above Mellanox NIC, for which status is `Up` in the below command: + +``` +kubectl exec -it rdma-test-pod-1 -- bash + +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 +************************************ +* Waiting for client to connect... 
* +************************************ +``` + +In a separate terminal, print the network address of the secondary interface on the `rdma-test-pod-1` pod: + +``` +$ kubectl exec rdma-test-pod-1 -- ip addr show dev net1 +5: net1@if24: mtu 9000 qdisc noqueue state UP group default + link/ether 62:51:fb:13:88:ce brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 192.168.111.1/24 brd 192.168.111.255 scope global net1 + valid_lft forever preferred_lft forever +``` + +Execute the below command with the above inet address to verify the nv_peer_memory performance on NVIDIA Cloud Native Stack: +``` +$ kubectl exec -it rdma-test-pod-2 -- bash +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 192.168.111.2 +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 4 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 + remote address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.080755 0.073090 4.568094 + 4 5000 0.16 0.15 4.588128 + 8 5000 0.31 0.29 4.567442 + 16 5000 0.66 0.59 4.647555 + 32 5000 1.35 1.22 4.776518 + 64 5000 2.50 2.29 4.481806 + 128 5000 5.34 4.73 4.621828 + 256 5000 10.53 9.11 4.448153 + 512 5000 21.03 17.05 4.162100 + 1024 5000 38.67 34.16 4.169397 + 2048 5000 47.11 43.50 2.655219 + 4096 5000 51.29 51.02 1.557094 + 8192 5000 52.00 51.98 0.793178 + 16384 5000 52.33 52.32 0.399164 + 32768 5000 52.47 52.47 0.200143 + 65536 5000 52.51 52.50 0.100143 + 131072 5000 52.51 52.51 0.050078 + 262144 5000 52.49 52.49 0.025029 + 524288 5000 52.50 52.50 0.012517 + 1048576 5000 52.51 52.51 0.006260 + 2097152 5000 52.51 52.51 0.003130 + 4194304 5000 52.51 52.51 0.001565 + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` + +``` +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 + +************************************ +* Waiting for client to connect... * +************************************ +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 8 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02
+ remote address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01
+---------------------------------------------------------------------------------------
+ #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps]
+ 8388608 5000 52.52 52.52 0.000783
+---------------------------------------------------------------------------------------
+```
+The benchmark achieved approximately 52 Gbps throughput.
+
+Exit from the RDMA test pods and then delete them with the below command:
+
+```
+$ kubectl delete pod rdma-test-pod-1 rdma-test-pod-2
+```
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < Open Network Stream > Entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output similar to the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` The video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+gpu-operator-1606173805 nvidia-gpu-operator 1 2024-03-20 20:23:28.063421701 +0000 UTC deployed gpu-operator-24.3.0 v24.3.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+```
+
+### Uninstalling the Network Operator
+
+Execute the below commands to uninstall the Network Operator:
+
+```
+$ helm ls -n network-operator
+NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+network-operator network-operator 1 2024-03-20 17:09:04.665593336 +0000 UTC deployed network-operator-24.1.0 v24.1.0
+
+$ helm del network-operator -n network-operator
+```
diff --git a/install-guides/RHEL-8-8_Server_x86-arm64_v13.1.md b/install-guides/RHEL-8-8_Server_x86-arm64_v13.1.md
new file mode 100644
index 0000000..22abf8b
--- /dev/null
+++ b/install-guides/RHEL-8-8_Server_x86-arm64_v13.1.md
@@ -0,0 +1,1177 @@
+# NVIDIA Cloud Native Stack v13.1 - Install Guide for RHEL Server
+## Introduction
+
+This document describes how to set up the NVIDIA Cloud Native Stack collection on a single or multiple NVIDIA Certified Systems. NVIDIA Cloud Native Stack can be configured to create a single-node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+
+NVIDIA Cloud Native Stack v13.1 includes:
+- RHEL 8.8
+- Containerd 1.7.20
+- Kubernetes version 1.30.2
+- Helm 3.15.3
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA GPU Driver: 550.90.07
+  - NVIDIA Container Toolkit: 1.16.1
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+- NVIDIA Network Operator 24.4.1
+  - Mellanox MOFED Driver 24.04-0.6.6.0-0
+  - RDMA Shared Device Plugin 1.4.0
+  - SRIOV Device Plugin 3.6.2
+  - Container Networking Plugins 1.3.0
+  - Multus 3.9.3
+  - Whereabouts 0.7.0
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the RHEL Operating System](#Installing-the-RHEL-Operating-System)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-an-Additional-Node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the Network Operator with GPUDirect RDMA](#Validating-the-Network-Operator-with-GPUDirect-RDMA)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+- [Uninstalling the Network Operator](#Uninstalling-the-Network-Operator)
+
+### Prerequisites
+
+These instructions make the following assumptions:
+
+- You have [NVIDIA-Certified Systems](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html) with Mellanox CX NICs for x86-64 servers
+- You have [NVIDIA Qualified Systems](https://www.nvidia.com/en-us/data-center/data-center-gpus/qualified-system-catalog/?start=0&count=50&pageNumber=1&filters=eyJmaWx0ZXJzIjpbXSwic3ViRmlsdGVycyI6eyJwcm9jZXNzb3JUeXBlIjpbIkFSTS1UaHVuZGVyWDIiLCJBUk0tQWx0cmEiXX0sImNlcnRpZmllZEZpbHRlcnMiOnt9LCJwYXlsb2FkIjpbXX0=) for ARM servers
+  `NOTE:` For ARM systems, NVIDIA Network Operator is not supported yet.
+- You will perform a clean install.
+
+To determine if your system qualifies as an NVIDIA Certified System, review the list of NVIDIA Certified Systems [here](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html).
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+
+### Installing the RHEL Operating System
+These instructions require installing Red Hat Enterprise Linux 8.8, which can be downloaded [here](https://access.redhat.com/downloads/content/479/ver=/rhel---8/8.8/x86_64/product-software).
+
+Please reference the [RHEL Server Installation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html-single/performing_a_standard_rhel_8_installation/index).
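+
+Before changing any configuration, it can be worth confirming that the node is on the expected release and stock kernel; a quick check with standard RHEL commands looks like this:
+
+```
+cat /etc/redhat-release
+uname -r
+```
+
+The first command should report Red Hat Enterprise Linux release 8.8, and the kernel should be the distribution's default `el8` build.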
+ +### Changing the SELinux State + +Open the `/etc/selinux/config` file in a text editor of your choice, for example: + +``` +sudo vi /etc/selinux/config +``` + +Configure the `SELINUX=enforcing` option: +``` +# This file controls the state of SELinux on the system. +# SELINUX= can take one of these three values: +# enforcing - SELinux security policy is enforced. +# permissive - SELinux prints warnings instead of enforcing. +# disabled - No SELinux policy is loaded. +SELINUX=enforcing +# SELINUXTYPE= can take one of these two values: +# targeted - Targeted processes are protected, +# mls - Multi Level Security protection. +SELINUXTYPE=targeted +``` + +Save the change, and restart the system: + +``` +sudo reboot +``` + +After the system rebooted, run the below command to verify the status + +``` +sestatus +``` + +Expected output: + +``` +SELinux status: enabled +SELinuxfs mount: /sys/fs/selinux +SELinux root directory: /etc/selinux +Loaded policy name: targeted +Current mode: enforcing +Mode from config file: enforcing +Policy MLS status: enabled +Policy deny_unknown status: allowed +Memory protection checking: actual (secure) +Max kernel policy version: 31 +``` + +## Installing Container Runtime + +You need to install a container runtime into each node in the cluster so that Pods can run there. Currently Cloud Native Stack provides below container runtimes: + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Install required packages: + +``` +sudo dnf install -y yum-utils device-mapper-persistent-data lvm2 +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` +mkdir -p $HOME/.kube +``` + +``` +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +``` + +``` +sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. 
We are using calico as the pod-network add-on here: + +``` +kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` +kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.30.0 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. + +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. 
+
+Prerequisites:
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker node should not have the same node name.
+
+The get nodes command shows that the control-plane and worker nodes are up and ready:
+
+```
+kubectl get nodes
+```
+
+Output:
+
+```
+NAME STATUS ROLES AGE VERSION
+#yourhost Ready control-plane 10m v1.30.0
+#yourhost-worker Ready <none> 10m v1.30.0
+```
+
+### Installing NVIDIA Network Operator
+
+`NOTE:` If Mellanox NICs are not connected to your nodes, please skip this step and proceed to the next step [Installing the GPU Operator](#Installing-the-GPU-Operator)
+
+The below instructions assume that Mellanox NICs are connected to your machines.
+
+Execute the below command to verify Mellanox NICs are enabled on your machines:
+
+```
+ lspci | grep -i "Mellanox"
+```
+
+Output:
+```
+0c:00.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+0c:00.1 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+```
+Execute the below command to determine which Mellanox device is active:
+
+`NOTE:` Use whichever device shows as `Link Detected: yes` in further steps. The below command works only if you add the NICs before installing the Operating System.
+
+```
+for device in `sudo lshw -class network -short | grep -i ConnectX | awk '{print $2}' | egrep -v 'Device|path' | sed '/^$/d'`;do echo -n $device; sudo ethtool $device | grep -i "Link detected"; done
+```
+Output:
+```
+ens160f0 Link detected: yes
+ens160f1 Link detected: no
+```
+
+Create the custom network operator values.yaml and update the active Mellanox device from the above command:
+```
+nano network-operator-values.yaml
+deployCR: true
+ofedDriver:
+  deploy: true
+rdmaSharedDevicePlugin:
+  deploy: true
+  resources:
+    - name: rdma_shared_device_a
+      vendors: [15b3]
+      devices: [ens160f0]
+```
+
+For more information about the custom network operator values.yaml, please refer to [Network Operator](https://docs.mellanox.com/display/COKAN10/Network+Operator#NetworkOperator-Example2:RDMADevicePluginConfiguration)
+
+Add the NVIDIA repo:
+```
+ helm repo add mellanox https://mellanox.github.io/network-operator
+```
+
+Update the Helm repo:
+```
+ helm repo update
+```
+Install Network Operator:
+```
+ kubectl label nodes --all node-role.kubernetes.io/master- --overwrite
+ helm install -f ./network-operator-values.yaml --version 24.1.0 -n network-operator --create-namespace --wait network-operator mellanox/network-operator
+```
+#### Validating the State of the Network Operator
+
+Please note that the installation of the Network Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | egrep 'network-operator|nvidia-network-operator-resources'
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+network-operator network-operator-547cb8d999-mn2h9 1/1 Running 0 17m
+network-operator network-operator-node-feature-discovery-master-596fb8b7cb-qrmvv 1/1 Running 0 17m
+network-operator network-operator-node-feature-discovery-worker-qt5xt 1/1 Running 0 17m
+nvidia-network-operator-resources cni-plugins-ds-dl5vl 1/1 Running 0 17m
+nvidia-network-operator-resources kube-multus-ds-w82rv 1/1 Running 0 17m
+nvidia-network-operator-resources mofed-ubuntu20.04-ds-xfpzl 1/1 Running 0 17m
+nvidia-network-operator-resources rdma-shared-dp-ds-2hgb6 1/1 Running 0 17m
+nvidia-network-operator-resources sriov-device-plugin-ch7bz 1/1 Running 0 10m
+nvidia-network-operator-resources whereabouts-56ngr 1/1 Running 0 10m
+```
+
+Please refer to the [Network Operator page](https://docs.mellanox.com/display/COKAN10/Network+Operator) for more information.
+
+### Installing the GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` If you installed the Network Operator, please skip the below command and follow [GPU Operator with RDMA](#GPU-Operator-with-RDMA)
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --wait --generate-name
+```
+
+#### GPU Operator with RDMA
+
+- Prerequisites:
+  - Please install the [Network Operator](#Installing-NVIDIA-Network-Operator) to ensure that the MOFED drivers are installed.
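+
+Before layering the GPU Operator on top, one way to confirm that the MOFED driver pods are in a `Running` state is the check below; the namespace matches the one used in the validation steps later in this guide:
+
+```
+kubectl get pods -n nvidia-network-operator-resources | grep mofed
+```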
+
+After the Network Operator installation is completed, execute the below command to install the GPU Operator and load the nv_peer_mem modules:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true --wait --generate-name
+```
+
+#### GPU Operator with Host MOFED Driver and RDMA
+
+If the MOFED driver is already installed on the host without the Network Operator, execute the below command to install the GPU Operator and load the nv_peer_mem module:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true --wait --generate-name
+```
+
+#### GPU Operator with GPU Direct Storage (GDS)
+
+Execute the below command to enable the GPU Direct Storage driver on the GPU Operator:
+
+```
+helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set gds.enabled=true --wait --generate-name
+```
+For more information, refer to [GPU Direct Storage](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-rdma.html)
+
+#### Validating the State of the GPU Operator
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+default gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq 1/1 Running 0 2m39s
+default gpu-operator-1622656274-node-feature-discovery-worker-wr88v 1/1 Running 0 2m39s
+default gpu-operator-7db468cfdf-mdrdp 1/1 Running 0 2m39s
+nvidia-gpu-operator gpu-feature-discovery-g425f 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-container-toolkit-daemonset-mcmxj 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-cuda-validator-s6x2p 0/1 Completed 0 48s
+nvidia-gpu-operator nvidia-dcgm-exporter-wtxnx 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-dcgm-jbz94 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-daemonset-hzzdt 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-validator-9nkxq 0/1 Completed 0 17s
+nvidia-gpu-operator nvidia-driver-daemonset-kt8g5 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-operator-validator-cw4j5 1/1 Running 0 2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery.
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
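+
+As a rough sketch of what this looks like in practice, the GPU Operator's MIG manager watches a node label for the requested layout; the profile below is one of the operator's default examples for an A100, and `#yourhost` stands in for your GPU node name:
+
+```
+kubectl label nodes #yourhost nvidia.com/mig.config=all-1g.5gb --overwrite
+```
+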
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + +### Validating the Network Operator with GPUDirect RDMA + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Create network definition for IPAM and replace the `ens192f0` with an active Mellanox device for `master`: +``` +$ nano networkdefinition.yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + annotations: + k8s.v1.cni.cncf.io/resourceName: rdma/rdma_shared_device_a + name: rdma-net-ipam + namespace: default +spec: + config: |- + { + "cniVersion": "0.3.1", + "name": "rdma-net-ipam", + "plugins": [ + { + "ipam": { + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "log_file": "/tmp/whereabouts.log", + "log_level": "debug", + "range": "192.168.112.0/24", + "type": "whereabouts" + }, + "type": "macvlan", + "master": "ens192f0" + }, + { + "mtu": 1500, + "type": "tuning" + } + ] + } +EOF +``` +`NOTE:` If you do not have VLAN-based networking on the high-performance side, please set "vlan": 0 + + +Execute the below command to install network definition on NVIDIA Cloud Native Stack from the control-plane node: + + ``` +kubectl apply -f networkdefinition.yaml + ``` + +Now create the pod YAML with the below content: + +``` +cat < ../../devices/virtual/net/eth0 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 lo -> ../../devices/virtual/net/lo +lrwxrwxrwx 1 root root 0 Jun 1 02:26 net1 -> ../../devices/virtual/net/net1 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 tunl0 -> ../../devices/virtual/net/tunl0 +``` + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Update the above Mellanox NIC, for which status is `Up` in the below command: + +``` +kubectl exec -it rdma-test-pod-1 -- bash + +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 +************************************ +* Waiting for client to connect... 
* +************************************ +``` + +In a separate terminal, print the network address of the secondary interface on the `rdma-test-pod-1` pod: + +``` +$ kubectl exec rdma-test-pod-1 -- ip addr show dev net1 +5: net1@if24: mtu 9000 qdisc noqueue state UP group default + link/ether 62:51:fb:13:88:ce brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 192.168.111.1/24 brd 192.168.111.255 scope global net1 + valid_lft forever preferred_lft forever +``` + +Execute the below command with the above inet address to verify the nv_peer_memory performance on NVIDIA Cloud Native Stack: +``` +$ kubectl exec -it rdma-test-pod-2 -- bash +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 192.168.111.2 +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 4 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 + remote address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.080755 0.073090 4.568094 + 4 5000 0.16 0.15 4.588128 + 8 5000 0.31 0.29 4.567442 + 16 5000 0.66 0.59 4.647555 + 32 5000 1.35 1.22 4.776518 + 64 5000 2.50 2.29 4.481806 + 128 5000 5.34 4.73 4.621828 + 256 5000 10.53 9.11 4.448153 + 512 5000 21.03 17.05 4.162100 + 1024 5000 38.67 34.16 4.169397 + 2048 5000 47.11 43.50 2.655219 + 4096 5000 51.29 51.02 1.557094 + 8192 5000 52.00 51.98 0.793178 + 16384 5000 52.33 52.32 0.399164 + 32768 5000 52.47 52.47 0.200143 + 65536 5000 52.51 52.50 0.100143 + 131072 5000 52.51 52.51 0.050078 + 262144 5000 52.49 52.49 0.025029 + 524288 5000 52.50 52.50 0.012517 + 1048576 5000 52.51 52.51 0.006260 + 2097152 5000 52.51 52.51 0.003130 + 4194304 5000 52.51 52.51 0.001565 + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` + +``` +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 + +************************************ +* Waiting for client to connect... * +************************************ +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 8 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 + remote address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` +The benchmark achieved approximately 52 Gbps throughput. + +Exit from RDMA test pods and then delete the RDMA test pods with the below command: + +``` +$ kubectl delete pod rdma-test-pod-1 rdma-test-pod-2 +``` + +### Validating the GPU Operator + +GPU Operator validates the through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both are completed successfully (see output from kubectl get pods --all-namespaces | grep -v kube-system), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of validating that the GPU is usable from within a pod to validate the manually. + +#### Example 1: nvidia-smi + +Execute the following: + +``` +cat < Open Network Stream > Entering the following URL: + +``` +rtsp://IPAddress of Node:31113/ds-test +``` + +You should see the video output like below with the AI model detecting objects. + +![Deepstream_Video](screenshots/Deepstream.png) + +`NOTE:` Video stream in VLC will change if you provide an input RTSP camera. + + +### Uninstalling the GPU Operator + +Execute the below commands to uninstall the GPU Operator: + +``` +$ helm ls +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpu-operator-1606173805 nvidia-gpu-operator 1 2024-03-20 20:23:28.063421701 +0000 UTC deployed gpu-operator-24.3.0 v23.3.2 + +$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator + +``` + +### Uninstalling the Network Operator + +Execute the below commands to uninstall the Network Operator: + +``` +$ helm ls -n network-operator +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +network-operator network-operator 1 2024-03-20 17:09:04.665593336 +0000 UTC deployed network-operator-24.1.0 v24.1.0 + +$ helm del network-operator -n network-operator +``` diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v11.0.md b/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v11.0.md index 135e974..68fd4e7 100644 --- a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v11.0.md +++ b/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v11.0.md @@ -126,13 +126,13 @@ Wed Mar 24 12:47:29 2023 Set up the repository and update the apt package index: ``` -$ sudo apt-get update +sudo apt update ``` Install packages to allow apt to use a repository over HTTPS: ``` -$ sudo apt-get install -y \ +sudo apt install -y \ apt-transport-https \ ca-certificates \ curl \ @@ -140,52 +140,51 @@ $ sudo apt-get install -y \ software-properties-common ``` +``` +sudo install -m 0755 -d /etc/apt/keyrings +``` + Add Docker's official GPG key: ``` -$ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc ``` -Verify that you now have the key with the fingerprint 9DC8 5822 9FC7 DD38 854A E2D8 8D81 803C 0EBF CD88 by searching 
for the last 8 characters of the fingerprint: ``` -$ sudo apt-key fingerprint 0EBFCD88 - -pub rsa4096 2017-02-22 [SCEA] - 9DC8 5822 9FC7 DD38 854A E2D8 8D81 803C 0EBF CD88 -uid [ unknown] Docker Release (CE deb) -sub rsa4096 2017-02-22 [S] -``` - +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` Use the following command to set up the stable repository: ``` -$ sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` Install Docker Engine - Community + Update the apt package index: ``` -$ sudo apt-get update +sudo apt update ``` Install Docker Engine: ``` -$ sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` Verify that Docker Engine - Community is installed correctly by running the hello-world image: ``` -$ sudo docker run hello-world +sudo docker run hello-world ``` More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. + #### Installing NVIDIA Container Toolkit Setup the pacakge repository @@ -222,7 +221,7 @@ Edit the docker daemon configuration to add the following line and save the file Example: ``` -$ sudo nano /etc/docker/daemon.json +sudo nano /etc/docker/daemon.json { "runtimes": { @@ -245,7 +244,7 @@ sudo systemctl daemon-reload && sudo systemctl restart docker Execute the below command to validate docker default runtime as NVIDIA: ``` -$ sudo docker info | grep -i runtime +sudo docker info | grep -i runtime ``` Output: @@ -786,7 +785,7 @@ Wed Apr 14 12:47:29 2023 Create a pod YAML file: ``` -$ cat < -sub rsa4096 2017-02-22 [S] -``` - +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` Use the following command to set up the stable repository: ``` -$ sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` Install Docker Engine - Community + Update the apt package index: ``` -$ sudo apt-get update +sudo apt update ``` Install Docker Engine: ``` -$ sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` Verify that Docker Engine - Community is installed correctly by running the hello-world image: ``` -$ sudo docker run hello-world +sudo docker run hello-world ``` More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. 
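+
+Optionally, if you want to run docker commands without sudo, adding your user to the docker group is a common post-install step (log out and back in afterwards for it to take effect):
+
+```
+sudo usermod -aG docker $USER
+```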
+ #### Installing NVIDIA Container Toolkit Setup the pacakge repository @@ -222,7 +221,7 @@ Edit the docker daemon configuration to add the following line and save the file Example: ``` -$ sudo nano /etc/docker/daemon.json +sudo nano /etc/docker/daemon.json { "runtimes": { @@ -245,7 +244,7 @@ sudo systemctl daemon-reload && sudo systemctl restart docker Execute the below command to validate docker default runtime as NVIDIA: ``` -$ sudo docker info | grep -i runtime +sudo docker info | grep -i runtime ``` Output: @@ -792,7 +791,7 @@ Wed Apr 14 12:47:29 2023 Create a pod YAML file: ``` -$ cat < -sub rsa4096 2017-02-22 [S] -``` - +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` Use the following command to set up the stable repository: ``` -$ sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` Install Docker Engine - Community + Update the apt package index: ``` -$ sudo apt-get update +sudo apt update ``` Install Docker Engine: ``` -$ sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` Verify that Docker Engine - Community is installed correctly by running the hello-world image: ``` -$ sudo docker run hello-world +sudo docker run hello-world ``` More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. + #### Installing NVIDIA Container Toolkit Setup the pacakge repository @@ -222,7 +221,7 @@ Edit the docker daemon configuration to add the following line and save the file Example: ``` -$ sudo nano /etc/docker/daemon.json +sudo nano /etc/docker/daemon.json { "runtimes": { @@ -245,7 +244,7 @@ sudo systemctl daemon-reload && sudo systemctl restart docker Execute the below command to validate docker default runtime as NVIDIA: ``` -$ sudo docker info | grep -i runtime +sudo docker info | grep -i runtime ``` Output: @@ -792,7 +791,7 @@ Wed Apr 14 12:47:29 2023 Create a pod YAML file: ``` -$ cat < /dev/null +``` + +Install Docker Engine - Community + +Update the apt package index: + +``` +sudo apt update +``` + +Install Docker Engine: + +``` +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +Verify that Docker Engine - Community is installed correctly by running the hello-world image: + +``` +sudo docker run hello-world +``` + +More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. + + +#### Installing NVIDIA Container Toolkit + +Setup the pacakge repository + +``` +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +``` + +Update the package index + +``` +sudo apt update +``` + +Install NVIDIA Conatiner Toolkit + +``` +sudo apt-get install -y nvidia-container-toolkit=1.15.0-1 +``` + + +### Update the Docker Default Runtime + + +Edit the docker daemon configuration to add the following line and save the file: + +``` +"default-runtime" : "nvidia" +``` + +Example: +``` +sudo nano /etc/docker/daemon.json + +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + }, + "default-runtime" : "nvidia" +} +``` + +Now execute the below commands to restart the docker daemon: +``` +sudo systemctl daemon-reload && sudo systemctl restart docker +``` + +#### Validate docker default runtime + +Execute the below command to validate docker default runtime as NVIDIA: + +``` +sudo docker info | grep -i runtime +``` + +Output: +``` +Runtimes: nvidia runc +Default Runtime: nvidia +``` + + + +## Installing Container Runtime + +You need to install a container runtime into each node in the cluster so that Pods can run there. Currently Cloud Native Stack provides below container runtimes + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Set up the repository and update the apt package index: + +``` +sudo apt-get update +``` + +Install packages to allow apt to use a repository over HTTPS: + +``` +sudo apt-get install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` + mkdir -p $HOME/.kube + sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config + sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. 
We are using calico as the pod-network add-on here: + +``` + kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` + kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` + kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane,master 10m v1.28.12 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. + +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. 
+
+Prerequisites:
+- [Installing Docker and NVIDIA Container Toolkit](#Installing-Docker-and-NVIDIA-Container-Toolkit)
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack:
+
+```
+ sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and worker node should not have the same node name.
+
+The get nodes command shows that the control-plane and worker nodes are up and ready:
+
+```
+ kubectl get nodes
+```
+
+Output:
+
+```
+NAME STATUS ROLES AGE VERSION
+#yourhost Ready control-plane,master 10m v1.28.12
+#yourhost-worker Ready <none> 10m v1.28.12
+```
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+ helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` As the NVIDIA Driver and NVIDIA Container Toolkit are preinstalled on this system, set `driver.enabled=false` and `toolkit.enabled=false` when installing the GPU Operator
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator --devel nvidia/gpu-operator --set driver.enabled=false,toolkit.enabled=false --wait --generate-name
+```
+
+#### Validating the State of the GPU Operator
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE NAME READY STATUS RESTARTS AGE
+default gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq 1/1 Running 0 2m39s
+default gpu-operator-1622656274-node-feature-discovery-worker-wr88v 1/1 Running 0 2m39s
+default gpu-operator-7db468cfdf-mdrdp 1/1 Running 0 2m39s
+nvidia-gpu-operator gpu-feature-discovery-g425f 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-cuda-validator-s6x2p 0/1 Completed 0 48s
+nvidia-gpu-operator nvidia-dcgm-exporter-wtxnx 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-dcgm-jbz94 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-daemonset-hzzdt 1/1 Running 0 2m20s
+nvidia-gpu-operator nvidia-device-plugin-validator-9nkxq 0/1 Completed 0 17s
+nvidia-gpu-operator nvidia-operator-validator-cw4j5 1/1 Running 0 2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery.
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
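+
+Once a MIG layout has been applied, one way to confirm it is to check the GPU resources the node advertises (`#yourhost` stands in for your GPU node name); depending on the MIG strategy in use, they appear either as plain `nvidia.com/gpu` or as `nvidia.com/mig-*` resources:
+
+```
+kubectl describe node #yourhost | grep -i "nvidia.com/"
+```
+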
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + + +### Validating the GPU Operator + +GPU Operator validates the through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both are completed successfully (see output from kubectl get pods --all-namespaces | grep -v kube-system), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of validating that the GPU is usable from within a pod to validate the manually. + +#### Example 1: nvidia-smi + +Execute the following: + +``` +cat < test-image.tgz; sudo ctr -n=k8s.io images import test-image.tgz +``` + +### Validate NVIDIA Cloud Native Stack with an Application from NGC +Another option to validate NVIDIA Cloud Native Stack is by running a demo application hosted on NGC. + +NGC is NVIDIA's GPU-optimized software hub. NGC provides a curated set of GPU-optimized software for AI, HPC, and visualization. The content provided by NVIDIA and third-party ISVs simplify building, customizing, and integrating GPU-optimized software into workflows, accelerating the time to solutions for users. + +Containers, pre-trained models, Helm charts for Kubernetes deployments, and industry-specific AI toolkits with software development kits (SDKs) are hosted on NGC. For more information about how to deploy an application that is hosted on NGC or the NGC Private Registry, please refer to this [NGC Registry Guide](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/NGC_Registry_Guide_v1.0.md). Visit the [public NGC documentation](https://docs.nvidia.com/ngc) for more information. + +The steps in this section use the publicly available DeepStream - Intelligent Video Analytics (IVA) demo application Helm Chart. The application can validate the full NVIDIA Cloud Native Stack and test the connectivity of NVIDIA Cloud Native Stack to remote sensors. DeepStream delivers real-time AI-based video and image understanding and multi-sensor processing on GPUs. For more information, please refer to the [Helm Chart](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo). + +There are two ways to configure the DeepStream - Intelligent Video Analytics Demo Application on your NVIDIA Cloud Native Stack + +- Using a camera +- Using the integrated video file (no camera required) + +#### Using a camera + +##### Prerequisites: +- RTSP Camera stream + +Go through the below steps to install the demo application: +``` +1. helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz --untar + +2. cd into the folder video-analytics-demo and update the file values.yaml + +3. Go to the section Cameras in the values.yaml file and add the address of your IP camera. Read the comments section on how it can be added. Single or multiple cameras can be added as shown below + +cameras: + camera1: rtsp://XXXX +``` + +Execute the following command to deploy the demo application: +``` +helm install video-analytics-demo --name-template iva +``` + +Once the Helm chart is deployed, access the application with the VLC player. See the instructions below. 
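+
+If you want to confirm that the release actually came up before opening VLC, a quick check along these lines works; the `iva` name matches the `--name-template` used above, and the grep assumes the demo's resources carry that release prefix:
+
+```
+helm status iva
+kubectl get pods,svc | grep -i iva
+```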
+ +#### Using the integrated video file (no camera) + +If you dont have a camera input, please execute the below commands to use the default video already integrated into the application: + +``` +helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz + +helm install video-analytics-demo-0.1.9.tgz --name-template iva +``` + +Once the helm chart is deployed, access the application with the VLC player as per the below instructions. +For more information about the demo application, please refer to the [application NGC page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo) + +#### Access from WebUI + +Use the below WebUI URL to access the video analytic demo application from the browser: +``` +http://IPAddress of Node:31115/ +``` + +#### Access from VLC + +Download VLC Player from https://www.videolan.org/vlc/ on the machine where you intend to view the video stream. + +View the video stream in VLC by navigating to Media > Open Network Stream > Entering the following URL: + +``` +rtsp://IPAddress of Node:31113/ds-test +``` + +You should see the video output like below with the AI model detecting objects. + +![Deepstream_Video](screenshots/Deepstream.png) + +`NOTE:` Video stream in VLC will change if you provide an input RTSP camera. + + +### Uninstalling the GPU Operator + +Execute the below commands to uninstall the GPU Operator: + +``` +helm ls +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpu-operator-1606173805 nvidia-gpu-operator 1 2023-04-14 20:23:28.063421701 +0000 UTC deployed gpu-operator-23.6.0 v23.3.2 + +helm del gpu-operator-1606173805 -n nvidia-gpu-operator +``` diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v12.0.md b/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v12.0.md index 1f041c6..2eb289d 100644 --- a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v12.0.md +++ b/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v12.0.md @@ -126,13 +126,13 @@ Wed Mar 24 12:47:29 2023 Set up the repository and update the apt package index: ``` -$ sudo apt update +sudo apt update ``` Install packages to allow apt to use a repository over HTTPS: ``` -$ sudo apt install -y \ +sudo apt install -y \ apt-transport-https \ ca-certificates \ curl \ @@ -140,48 +140,46 @@ $ sudo apt install -y \ software-properties-common ``` +``` +sudo install -m 0755 -d /etc/apt/keyrings +``` + Add Docker's official GPG key: ``` -$ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc ``` -Verify that you now have the key with the fingerprint 9DC8 5822 9FC7 DD38 854A E2D8 8D81 803C 0EBF CD88 by searching for the last 8 characters of the fingerprint: ``` -$ sudo apt-key fingerprint 0EBFCD88 - -pub rsa4096 2017-02-22 [SCEA] - 9DC8 5822 9FC7 DD38 854A E2D8 8D81 803C 0EBF CD88 -uid [ unknown] Docker Release (CE deb) -sub rsa4096 2017-02-22 [S] -``` - +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` Use the following command to set up the stable repository: ``` -$ sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` Install Docker Engine - Community + Update the apt package index: ``` -$ sudo apt update +sudo apt update ``` Install Docker Engine: ``` -$ sudo apt install -y docker-ce docker-ce-cli containerd.io +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` Verify that Docker Engine - Community is installed correctly by running the hello-world image: ``` -$ sudo docker run hello-world +sudo docker run hello-world ``` More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. @@ -222,7 +220,7 @@ Edit the docker daemon configuration to add the following line and save the file Example: ``` -$ sudo nano /etc/docker/daemon.json +sudo nano /etc/docker/daemon.json { "runtimes": { @@ -245,7 +243,7 @@ sudo systemctl daemon-reload && sudo systemctl restart docker Execute the below command to validate docker default runtime as NVIDIA: ``` -$ sudo docker info | grep -i runtime +sudo docker info | grep -i runtime ``` Output: @@ -792,7 +790,7 @@ Wed Apr 14 12:47:29 2023 Create a pod YAML file: ``` -$ cat < -sub rsa4096 2017-02-22 [S] -``` - +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` Use the following command to set up the stable repository: ``` -$ sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` Install Docker Engine - Community + Update the apt package index: ``` -$ sudo apt-get update +sudo apt update ``` Install Docker Engine: ``` -$ sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` Verify that Docker Engine - Community is installed correctly by running the hello-world image: ``` -$ sudo docker run hello-world +sudo docker run hello-world ``` More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. @@ -222,7 +220,7 @@ Edit the docker daemon configuration to add the following line and save the file Example: ``` -$ sudo nano /etc/docker/daemon.json +sudo nano /etc/docker/daemon.json { "runtimes": { @@ -245,7 +243,7 @@ sudo systemctl daemon-reload && sudo systemctl restart docker Execute the below command to validate docker default runtime as NVIDIA: ``` -$ sudo docker info | grep -i runtime +sudo docker info | grep -i runtime ``` Output: @@ -792,7 +790,7 @@ Wed Apr 14 12:47:29 2023 Create a pod YAML file: ``` -$ cat < /dev/null +``` + +Install Docker Engine - Community + +Update the apt package index: + +``` +sudo apt update +``` + +Install Docker Engine: + +``` +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +Verify that Docker Engine - Community is installed correctly by running the hello-world image: + +``` +sudo docker run hello-world +``` + +More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. + + +#### Installing NVIDIA Container Toolkit + +Setup the pacakge repository + +``` +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +``` + +Update the package index + +``` +sudo apt update +``` + +Install NVIDIA Conatiner Toolkit + +``` +sudo apt-get install -y nvidia-container-toolkit=1.15.0-1 +``` + + +### Update the Docker Default Runtime + + +Edit the docker daemon configuration to add the following line and save the file: + +``` +"default-runtime" : "nvidia" +``` + +Example: +``` +sudo nano /etc/docker/daemon.json + +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + }, + "default-runtime" : "nvidia" +} +``` + +Now execute the below commands to restart the docker daemon: +``` +sudo systemctl daemon-reload && sudo systemctl restart docker +``` + +#### Validate docker default runtime + +Execute the below command to validate docker default runtime as NVIDIA: + +``` +sudo docker info | grep -i runtime +``` + +Output: +``` +Runtimes: nvidia runc +Default Runtime: nvidia +``` + + + +## Installing Container Runtime + +You need to install a container runtime into each node in the cluster so that Pods can run there. Currently Cloud Native Stack provides below container runtimes + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Set up the repository and update the apt package index: + +``` +sudo apt-get update +``` + +Install packages to allow apt to use a repository over HTTPS: + +``` +sudo apt-get install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` + mkdir -p $HOME/.kube + sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config + sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. 
We are using calico as the pod-network add-on here: + +``` + kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` + kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` + kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane,master 10m v1.29.6 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. + +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. 
+ +Prerequisites: +- [Installing Docker and NVIDIA Container Toolkit](#Installing-Docker-and-NVIDIA-Container-Toolkit) +- [Installing Containerd](#Installing-Containerd) +- [Installing Kubernetes](#Installing-Kubernetes) +- [Disable swap](#Disable-swap) + +Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack: + +``` + sudo kubeadm token create --print-join-command +``` + +Output: +``` +example: +sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e +``` +`NOTE`: control-plane node and worker node should not have the same node name. + +The get nodes command shows that the master and worker nodes are up and ready: + +``` + kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane,master 10m v1.29.6 +#yourhost-worker Ready 10m v1.29.6 +``` + +### Installing GPU Operator + +Add the NVIDIA repo: + +``` + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +``` + +Update the Helm repo: + +``` + helm repo update +``` + +Install GPU Operator: + +`NOTE:` As we are preinstalled with NVIDIA Driver and NVIDIA Container Toolkit, we need to set as `false` when installing the GPU Operator + +``` + helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator --devel nvidia/gpu-operator --set driver.enabled=false,toolkit.enabled=false --wait --generate-name +``` + +#### Validating the State of the GPU Operator: + +Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed. + +``` +kubectl get pods --all-namespaces | grep -v kube-system +``` + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +default gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq 1/1 Running 0 2m39s +default gpu-operator-1622656274-node-feature-discovery-worker-wr88v 1/1 Running 0 2m39s +default gpu-operator-7db468cfdf-mdrdp 1/1 Running 0 2m39s +nvidia-gpu-operator gpu-feature-discovery-g425f 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-cuda-validator-s6x2p 0/1 Completed 0 48s +nvidia-gpu-operator nvidia-dcgm-exporter-wtxnx 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-dcgm-jbz94 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-device-plugin-daemonset-hzzdt 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-device-plugin-validator-9nkxq 0/1 Completed 0 17s +nvidia-gpu-operator nvidia-operator-validator-cw4j5 1/1 Running 0 2m20s + +``` + +Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information. + +For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery. + +``` +kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator +``` + +#### GPU Operator with MIG + +`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG + +Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications. 
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html)
+
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both are completed successfully (see output from kubectl get pods --all-namespaces | grep -v kube-system), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of validating that the GPU is usable from within a pod, so you can validate this manually.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < test-image.tgz; sudo ctr -n=k8s.io images import test-image.tgz
+```
+
+### Validate NVIDIA Cloud Native Stack with an Application from NGC
+Another option to validate NVIDIA Cloud Native Stack is by running a demo application hosted on NGC.
+
+NGC is NVIDIA's GPU-optimized software hub. NGC provides a curated set of GPU-optimized software for AI, HPC, and visualization. The content provided by NVIDIA and third-party ISVs simplifies building, customizing, and integrating GPU-optimized software into workflows, accelerating the time to solutions for users.
+
+Containers, pre-trained models, Helm charts for Kubernetes deployments, and industry-specific AI toolkits with software development kits (SDKs) are hosted on NGC. For more information about how to deploy an application that is hosted on NGC or the NGC Private Registry, please refer to this [NGC Registry Guide](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/NGC_Registry_Guide_v1.0.md). Visit the [public NGC documentation](https://docs.nvidia.com/ngc) for more information.
+
+The steps in this section use the publicly available DeepStream - Intelligent Video Analytics (IVA) demo application Helm Chart. The application can validate the full NVIDIA Cloud Native Stack and test the connectivity of NVIDIA Cloud Native Stack to remote sensors. DeepStream delivers real-time AI-based video and image understanding and multi-sensor processing on GPUs. For more information, please refer to the [Helm Chart](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo).
+
+There are two ways to configure the DeepStream - Intelligent Video Analytics Demo Application on your NVIDIA Cloud Native Stack:
+
+- Using a camera
+- Using the integrated video file (no camera required)
+
+#### Using a camera
+
+##### Prerequisites:
+- RTSP Camera stream
+
+Go through the below steps to install the demo application:
+```
+1. helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz --untar
+
+2. cd into the folder video-analytics-demo and update the file values.yaml
+
+3. Go to the section Cameras in the values.yaml file and add the address of your IP camera. Read the comments section on how it can be added. Single or multiple cameras can be added as shown below
+
+cameras:
+  camera1: rtsp://XXXX
+```
+
+Execute the following command to deploy the demo application:
+```
+helm install video-analytics-demo --name-template iva
+```
+
+Once the Helm chart is deployed, access the application with the VLC player. See the instructions below.
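+
+Before opening VLC, you can optionally confirm that the release came up (a minimal check; the release name `iva` comes from the `--name-template` flag used above, and pod and service names may differ on your cluster):
+
+```
+helm ls | grep iva
+kubectl get pods,svc | grep iva
+```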
+ +#### Using the integrated video file (no camera) + +If you dont have a camera input, please execute the below commands to use the default video already integrated into the application: + +``` +helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz + +helm install video-analytics-demo-0.1.9.tgz --name-template iva +``` + +Once the helm chart is deployed, access the application with the VLC player as per the below instructions. +For more information about the demo application, please refer to the [application NGC page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo) + +#### Access from WebUI + +Use the below WebUI URL to access the video analytic demo application from the browser: +``` +http://IPAddress of Node:31115/ +``` + +#### Access from VLC + +Download VLC Player from https://www.videolan.org/vlc/ on the machine where you intend to view the video stream. + +View the video stream in VLC by navigating to Media > Open Network Stream > Entering the following URL: + +``` +rtsp://IPAddress of Node:31113/ds-test +``` + +You should see the video output like below with the AI model detecting objects. + +![Deepstream_Video](screenshots/Deepstream.png) + +`NOTE:` Video stream in VLC will change if you provide an input RTSP camera. + + +### Uninstalling the GPU Operator + +Execute the below commands to uninstall the GPU Operator: + +``` +helm ls +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpu-operator-1606173805 nvidia-gpu-operator 1 2023-04-14 20:23:28.063421701 +0000 UTC deployed gpu-operator-23.6.0 v23.3.2 + +helm del gpu-operator-1606173805 -n nvidia-gpu-operator +``` diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v13.0.md b/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v13.0.md index 1dc6260..eb23392 100644 --- a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v13.0.md +++ b/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v13.0.md @@ -126,13 +126,13 @@ Wed Mar 24 12:47:29 2023 Set up the repository and update the apt package index: ``` -$ sudo apt update +sudo apt update ``` Install packages to allow apt to use a repository over HTTPS: ``` -$ sudo apt install -y \ +sudo apt install -y \ apt-transport-https \ ca-certificates \ curl \ @@ -140,48 +140,46 @@ $ sudo apt install -y \ software-properties-common ``` +``` +sudo install -m 0755 -d /etc/apt/keyrings +``` + Add Docker's official GPG key: ``` -$ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc ``` -Verify that you now have the key with the fingerprint 9DC8 5822 9FC7 DD38 854A E2D8 8D81 803C 0EBF CD88 by searching for the last 8 characters of the fingerprint: ``` -$ sudo apt-key fingerprint 0EBFCD88 - -pub rsa4096 2017-02-22 [SCEA] - 9DC8 5822 9FC7 DD38 854A E2D8 8D81 803C 0EBF CD88 -uid [ unknown] Docker Release (CE deb) -sub rsa4096 2017-02-22 [S] -``` - +sudo chmod a+r /etc/apt/keyrings/docker.asc +``` Use the following command to set up the stable repository: ``` -$ sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` Install Docker Engine - Community + Update the apt package index: ``` -$ sudo apt update +sudo apt update ``` Install Docker Engine: ``` -$ sudo apt install -y docker-ce docker-ce-cli containerd.io +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin ``` Verify that Docker Engine - Community is installed correctly by running the hello-world image: ``` -$ sudo docker run hello-world +sudo docker run hello-world ``` More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. @@ -222,7 +220,7 @@ Edit the docker daemon configuration to add the following line and save the file Example: ``` -$ sudo nano /etc/docker/daemon.json +sudo nano /etc/docker/daemon.json { "runtimes": { @@ -245,7 +243,7 @@ sudo systemctl daemon-reload && sudo systemctl restart docker Execute the below command to validate docker default runtime as NVIDIA: ``` -$ sudo docker info | grep -i runtime +sudo docker info | grep -i runtime ``` Output: @@ -468,7 +466,7 @@ Now execute the below to install kubelet, kubeadm, and kubectl: sudo apt update ``` ``` - sudo apt install -y -q kubelet=1.30.0-00 kubectl=1.30.0-00 kubeadm=1.30.0-00 + sudo apt install -y -q kubelet=1.30.0-1.1 kubectl=1.30.0-1.1 kubeadm=1.30.0-1.1 ``` ``` sudo apt-mark hold kubelet kubeadm kubectl @@ -791,7 +789,7 @@ Wed Apr 14 12:47:29 2023 Create a pod YAML file: ``` -$ cat < /dev/null +``` + +Install Docker Engine - Community + +Update the apt package index: + +``` +sudo apt update +``` + +Install Docker Engine: + +``` +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +Verify that Docker Engine - Community is installed correctly by running the hello-world image: + +``` +sudo docker run hello-world +``` + +More information on how to install Docker can be found at https://docs.docker.com/install/linux/docker-ce/ubuntu/. + +#### Installing NVIDIA Container Toolkit + +Setup the pacakge repository + +``` +distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) \
+ && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+ && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
+ sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+ sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+```
+
+Update the package index
+
+```
+sudo apt update
+```
+
+Install NVIDIA Container Toolkit
+
+```
+sudo apt install -y nvidia-container-toolkit=1.15.0-1
+```
+
+
+### Update the Docker Default Runtime
+
+
+Edit the docker daemon configuration to add the following line and save the file:
+
+```
+"default-runtime" : "nvidia"
+```
+
+Example:
+```
+sudo nano /etc/docker/daemon.json
+
+{
+   "runtimes": {
+      "nvidia": {
+          "path": "nvidia-container-runtime",
+         "runtimeArgs": []
+      }
+   },
+   "default-runtime" : "nvidia"
+}
+```
+
+Now execute the below commands to restart the docker daemon:
+```
+sudo systemctl daemon-reload && sudo systemctl restart docker
+```
+
+#### Validate docker default runtime
+
+Execute the below command to validate that the docker default runtime is NVIDIA:
+
+```
+sudo docker info | grep -i runtime
+```
+
+Output:
+```
+Runtimes: nvidia runc
+Default Runtime: nvidia
+```
+
+
+
+## Installing Container Runtime
+
+You need to install a container runtime into each node in the cluster so that Pods can run there. Currently, Cloud Native Stack provides the below container runtimes:
+
+- [Installing Containerd](#Installing-Containerd)
+- [Installing CRI-O](#Installing-CRI-O)
+
+`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both!
+
+These steps apply to both runtimes.
+
+Set up the repository and update the apt package index:
+
+```
+sudo apt update
+```
+
+Install packages to allow apt to use a repository over HTTPS:
+
+```
+sudo apt install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common
+```
+
+Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes:
+
+```
+cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \
+ --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34
+```
+
+
+Following the instructions in the output, execute the commands as shown below:
+
+```
+ mkdir -p $HOME/.kube
+ sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
+ sudo chown $(id -u):$(id -g) $HOME/.kube/config
+```
+
+With the following command, you install a pod-network add-on to the control plane node.
We are using calico as the pod-network add-on here: + +``` + kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` + kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` + kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane,master 10m v1.30.0 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. + +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. 
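+
+Before running the join step later in this section, it can help to verify on each additional node that swap is off and the required kernel modules are loaded (an optional check; `swapon --show` should print nothing when swap is disabled):
+
+```
+sudo swapon --show
+lsmod | grep -e br_netfilter -e overlay
+```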
+ +Prerequisites: +- [Installing Docker and NVIDIA Container Toolkit](#Installing-Docker-and-NVIDIA-Container-Toolkit) +- [Installing Containerd](#Installing-Containerd) +- [Installing Kubernetes](#Installing-Kubernetes) +- [Disable swap](#Disable-swap) + +Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack: + +``` + sudo kubeadm token create --print-join-command +``` + +Output: +``` +example: +sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e +``` +`NOTE`: control-plane node and worker node should not have the same node name. + +The get nodes command shows that the master and worker nodes are up and ready: + +``` + kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane,master 10m v1.30.0 +#yourhost-worker Ready 10m v1.30.0 +``` + +### Installing GPU Operator + +Add the NVIDIA repo: + +``` + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +``` + +Update the Helm repo: + +``` + helm repo update +``` + +Install GPU Operator: + +`NOTE:` As we are preinstalled with NVIDIA Driver and NVIDIA Container Toolkit, we need to set as `false` when installing the GPU Operator + +``` + helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator --devel nvidia/gpu-operator --set driver.enabled=false,toolkit.enabled=false --wait --generate-name +``` + +#### Validating the State of the GPU Operator: + +Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed. + +``` +kubectl get pods --all-namespaces | grep -v kube-system +``` + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +default gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq 1/1 Running 0 2m39s +default gpu-operator-1622656274-node-feature-discovery-worker-wr88v 1/1 Running 0 2m39s +default gpu-operator-7db468cfdf-mdrdp 1/1 Running 0 2m39s +nvidia-gpu-operator gpu-feature-discovery-g425f 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-cuda-validator-s6x2p 0/1 Completed 0 48s +nvidia-gpu-operator nvidia-dcgm-exporter-wtxnx 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-dcgm-jbz94 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-device-plugin-daemonset-hzzdt 1/1 Running 0 2m20s +nvidia-gpu-operator nvidia-device-plugin-validator-9nkxq 0/1 Completed 0 17s +nvidia-gpu-operator nvidia-operator-validator-cw4j5 1/1 Running 0 2m20s + +``` + +Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information. + +For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery. + +``` +kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator +``` + +#### GPU Operator with MIG + +`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG + +Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications. 
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + + +### Validating the GPU Operator + +GPU Operator validates the through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both are completed successfully (see output from kubectl get pods --all-namespaces | grep -v kube-system), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of validating that the GPU is usable from within a pod to validate the manually. + +#### Example 1: nvidia-smi + +Execute the following: + +``` +cat < test-image.tgz; sudo ctr -n=k8s.io images import test-image.tgz +``` + +### Validate NVIDIA Cloud Native Stack with an Application from NGC +Another option to validate NVIDIA Cloud Native Stack is by running a demo application hosted on NGC. + +NGC is NVIDIA's GPU-optimized software hub. NGC provides a curated set of GPU-optimized software for AI, HPC, and visualization. The content provided by NVIDIA and third-party ISVs simplify building, customizing, and integrating GPU-optimized software into workflows, accelerating the time to solutions for users. + +Containers, pre-trained models, Helm charts for Kubernetes deployments, and industry-specific AI toolkits with software development kits (SDKs) are hosted on NGC. For more information about how to deploy an application that is hosted on NGC or the NGC Private Registry, please refer to this [NGC Registry Guide](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/NGC_Registry_Guide_v1.0.md). Visit the [public NGC documentation](https://docs.nvidia.com/ngc) for more information. + +The steps in this section use the publicly available DeepStream - Intelligent Video Analytics (IVA) demo application Helm Chart. The application can validate the full NVIDIA Cloud Native Stack and test the connectivity of NVIDIA Cloud Native Stack to remote sensors. DeepStream delivers real-time AI-based video and image understanding and multi-sensor processing on GPUs. For more information, please refer to the [Helm Chart](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo). + +There are two ways to configure the DeepStream - Intelligent Video Analytics Demo Application on your NVIDIA Cloud Native Stack + +- Using a camera +- Using the integrated video file (no camera required) + +#### Using a camera + +##### Prerequisites: +- RTSP Camera stream + +Go through the below steps to install the demo application: +``` +1. helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz --untar + +2. cd into the folder video-analytics-demo and update the file values.yaml + +3. Go to the section Cameras in the values.yaml file and add the address of your IP camera. Read the comments section on how it can be added. Single or multiple cameras can be added as shown below + +cameras: + camera1: rtsp://XXXX +``` + +Execute the following command to deploy the demo application: +``` +helm install video-analytics-demo --name-template iva +``` + +Once the Helm chart is deployed, access the application with the VLC player. See the instructions below. 
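+
+To check on the deployment before opening VLC, `helm status` summarizes the release (an optional check; `iva` is the release name set via `--name-template` above):
+
+```
+helm status iva
+```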
+ +#### Using the integrated video file (no camera) + +If you dont have a camera input, please execute the below commands to use the default video already integrated into the application: + +``` +$ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/video-analytics-demo-0.1.9.tgz + +$ helm install video-analytics-demo-0.1.9.tgz --name-template iva +``` + +Once the helm chart is deployed, access the application with the VLC player as per the below instructions. +For more information about the demo application, please refer to the [application NGC page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:video-analytics-demo) + +#### Access from WebUI + +Use the below WebUI URL to access the video analytic demo application from the browser: +``` +http://IPAddress of Node:31115/ +``` + +#### Access from VLC + +Download VLC Player from https://www.videolan.org/vlc/ on the machine where you intend to view the video stream. + +View the video stream in VLC by navigating to Media > Open Network Stream > Entering the following URL: + +``` +rtsp://IPAddress of Node:31113/ds-test +``` + +You should see the video output like below with the AI model detecting objects. + +![Deepstream_Video](screenshots/Deepstream.png) + +`NOTE:` Video stream in VLC will change if you provide an input RTSP camera. + + +### Uninstalling the GPU Operator + +Execute the below commands to uninstall the GPU Operator: + +``` +$ helm ls +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpu-operator-1606173805 nvidia-gpu-operator 1 2023-04-14 20:23:28.063421701 +0000 UTC deployed gpu-operator-23.6.0 v23.3.2 + +$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator +``` diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.3.md b/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.3.md new file mode 100644 index 0000000..cebc589 --- /dev/null +++ b/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.3.md @@ -0,0 +1,1128 @@ +# NVIDIA Cloud Native Stack v11.3 - Install Guide for Ubuntu Server +## Introduction + +This document describes how to setup the NVIDIA Cloud Native Stack collection on a single or multiple NVIDIA Certified Systems. NVIDIA Cloud Native Stack can be configured to create a single node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster. 
+
+NVIDIA Cloud Native Stack v11.3 includes:
+- Ubuntu 22.04 LTS
+- Containerd 1.7.20
+- Kubernetes version 1.28.12
+- Helm 3.15.3
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA GPU Driver: 550.90.07
+  - NVIDIA Container Toolkit: 1.16.1
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+- NVIDIA Network Operator 24.4.1
+  - Mellanox MOFED Driver 24.04-0.6.6.0-0
+  - RDMA Shared Device Plugin 1.4.0
+  - SRIOV Device Plugin 3.6.2
+  - Container Networking Plugins 1.3.0
+  - Multus 3.9.3
+  - Whereabouts 0.7.0
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the Ubuntu Operating System](#Installing-the-Ubuntu-Operating-System)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the Network Operator with GPUDirect RDMA](#Validating-the-Network-Operator-with-GPUDirect-RDMA)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+- [Uninstalling the Network Operator](#Uninstalling-the-Network-Operator)
+
+### Prerequisites
+
+The following instructions assume the following:
+
+- You have [NVIDIA-Certified Systems](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html) with Mellanox CX NICs for x86-64 servers
+- You have [NVIDIA Qualified Systems](https://www.nvidia.com/en-us/data-center/data-center-gpus/qualified-system-catalog/?start=0&count=50&pageNumber=1&filters=eyJmaWx0ZXJzIjpbXSwic3ViRmlsdGVycyI6eyJwcm9jZXNzb3JUeXBlIjpbIkFSTS1UaHVuZGVyWDIiLCJBUk0tQWx0cmEiXX0sImNlcnRpZmllZEZpbHRlcnMiOnt9LCJwYXlsb2FkIjpbXX0=) for ARM servers
+  `NOTE:` For ARM systems, NVIDIA Network Operator is not supported yet.
+- You will perform a clean install.
+
+To determine if your system qualifies as an NVIDIA Certified System, review the list of NVIDIA Certified Systems [here](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html).
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+
+### Installing the Ubuntu Operating System
+These instructions require installing Ubuntu Server LTS 22.04. Ubuntu Server can be downloaded [here](http://cdimage.ubuntu.com/releases/20.04.4/release/).
+
+Please reference the [Ubuntu Server Installation Guide](https://ubuntu.com/tutorials/tutorial-install-ubuntu-server#1-overview).
+
+## Installing Container Runtime
+
+You need to install a container runtime into each node in the cluster so that Pods can run there.
Currently Cloud Native Stack provides below container runtimes: + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Set up the repository and update the apt package index: + +``` +sudo apt update +``` + +Install packages to allow apt to use a repository over HTTPS: + +``` +sudo apt install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` +mkdir -p $HOME/.kube +``` + +``` +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +``` + +``` +sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. We are using calico as the pod-network add-on here: + +``` +kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` +kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.28.12 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. 
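+
+You can optionally confirm that the taint was removed and the control-plane node is schedulable (`Taints: <none>` indicates pods can be scheduled on it):
+
+```
+kubectl describe node | grep -i taints
+```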
+ +### Installing Helm + +Execute the following command to download and install Helm 3.15.3 for `x86-64` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-amd64.tar.gz + ``` + + ``` +sudo mv linux-amd64/helm /usr/local/bin/helm + ``` + + ``` +rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/ +``` + +Download and install Helm 3.15.3 for `ARM` system: + +``` +wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz +``` + +``` +tar -zxvf helm-v3.15.3-linux-arm64.tar.gz + ``` + +``` +sudo mv linux-arm64/helm /usr/local/bin/helm +``` + +``` +rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/ +``` + +Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information. + + +### Adding an Additional Node to NVIDIA Cloud Native Stack + +`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator) + +Make sure to install the Containerd and Kubernetes packages on additional nodes. + +Prerequisites: +- [Installing Containerd](#Installing-Containerd) +- [Installing Kubernetes](#Installing-Kubernetes) +- [Disable swap](#Disable-swap) + +Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node and then execute the join command output on an additional node to add the additional node to NVIDIA Cloud Native Stack: + +``` +sudo kubeadm token create --print-join-command +``` + +Output: +``` +example: +sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e +``` +`NOTE`: control-plane node and worker node should not have the same node name. + +The get nodes command shows that the master and worker nodes are up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.28.12 +#yourhost-worker Ready 10m v1.28.12 +``` + +### Installing NVIDIA Network Operator + +`NOTE:` If Mellanox NICs are not connected to your nodes, please skip this step and proceed to the next step [Installing GPU Operator](#Installing-GPU-Operator) + +The below instructions assume that Mellanox NICs are connected to your machines. + +Execute the below command to verify Mellanox NICs are enabled on your machines: + +``` +lspci | grep -i "Mellanox" +``` + +Output: +``` +0c:00.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx] +0c:00.1 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx] +``` +Execute the below command to determine which Mellanox device is active: + +`NOTE:` Use whicever device shows as `Link Detected: yes` in further steps. The below command works only if you add the NICs before installing the Operating System. 
+
+```
+for device in `sudo lshw -class network -short | grep -i ConnectX | awk '{print $2}' | egrep -v 'Device|path' | sed '/^$/d'`;do echo -n $device; sudo ethtool $device | grep -i "Link detected"; done
+```
+Output:
+```
+ens160f0 Link detected: yes
+ens160f1 Link detected: no
+```
+
+Create the custom network operator values.yaml and update the active Mellanox device from the above command:
+```
+nano network-operator-values.yaml
+deployCR: true
+ofedDriver:
+  deploy: true
+rdmaSharedDevicePlugin:
+  deploy: true
+  resources:
+    - name: rdma_shared_device_a
+      vendors: [15b3]
+      devices: [ens160f0]
+```
+
+For more information about the custom network operator values.yaml, please refer to [Network Operator](https://docs.mellanox.com/display/COKAN10/Network+Operator#NetworkOperator-Example2:RDMADevicePluginConfiguration)
+
+Add the Mellanox repo:
+```
+helm repo add mellanox https://mellanox.github.io/network-operator
+```
+
+Update the Helm repo:
+```
+ helm repo update
+```
+Install Network Operator:
+```
+kubectl label nodes --all node-role.kubernetes.io/master- --overwrite
+```
+
+```
+helm install --version 24.1.0 -f ./network-operator-values.yaml -n network-operator --create-namespace --wait network-operator mellanox/network-operator
+```
+#### Validating the State of the Network Operator
+
+Please note that the installation of the Network Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | egrep 'network-operator|nvidia-network-operator-resources'
+```
+
+```
+NAMESPACE                           NAME                                                              READY   STATUS    RESTARTS   AGE
+network-operator                    network-operator-547cb8d999-mn2h9                                 1/1     Running   0          17m
+network-operator                    network-operator-node-feature-discovery-master-596fb8b7cb-qrmvv   1/1     Running   0          17m
+network-operator                    network-operator-node-feature-discovery-worker-qt5xt              1/1     Running   0          17m
+nvidia-network-operator-resources   cni-plugins-ds-dl5vl                                              1/1     Running   0          17m
+nvidia-network-operator-resources   kube-multus-ds-w82rv                                              1/1     Running   0          17m
+nvidia-network-operator-resources   mofed-ubuntu20.04-ds-xfpzl                                        1/1     Running   0          17m
+nvidia-network-operator-resources   rdma-shared-dp-ds-2hgb6                                           1/1     Running   0          17m
+nvidia-network-operator-resources   sriov-device-plugin-ch7bz                                         1/1     Running   0          10m
+nvidia-network-operator-resources   whereabouts-56ngr                                                 1/1     Running   0          10m
+```
+
+Please refer to the [Network Operator page](https://docs.mellanox.com/display/COKAN10/Network+Operator) for more information.
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` If you installed Network Operator, please skip the below command and follow the [GPU Operator with RDMA](#GPU-Operator-with-RDMA)
+
+```
+helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.version=535.104.05 --wait --generate-name
+```
+
+#### GPU Operator with RDMA
+
+- Prerequisites:
+  - Please install the [Network Operator](#Installing-NVIDIA-Network-Operator) to ensure that the MOFED drivers are installed.
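+
+One way to confirm the MOFED driver pod is ready before proceeding (an optional check; the namespace matches the Network Operator output shown above):
+
+```
+kubectl get pods -n nvidia-network-operator-resources | grep mofed
+```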
+
+After Network Operator installation is completed, execute the below command to install the GPU Operator to load the nv_peer_mem modules:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true --wait --generate-name
+```
+
+#### GPU Operator with Host MOFED Driver and RDMA
+
+If the host already has the MOFED driver installed without the Network Operator, execute the below command to install the GPU Operator so it loads the nv_peer_mem module:
+
+```
+ helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true --wait --generate-name
+
+```
+
+### GPU Operator with GPU Direct Storage (GDS)
+
+Execute the below command to enable the GPU Direct Storage Driver on the GPU Operator:
+
+```
+helm install --version 24.3.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set gds.enabled=true --generate-name
+```
+For more information, refer to [GPU Direct Storage](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-rdma.html)
+
+#### Validating the State of the GPU Operator:
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE             NAME                                                              READY   STATUS      RESTARTS   AGE
+default               gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq   1/1     Running     0          2m39s
+default               gpu-operator-1622656274-node-feature-discovery-worker-wr88v       1/1     Running     0          2m39s
+default               gpu-operator-7db468cfdf-mdrdp                                     1/1     Running     0          2m39s
+nvidia-gpu-operator   gpu-feature-discovery-g425f                                       1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-container-toolkit-daemonset-mcmxj                          1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-cuda-validator-s6x2p                                       0/1     Completed   0          48s
+nvidia-gpu-operator   nvidia-dcgm-exporter-wtxnx                                        1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-dcgm-jbz94                                                 1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-daemonset-hzzdt                              1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-validator-9nkxq                              0/1     Completed   0          17s
+nvidia-gpu-operator   nvidia-driver-daemonset-kt8g5                                     1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-operator-validator-cw4j5                                   1/1     Running     0          2m20s
+
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to fix the CoreDNS and Node Feature Discovery.
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + +### Validating the Network Operator with GPUDirect RDMA + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Create network definition for IPAM and replace the `ens192f0` with an active Mellanox device for `master`: +``` +$ nano networkdefinition.yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + annotations: + k8s.v1.cni.cncf.io/resourceName: rdma/rdma_shared_device_a + name: rdma-net-ipam + namespace: default +spec: + config: |- + { + "cniVersion": "0.3.1", + "name": "rdma-net-ipam", + "plugins": [ + { + "ipam": { + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "log_file": "/tmp/whereabouts.log", + "log_level": "debug", + "range": "192.168.111.1/24", + "type": "whereabouts" + }, + "type": "macvlan", + "master": "ens192f0" + }, + { + "mtu": 1500, + "type": "tuning" + } + ] + } +EOF +``` +`NOTE:` If you do not have VLAN-based networking on the high-performance side, please set "vlan": 0 + + +Execute the below command to install network definition on NVIDIA Cloud Native Stack from the control-plane node: + + ``` +kubectl apply -f networkdefinition.yaml + ``` + +Now create the pod YAML with the below content: + +``` +cat < ../../devices/virtual/net/eth0 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 lo -> ../../devices/virtual/net/lo +lrwxrwxrwx 1 root root 0 Jun 1 02:26 net1 -> ../../devices/virtual/net/net1 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 tunl0 -> ../../devices/virtual/net/tunl0 +``` + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Update the above Mellanox NIC, for which status is `Up` in the below command: + +``` +kubectl exec -it rdma-test-pod-1 -- bash + +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 +************************************ +* Waiting for client to connect... 
* +************************************ +``` + +In a separate terminal, print the network address of the secondary interface on the `rdma-test-pod-1` pod: + +``` +$ kubectl exec rdma-test-pod-1 -- ip addr show dev net1 +5: net1@if24: mtu 9000 qdisc noqueue state UP group default + link/ether 62:51:fb:13:88:ce brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 192.168.111.1/24 brd 192.168.111.255 scope global net1 + valid_lft forever preferred_lft forever +``` + +Execute the below command with the above inet address to verify the nv_peer_memory performance on NVIDIA Cloud Native Stack: +``` +$ kubectl exec -it rdma-test-pod-2 -- bash +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 192.168.111.2 +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 4 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 + remote address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.080755 0.073090 4.568094 + 4 5000 0.16 0.15 4.588128 + 8 5000 0.31 0.29 4.567442 + 16 5000 0.66 0.59 4.647555 + 32 5000 1.35 1.22 4.776518 + 64 5000 2.50 2.29 4.481806 + 128 5000 5.34 4.73 4.621828 + 256 5000 10.53 9.11 4.448153 + 512 5000 21.03 17.05 4.162100 + 1024 5000 38.67 34.16 4.169397 + 2048 5000 47.11 43.50 2.655219 + 4096 5000 51.29 51.02 1.557094 + 8192 5000 52.00 51.98 0.793178 + 16384 5000 52.33 52.32 0.399164 + 32768 5000 52.47 52.47 0.200143 + 65536 5000 52.51 52.50 0.100143 + 131072 5000 52.51 52.51 0.050078 + 262144 5000 52.49 52.49 0.025029 + 524288 5000 52.50 52.50 0.012517 + 1048576 5000 52.51 52.51 0.006260 + 2097152 5000 52.51 52.51 0.003130 + 4194304 5000 52.51 52.51 0.001565 + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` + +``` +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 + +************************************ +* Waiting for client to connect... * +************************************ +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 8 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 + remote address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` +The benchmark achieved approximately 52 Gbps throughput. + +Exit from RDMA test pods and then delete the RDMA test pods with the below command: + +``` +$ kubectl delete pod rdma-test-pod-1 rdma-test-pod-2 +``` + +### Validating the GPU Operator + +GPU Operator validates the through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both are completed successfully (see output from kubectl get pods --all-namespaces | grep -v kube-system), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of validating that the GPU is usable from within a pod to validate the manually. + +#### Example 1: nvidia-smi + +Execute the following: + +``` +cat < Open Network Stream > Entering the following URL: + +``` +rtsp://IPAddress of Node:31113/ds-test +``` + +You should see the video output like below with the AI model detecting objects. + +![Deepstream_Video](screenshots/Deepstream.png) + +`NOTE:` Video stream in VLC will change if you provide an input RTSP camera. + + +### Uninstalling the GPU Operator + +Execute the below commands to uninstall the GPU Operator: + +``` +$ helm ls +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpu-operator-1606173805 nvidia-gpu-operator 1 2024-03-20 20:23:28.063421701 +0000 UTC deployed gpu-operator-24.3.0 24.3.0 + +$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator + +``` + +### Uninstalling the Network Operator + +Execute the below commands to uninstall the Network Operator: + +``` +$ helm ls -n network-operator +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +network-operator network-operator 1 2024-03-20 17:09:04.665593336 +0000 UTC deployed network-operator-24.1.0 v24.1.0 + +$ helm del network-operator -n network-operator +``` diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.2.md b/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.2.md new file mode 100644 index 0000000..073906a --- /dev/null +++ b/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.2.md @@ -0,0 +1,1128 @@ +# NVIDIA Cloud Native Stack v12.2 - Install Guide for Ubuntu Server +## Introduction + +This document describes how to setup the NVIDIA Cloud Native Stack collection on a single or multiple NVIDIA Certified Systems. NVIDIA Cloud Native Stack can be configured to create a single node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster. 
+
+NVIDIA Cloud Native Stack v12.2 includes:
+- Ubuntu 22.04 LTS
+- Containerd 1.7.20
+- Kubernetes version 1.29.6
+- Helm 3.15.3
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA GPU Driver: 550.90.07
+  - NVIDIA Container Toolkit: 1.16.1
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+- NVIDIA Network Operator 24.4.1
+  - Mellanox MOFED Driver 24.04-0.6.6.0-0
+  - RDMA Shared Device Plugin 1.4.0
+  - SRIOV Device Plugin 3.6.2
+  - Container Networking Plugins 1.3.0
+  - Multus 3.9.3
+  - Whereabouts 0.7.0
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the Ubuntu Operating System](#Installing-the-Ubuntu-Operating-System)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the Network Operator with GPUDirect RDMA](#Validating-the-Network-Operator-with-GPUDirect-RDMA)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+- [Uninstalling the Network Operator](#Uninstalling-the-Network-Operator)
+
+### Prerequisites
+
+The following instructions assume the following:
+
+- You have [NVIDIA-Certified Systems](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html) with Mellanox CX NICs for x86-64 servers
+- You have [NVIDIA Qualified Systems](https://www.nvidia.com/en-us/data-center/data-center-gpus/qualified-system-catalog/?start=0&count=50&pageNumber=1&filters=eyJmaWx0ZXJzIjpbXSwic3ViRmlsdGVycyI6eyJwcm9jZXNzb3JUeXBlIjpbIkFSTS1UaHVuZGVyWDIiLCJBUk0tQWx0cmEiXX0sImNlcnRpZmllZEZpbHRlcnMiOnt9LCJwYXlsb2FkIjpbXX0=) for ARM servers
+  `NOTE:` For ARM systems, NVIDIA Network Operator is not supported yet.
+- You will perform a clean install.
+
+To determine if your system qualifies as an NVIDIA Certified System, review the list of NVIDIA Certified Systems [here](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html).
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+
+### Installing the Ubuntu Operating System
+These instructions require installing Ubuntu Server LTS 22.04. Ubuntu Server can be downloaded [here](http://cdimage.ubuntu.com/releases/20.04.4/release/).
+
+Please reference the [Ubuntu Server Installation Guide](https://ubuntu.com/tutorials/tutorial-install-ubuntu-server#1-overview).
+
+## Installing Container Runtime
+
+You need to install a container runtime into each node in the cluster so that Pods can run there.
Currently Cloud Native Stack provides below container runtimes: + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Set up the repository and update the apt package index: + +``` +sudo apt-get update +``` + +Install packages to allow apt to use a repository over HTTPS: + +``` +sudo apt-get install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` +mkdir -p $HOME/.kube +``` + +``` +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +``` + +``` +sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. We are using calico as the pod-network add-on here: + +``` +kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` +kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.29.6 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. 
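+
+Optionally, you can block until all control-plane pods report Ready before moving on (a convenience check; adjust the timeout to your environment):
+
+```
+kubectl wait --for=condition=Ready pods --all -n kube-system --timeout=300s
+```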
+
+### Installing Helm
+
+Execute the following commands to download and install Helm 3.15.3 for an `x86-64` system:
+
+```
+wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz
+```
+
+```
+tar -zxvf helm-v3.15.3-linux-amd64.tar.gz
+```
+
+```
+sudo mv linux-amd64/helm /usr/local/bin/helm
+```
+
+```
+rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/
+```
+
+Download and install Helm 3.15.3 for an `ARM` system:
+
+```
+wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz
+```
+
+```
+tar -zxvf helm-v3.15.3-linux-arm64.tar.gz
+```
+
+```
+sudo mv linux-arm64/helm /usr/local/bin/helm
+```
+
+```
+rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/
+```
+
+Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information.
+
+
+### Adding an Additional Node to NVIDIA Cloud Native Stack
+
+`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator)
+
+Make sure to install the Containerd and Kubernetes packages on the additional nodes.
+
+Prerequisites:
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node, then run the join command from its output on each additional node to add it to NVIDIA Cloud Native Stack:
+
+```
+sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and the worker node must not have the same node name.
+
+The get nodes command shows that the control-plane and worker nodes are up and ready:
+
+```
+kubectl get nodes
+```
+
+Output:
+
+```
+NAME               STATUS   ROLES           AGE   VERSION
+#yourhost          Ready    control-plane   10m   v1.29.6
+#yourhost-worker   Ready    <none>          10m   v1.29.6
+```
+
+### Installing NVIDIA Network Operator
+
+`NOTE:` If Mellanox NICs are not connected to your nodes, please skip this step and proceed to the next step [Installing GPU Operator](#Installing-GPU-Operator)
+
+The below instructions assume that Mellanox NICs are connected to your machines.
+
+Execute the below command to verify that Mellanox NICs are enabled on your machines:
+
+```
+lspci | grep -i "Mellanox"
+```
+
+Output:
+```
+0c:00.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+0c:00.1 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+```
+Execute the below command to determine which Mellanox device is active:
+
+`NOTE:` Use whichever device shows as `Link detected: yes` in further steps. The below command works only if you added the NICs before installing the Operating System.
+
+```
+for device in `sudo lshw -class network -short | grep -i ConnectX | awk '{print $2}' | egrep -v 'Device|path' | sed '/^$/d'`;do echo -n $device; sudo ethtool $device | grep -i "Link detected"; done
+```
+Output:
+```
+ens160f0        Link detected: yes
+ens160f1        Link detected: no
+```
+
+Create the custom network operator values.yaml and update the active Mellanox device from the above command:
+```
+nano network-operator-values.yaml
+deployCR: true
+ofedDriver:
+  deploy: true
+rdmaSharedDevicePlugin:
+  deploy: true
+  resources:
+    - name: rdma_shared_device_a
+      vendors: [15b3]
+      devices: [ens160f0]
+```
+
+For more information about the custom network operator values.yaml, please refer to the [Network Operator documentation](https://docs.mellanox.com/display/COKAN10/Network+Operator#NetworkOperator-Example2:RDMADevicePluginConfiguration)
+
+Add the NVIDIA Network Operator Helm repo:
+```
+helm repo add mellanox https://mellanox.github.io/network-operator
+```
+
+Update the Helm repo:
+```
+helm repo update
+```
+Install the Network Operator:
+```
+kubectl label nodes --all node-role.kubernetes.io/master- --overwrite
+```
+
+```
+helm install -f ./network-operator-values.yaml --version 24.4.1 -n network-operator --create-namespace --wait network-operator mellanox/network-operator
+```
+#### Validating the State of the Network Operator
+
+Please note that the installation of the Network Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | egrep 'network-operator|nvidia-network-operator-resources'
+```
+
+```
+NAMESPACE                           NAME                                                              READY   STATUS    RESTARTS   AGE
+network-operator                    network-operator-547cb8d999-mn2h9                                 1/1     Running   0          17m
+network-operator                    network-operator-node-feature-discovery-master-596fb8b7cb-qrmvv   1/1     Running   0          17m
+network-operator                    network-operator-node-feature-discovery-worker-qt5xt              1/1     Running   0          17m
+nvidia-network-operator-resources   cni-plugins-ds-dl5vl                                              1/1     Running   0          17m
+nvidia-network-operator-resources   kube-multus-ds-w82rv                                              1/1     Running   0          17m
+nvidia-network-operator-resources   mofed-ubuntu20.04-ds-xfpzl                                        1/1     Running   0          17m
+nvidia-network-operator-resources   rdma-shared-dp-ds-2hgb6                                           1/1     Running   0          17m
+nvidia-network-operator-resources   sriov-device-plugin-ch7bz                                         1/1     Running   0          10m
+nvidia-network-operator-resources   whereabouts-56ngr                                                 1/1     Running   0          10m
+```
+
+Please refer to the [Network Operator page](https://docs.mellanox.com/display/COKAN10/Network+Operator) for more information.
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` If you installed the Network Operator, please skip the below command and follow [GPU Operator with RDMA](#GPU-Operator-with-RDMA) instead
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.version=550.90.07 --wait --generate-name
+```
+
+#### GPU Operator with RDMA
+
+- Prerequisites:
+  - Please install the [Network Operator](#Installing-NVIDIA-Network-Operator) to ensure that the MOFED drivers are installed.
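+
+Before enabling RDMA in the GPU Operator, it can be worth confirming that the nodes advertise the shared RDMA resource configured above; a quick check, assuming the resource name `rdma_shared_device_a` from the values file:
+
+```
+kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.status.allocatable}{"\n"}{end}' | grep -i rdma
+```
+
+If the RDMA shared device plugin registered correctly, each RDMA-capable node should list `rdma/rdma_shared_device_a` with a non-zero count.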
+
+After the Network Operator installation is completed, execute the below command to install the GPU Operator and load the nv_peer_mem modules:
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true --wait --generate-name
+```
+
+#### GPU Operator with Host MOFED Driver and RDMA
+
+If the MOFED driver is already installed on the host without the Network Operator, execute the below command to install the GPU Operator and load the nv_peer_mem module:
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true --wait --generate-name
+```
+
+#### GPU Operator with GPU Direct Storage (GDS)
+
+Execute the below command to enable the GPU Direct Storage driver on the GPU Operator:
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set gds.enabled=true --generate-name
+```
+For more information, refer to [GPU Direct Storage](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-rdma.html)
+
+#### Validating the State of the GPU Operator
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE             NAME                                                              READY   STATUS      RESTARTS   AGE
+default               gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq   1/1     Running     0          2m39s
+default               gpu-operator-1622656274-node-feature-discovery-worker-wr88v       1/1     Running     0          2m39s
+default               gpu-operator-7db468cfdf-mdrdp                                     1/1     Running     0          2m39s
+nvidia-gpu-operator   gpu-feature-discovery-g425f                                       1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-container-toolkit-daemonset-mcmxj                          1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-cuda-validator-s6x2p                                       0/1     Completed   0          48s
+nvidia-gpu-operator   nvidia-dcgm-exporter-wtxnx                                        1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-dcgm-jbz94                                                 1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-daemonset-hzzdt                              1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-validator-9nkxq                              0/1     Completed   0          17s
+nvidia-gpu-operator   nvidia-driver-daemonset-kt8g5                                     1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-operator-validator-cw4j5                                   1/1     Running     0          2m20s
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to restart the CoreDNS and Node Feature Discovery pods:
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + +### Validating the Network Operator with GPUDirect RDMA + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Create network definition for IPAM and replace the `ens192f0` with an active Mellanox device for `master`: +``` +$ nano networkdefinition.yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + annotations: + k8s.v1.cni.cncf.io/resourceName: rdma/rdma_shared_device_a + name: rdma-net-ipam + namespace: default +spec: + config: |- + { + "cniVersion": "0.3.1", + "name": "rdma-net-ipam", + "plugins": [ + { + "ipam": { + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "log_file": "/tmp/whereabouts.log", + "log_level": "debug", + "range": "192.168.112.0/24", + "type": "whereabouts" + }, + "type": "macvlan", + "master": "ens192f0" + }, + { + "mtu": 1500, + "type": "tuning" + } + ] + } +EOF +``` +`NOTE:` If you do not have VLAN-based networking on the high-performance side, please set "vlan": 0 + + +Execute the below command to install network definition on NVIDIA Cloud Native Stack from the control-plane node: + + ``` +kubectl apply -f networkdefinition.yaml + ``` + +Now create the pod YAML with the below content: + +``` +cat < ../../devices/virtual/net/eth0 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 lo -> ../../devices/virtual/net/lo +lrwxrwxrwx 1 root root 0 Jun 1 02:26 net1 -> ../../devices/virtual/net/net1 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 tunl0 -> ../../devices/virtual/net/tunl0 +``` + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Update the above Mellanox NIC, for which status is `Up` in the below command: + +``` +kubectl exec -it rdma-test-pod-1 -- bash + +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 +************************************ +* Waiting for client to connect... 
* +************************************ +``` + +In a separate terminal, print the network address of the secondary interface on the `rdma-test-pod-1` pod: + +``` +$ kubectl exec rdma-test-pod-1 -- ip addr show dev net1 +5: net1@if24: mtu 9000 qdisc noqueue state UP group default + link/ether 62:51:fb:13:88:ce brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 192.168.111.1/24 brd 192.168.111.255 scope global net1 + valid_lft forever preferred_lft forever +``` + +Execute the below command with the above inet address to verify the nv_peer_memory performance on NVIDIA Cloud Native Stack: +``` +$ kubectl exec -it rdma-test-pod-2 -- bash +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 192.168.111.2 +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 4 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 + remote address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.080755 0.073090 4.568094 + 4 5000 0.16 0.15 4.588128 + 8 5000 0.31 0.29 4.567442 + 16 5000 0.66 0.59 4.647555 + 32 5000 1.35 1.22 4.776518 + 64 5000 2.50 2.29 4.481806 + 128 5000 5.34 4.73 4.621828 + 256 5000 10.53 9.11 4.448153 + 512 5000 21.03 17.05 4.162100 + 1024 5000 38.67 34.16 4.169397 + 2048 5000 47.11 43.50 2.655219 + 4096 5000 51.29 51.02 1.557094 + 8192 5000 52.00 51.98 0.793178 + 16384 5000 52.33 52.32 0.399164 + 32768 5000 52.47 52.47 0.200143 + 65536 5000 52.51 52.50 0.100143 + 131072 5000 52.51 52.51 0.050078 + 262144 5000 52.49 52.49 0.025029 + 524288 5000 52.50 52.50 0.012517 + 1048576 5000 52.51 52.51 0.006260 + 2097152 5000 52.51 52.51 0.003130 + 4194304 5000 52.51 52.51 0.001565 + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` + +``` +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 + +************************************ +* Waiting for client to connect... * +************************************ +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 8 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02
+ remote address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01
+---------------------------------------------------------------------------------------
+ #bytes     #iterations    BW peak[Gb/sec]    BW average[Gb/sec]    MsgRate[Mpps]
+ 8388608    5000           52.52              52.52                 0.000783
+---------------------------------------------------------------------------------------
+```
+The benchmark achieved approximately 52 Gbps throughput.
+
+Exit from the RDMA test pods and then delete them with the below command:
+
+```
+$ kubectl delete pod rdma-test-pod-1 rdma-test-pod-2
+```
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < Open Network Stream > Entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output like the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` Video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME                      NAMESPACE             REVISION   UPDATED                                    STATUS     CHART                 APP VERSION
+gpu-operator-1606173805   nvidia-gpu-operator   1          2024-03-20 20:23:28.063421701 +0000 UTC    deployed   gpu-operator-24.6.0   24.6.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+
+```
+
+### Uninstalling the Network Operator
+
+Execute the below commands to uninstall the Network Operator:
+
+```
+$ helm ls -n network-operator
+NAME               NAMESPACE          REVISION   UPDATED                                    STATUS     CHART                     APP VERSION
+network-operator   network-operator   1          2024-03-20 17:09:04.665593336 +0000 UTC    deployed   network-operator-24.4.1   v24.4.1
+
+$ helm del network-operator -n network-operator
+```
diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.0.md b/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.0.md
index 605f5e8..a939126 100644
--- a/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.0.md
+++ b/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.0.md
@@ -250,7 +250,7 @@ Now execute the below to install kubelet, kubeadm, and kubectl:
 ```
 
 ```
- sudo apt install -y -q kubelet=1.30.0-00 kubectl=1.30.0-00 kubeadm=1.30.0-00
+ sudo apt install -y -q kubelet=1.30.0-1.1 kubectl=1.30.0-1.1 kubeadm=1.30.0-1.1
 ```
 
 ```
diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.1.md b/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.1.md
new file mode 100644
index 0000000..580eb70
--- /dev/null
+++ b/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.1.md
@@ -0,0 +1,1127 @@
+# NVIDIA Cloud Native Stack v13.1 - Install Guide for Ubuntu Server
+## Introduction
+
+This document describes how to set up the NVIDIA Cloud Native Stack collection on a single or multiple NVIDIA Certified Systems.
NVIDIA Cloud Native Stack can be configured to create a single node Kubernetes cluster or to create/add additional worker nodes to join an existing cluster.
+
+NVIDIA Cloud Native Stack v13.1 includes:
+- Ubuntu 22.04 LTS
+- Containerd 1.7.20
+- Kubernetes version 1.30.2
+- Helm 3.15.3
+- NVIDIA GPU Operator 24.6.0
+  - NVIDIA GPU Driver: 550.90.07
+  - NVIDIA Container Toolkit: 1.16.1
+  - NVIDIA K8S Device Plugin: 0.16.1
+  - NVIDIA DCGM-Exporter: 3.3.7-3.5.0
+  - NVIDIA DCGM: 3.3.7-1
+  - NVIDIA GPU Feature Discovery: 0.16.1
+  - NVIDIA K8s MIG Manager: 0.8.0
+  - NVIDIA Driver Manager: 0.6.10
+  - Node Feature Discovery: 0.16.3
+  - NVIDIA KubeVirt GPU Device Plugin: 1.2.9
+  - NVIDIA GDS Driver: 2.17.5
+  - NVIDIA Kata Manager for Kubernetes: 0.2.1
+  - NVIDIA Confidential Computing Manager for Kubernetes: 0.1.1
+- NVIDIA Network Operator 24.4.1
+  - Mellanox MOFED Driver 24.04-0.6.6.0-0
+  - RDMA Shared Device Plugin 1.4.0
+  - SRIOV Device Plugin 3.6.2
+  - Container Networking Plugins 1.3.0
+  - Multus 3.9.3
+  - Whereabouts 0.7.0
+
+## Table of Contents
+
+- [Prerequisites](#Prerequisites)
+- [Installing the Ubuntu Operating System](#Installing-the-Ubuntu-Operating-System)
+- [Installing Container Runtime](#Installing-Container-Runtime)
+  - [Installing Containerd](#Installing-Containerd)
+  - [Installing CRI-O](#Installing-CRI-O)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Installing Helm](#Installing-Helm)
+- [Adding an Additional Node to NVIDIA Cloud Native Stack](#Adding-additional-node-to-NVIDIA-Cloud-Native-Stack)
+- [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator)
+- [Installing the GPU Operator](#Installing-the-GPU-Operator)
+- [Validating the Network Operator with GPUDirect RDMA](#Validating-the-Network-Operator-with-GPUDirect-RDMA)
+- [Validating the GPU Operator](#Validating-the-GPU-Operator)
+- [Validate NVIDIA Cloud Native Stack with an Application from NGC](#Validate-NVIDIA-Cloud-Native-Stack-with-an-application-from-NGC)
+- [Uninstalling the GPU Operator](#Uninstalling-the-GPU-Operator)
+- [Uninstalling the Network Operator](#Uninstalling-the-Network-Operator)
+
+### Prerequisites
+
+These instructions assume the following:
+
+- You have [NVIDIA-Certified Systems](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html) with Mellanox CX NICs for x86-64 servers
+- You have [NVIDIA Qualified Systems](https://www.nvidia.com/en-us/data-center/data-center-gpus/qualified-system-catalog/?start=0&count=50&pageNumber=1&filters=eyJmaWx0ZXJzIjpbXSwic3ViRmlsdGVycyI6eyJwcm9jZXNzb3JUeXBlIjpbIkFSTS1UaHVuZGVyWDIiLCJBUk0tQWx0cmEiXX0sImNlcnRpZmllZEZpbHRlcnMiOnt9LCJwYXlsb2FkIjpbXX0=) for ARM servers
+  `NOTE:` For ARM systems, NVIDIA Network Operator is not supported yet.
+- You will perform a clean install.
+
+To determine if your system qualifies as an NVIDIA Certified System, review the list of NVIDIA Certified Systems [here](https://docs.nvidia.com/ngc/ngc-deploy-on-premises/nvidia-certified-systems/index.html).
+
+Please note that NVIDIA Cloud Native Stack is validated only on systems with the default kernel (not HWE).
+
+
+### Installing the Ubuntu Operating System
+These instructions require installing Ubuntu Server LTS 22.04. Ubuntu Server can be downloaded [here](http://cdimage.ubuntu.com/releases/22.04/release/).
+
+Please reference the [Ubuntu Server Installation Guide](https://ubuntu.com/tutorials/tutorial-install-ubuntu-server#1-overview).
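+
+`NOTE:` Since NVIDIA Cloud Native Stack is validated only against the default (non-HWE) kernel, it can help to confirm which kernel the freshly installed system is running before proceeding:
+
+```
+uname -r
+```
+
+A default Ubuntu 22.04 kernel reports a `5.15` series version string, while an HWE kernel typically reports a newer series.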
+ +## Installing Container Runtime + +You need to install a container runtime into each node in the cluster so that Pods can run there. Currently Cloud Native Stack provides below container runtimes: + +- [Installing Containerd](#Installing-Containerd) +- [Installing CRI-O](#Installing-CRI-O) + +`NOTE:` Only install one of either `Containerd` or `CRI-O`, not both! + +These steps apply to both runtimes. + +Set up the repository and update the apt package index: + +``` +sudo apt update +``` + +Install packages to allow apt to use a repository over HTTPS: + +``` +sudo apt install -y apt-transport-https ca-certificates gnupg-agent libseccomp2 autotools-dev debhelper software-properties-common +``` + +Configure the `overlay` and `br_netfilter` kernel modules required by Kubernetes: + +``` +cat <:6443 --token 489oi5.sm34l9uh7dk4z6cm \ + --discovery-token-ca-cert-hash sha256:17165b6c4a4b95d73a3a2a83749a957a10161ae34d2dfd02cd730597579b4b34 +``` + + +Following the instructions in the output, execute the commands as shown below: + +``` +mkdir -p $HOME/.kube +``` + +``` +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +``` + +``` +sudo chown $(id -u):$(id -g) $HOME/.kube/config +``` + +With the following command, you install a pod-network add-on to the control plane node. We are using calico as the pod-network add-on here: + +``` +kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.1/manifests/calico.yaml +``` + +Update the Calico Daemonset + +``` +kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=ens\*,eth\*,enc\*,enp\* +``` + +You can execute the below commands to ensure that all pods are up and running: + +``` +kubectl get pods --all-namespaces +``` + +Output: + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system calico-kube-controllers-65b8787765-bjc8h 1/1 Running 0 2m8s +kube-system calico-node-c2tmk 1/1 Running 0 2m8s +kube-system coredns-5c98db65d4-d4kgh 1/1 Running 0 9m8s +kube-system coredns-5c98db65d4-h6x8m 1/1 Running 0 9m8s +kube-system etcd-#yourhost 1/1 Running 0 8m25s +kube-system kube-apiserver-#yourhost 1/1 Running 0 8m7s +kube-system kube-controller-manager-#yourhost 1/1 Running 0 8m3s +kube-system kube-proxy-6sh42 1/1 Running 0 9m7s +kube-system kube-scheduler-#yourhost 1/1 Running 0 8m26s +``` + +The get nodes command shows that the control-plane node is up and ready: + +``` +kubectl get nodes +``` + +Output: + +``` +NAME STATUS ROLES AGE VERSION +#yourhost Ready control-plane 10m v1.30.0 +``` + +Since we are using a single-node Kubernetes cluster, the cluster will not schedule pods on the control plane node by default. To schedule pods on the control plane node, we have to remove the taint by executing the following command: + +``` +kubectl taint nodes --all node-role.kubernetes.io/control-plane- +``` + +Refer to [Installing Kubeadm](https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/install-kubeadm/) +for more information. 
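+
+To confirm that the taint removal took effect, you can list any taints remaining on the node; empty output means the control-plane node now accepts regular workloads:
+
+```
+kubectl get nodes -o jsonpath='{.items[*].spec.taints}'
+```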
+
+### Installing Helm
+
+Execute the following commands to download and install Helm 3.15.3 for an `x86-64` system:
+
+```
+wget https://get.helm.sh/helm-v3.15.3-linux-amd64.tar.gz
+```
+
+```
+tar -zxvf helm-v3.15.3-linux-amd64.tar.gz
+```
+
+```
+sudo mv linux-amd64/helm /usr/local/bin/helm
+```
+
+```
+rm -rf helm-v3.15.3-linux-amd64.tar.gz linux-amd64/
+```
+
+Download and install Helm 3.15.3 for an `ARM` system:
+
+```
+wget https://get.helm.sh/helm-v3.15.3-linux-arm64.tar.gz
+```
+
+```
+tar -zxvf helm-v3.15.3-linux-arm64.tar.gz
+```
+
+```
+sudo mv linux-arm64/helm /usr/local/bin/helm
+```
+
+```
+rm -rf helm-v3.15.3-linux-arm64.tar.gz linux-arm64/
+```
+
+Refer to the Helm 3.15.3 [release notes](https://github.com/helm/helm/releases) and the [Installing Helm guide](https://helm.sh/docs/using_helm/#installing-helm) for more information.
+
+
+### Adding an Additional Node to NVIDIA Cloud Native Stack
+
+`NOTE:` If you're not adding additional nodes, please skip this step and proceed to the next step [Installing NVIDIA Network Operator](#Installing-NVIDIA-Network-Operator)
+
+Make sure to install the Containerd and Kubernetes packages on the additional nodes.
+
+Prerequisites:
+- [Installing Containerd](#Installing-Containerd)
+- [Installing Kubernetes](#Installing-Kubernetes)
+- [Disable swap](#Disable-swap)
+
+Once the prerequisites are completed on the additional nodes, execute the below command on the control-plane node, then run the join command from its output on each additional node to add it to NVIDIA Cloud Native Stack:
+
+```
+sudo kubeadm token create --print-join-command
+```
+
+Output:
+```
+example:
+sudo kubeadm join 10.110.0.34:6443 --token kg2h7r.e45g9uyrbm1c0w3k --discovery-token-ca-cert-hash sha256:77fd6571644373ea69074dd4af7b077bbf5bd15a3ed720daee98f4b04a8f524e
+```
+`NOTE`: The control-plane node and the worker node must not have the same node name.
+
+The get nodes command shows that the control-plane and worker nodes are up and ready:
+
+```
+kubectl get nodes
+```
+
+Output:
+
+```
+NAME               STATUS   ROLES           AGE   VERSION
+#yourhost          Ready    control-plane   10m   v1.30.0
+#yourhost-worker   Ready    <none>          10m   v1.30.0
+```
+
+### Installing NVIDIA Network Operator
+
+`NOTE:` If Mellanox NICs are not connected to your nodes, please skip this step and proceed to the next step [Installing GPU Operator](#Installing-GPU-Operator)
+
+The below instructions assume that Mellanox NICs are connected to your machines.
+
+Execute the below command to verify that Mellanox NICs are enabled on your machines:
+
+```
+lspci | grep -i "Mellanox"
+```
+
+Output:
+```
+0c:00.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+0c:00.1 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx]
+```
+Execute the below command to determine which Mellanox device is active:
+
+`NOTE:` Use whichever device shows as `Link detected: yes` in further steps. The below command works only if you added the NICs before installing the Operating System.
+
+```
+for device in `sudo lshw -class network -short | grep -i ConnectX | awk '{print $2}' | egrep -v 'Device|path' | sed '/^$/d'`;do echo -n $device; sudo ethtool $device | grep -i "Link detected"; done
+```
+Output:
+```
+ens160f0        Link detected: yes
+ens160f1        Link detected: no
+```
+
+Create the custom network operator values.yaml and update the active Mellanox device from the above command:
+```
+nano network-operator-values.yaml
+deployCR: true
+ofedDriver:
+  deploy: true
+rdmaSharedDevicePlugin:
+  deploy: true
+  resources:
+    - name: rdma_shared_device_a
+      vendors: [15b3]
+      devices: [ens160f0]
+```
+
+For more information about the custom network operator values.yaml, please refer to the [Network Operator documentation](https://docs.mellanox.com/display/COKAN10/Network+Operator#NetworkOperator-Example2:RDMADevicePluginConfiguration)
+
+Add the NVIDIA Network Operator Helm repo:
+```
+helm repo add mellanox https://mellanox.github.io/network-operator
+```
+
+Update the Helm repo:
+```
+helm repo update
+```
+Install the Network Operator:
+```
+kubectl label nodes --all node-role.kubernetes.io/master- --overwrite
+```
+
+```
+helm install -f ./network-operator-values.yaml --version 24.4.1 -n network-operator --create-namespace --wait network-operator mellanox/network-operator
+```
+#### Validating the State of the Network Operator
+
+Please note that the installation of the Network Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | egrep 'network-operator|nvidia-network-operator-resources'
+```
+
+```
+NAMESPACE                           NAME                                                              READY   STATUS    RESTARTS   AGE
+network-operator                    network-operator-547cb8d999-mn2h9                                 1/1     Running   0          17m
+network-operator                    network-operator-node-feature-discovery-master-596fb8b7cb-qrmvv   1/1     Running   0          17m
+network-operator                    network-operator-node-feature-discovery-worker-qt5xt              1/1     Running   0          17m
+nvidia-network-operator-resources   cni-plugins-ds-dl5vl                                              1/1     Running   0          17m
+nvidia-network-operator-resources   kube-multus-ds-w82rv                                              1/1     Running   0          17m
+nvidia-network-operator-resources   mofed-ubuntu20.04-ds-xfpzl                                        1/1     Running   0          17m
+nvidia-network-operator-resources   rdma-shared-dp-ds-2hgb6                                           1/1     Running   0          17m
+nvidia-network-operator-resources   sriov-device-plugin-ch7bz                                         1/1     Running   0          10m
+nvidia-network-operator-resources   whereabouts-56ngr                                                 1/1     Running   0          10m
+```
+
+Please refer to the [Network Operator page](https://docs.mellanox.com/display/COKAN10/Network+Operator) for more information.
+
+### Installing GPU Operator
+
+Add the NVIDIA repo:
+
+```
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+```
+
+Update the Helm repo:
+
+```
+helm repo update
+```
+
+Install GPU Operator:
+
+`NOTE:` If you installed the Network Operator, please skip the below command and follow [GPU Operator with RDMA](#GPU-Operator-with-RDMA) instead
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.version=550.90.07 --wait --generate-name
+```
+
+#### GPU Operator with RDMA
+
+- Prerequisites:
+  - Please install the [Network Operator](#Installing-NVIDIA-Network-Operator) to ensure that the MOFED drivers are installed.
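+
+As a sanity check before layering RDMA onto the GPU Operator, you can verify that the shared RDMA device resource (named `rdma_shared_device_a` in the values file above) appears among the node's allocatable resources:
+
+```
+kubectl describe node | grep -i rdma/
+```
+
+A line such as `rdma/rdma_shared_device_a: 63` under both `Capacity` and `Allocatable` indicates the RDMA shared device plugin is serving the resource.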
+
+After the Network Operator installation is completed, execute the below command to install the GPU Operator and load the nv_peer_mem modules:
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true --wait --generate-name
+```
+
+#### GPU Operator with Host MOFED Driver and RDMA
+
+If the MOFED driver is already installed on the host without the Network Operator, execute the below command to install the GPU Operator and load the nv_peer_mem module:
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set driver.rdma.enabled=true,driver.rdma.useHostMofed=true --wait --generate-name
+```
+
+#### GPU Operator with GPU Direct Storage (GDS)
+
+Execute the below command to enable the GPU Direct Storage driver on the GPU Operator:
+
+```
+helm install --version 24.6.0 --create-namespace --namespace nvidia-gpu-operator nvidia/gpu-operator --set gds.enabled=true --generate-name
+```
+For more information, refer to [GPU Direct Storage](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-rdma.html)
+
+#### Validating the State of the GPU Operator
+
+Please note that the installation of the GPU Operator can take a couple of minutes. How long the installation will take depends on your internet speed.
+
+```
+kubectl get pods --all-namespaces | grep -v kube-system
+```
+
+```
+NAMESPACE             NAME                                                              READY   STATUS      RESTARTS   AGE
+default               gpu-operator-1622656274-node-feature-discovery-master-5cddq96gq   1/1     Running     0          2m39s
+default               gpu-operator-1622656274-node-feature-discovery-worker-wr88v       1/1     Running     0          2m39s
+default               gpu-operator-7db468cfdf-mdrdp                                     1/1     Running     0          2m39s
+nvidia-gpu-operator   gpu-feature-discovery-g425f                                       1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-container-toolkit-daemonset-mcmxj                          1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-cuda-validator-s6x2p                                       0/1     Completed   0          48s
+nvidia-gpu-operator   nvidia-dcgm-exporter-wtxnx                                        1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-dcgm-jbz94                                                 1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-daemonset-hzzdt                              1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-device-plugin-validator-9nkxq                              0/1     Completed   0          17s
+nvidia-gpu-operator   nvidia-driver-daemonset-kt8g5                                     1/1     Running     0          2m20s
+nvidia-gpu-operator   nvidia-operator-validator-cw4j5                                   1/1     Running     0          2m20s
+```
+
+Please refer to the [GPU Operator page](https://ngc.nvidia.com/catalog/helm-charts/nvidia:gpu-operator) on NGC for more information.
+
+For multiple worker nodes, execute the below command to restart the CoreDNS and Node Feature Discovery pods:
+
+```
+kubectl delete pods $(kubectl get pods -n kube-system | grep core | awk '{print $1}') -n kube-system; kubectl delete pod $(kubectl get pods -o wide -n nvidia-gpu-operator | grep node-feature-discovery | grep -v master | awk '{print $1}') -n nvidia-gpu-operator
+```
+
+#### GPU Operator with MIG
+
+`NOTE:` Only A100 and A30 GPUs are supported for GPU Operator with MIG
+
+Multi-Instance GPU (MIG) allows GPUs based on the NVIDIA Ampere architecture (such as NVIDIA A100) to be securely partitioned into separate GPU instances for CUDA applications.
For more information about enabling the MIG capability, please refer to [GPU Operator with MIG](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html) + +### Validating the Network Operator with GPUDirect RDMA + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Create network definition for IPAM and replace the `ens192f0` with an active Mellanox device for `master`: +``` +$ nano networkdefinition.yaml +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + annotations: + k8s.v1.cni.cncf.io/resourceName: rdma/rdma_shared_device_a + name: rdma-net-ipam + namespace: default +spec: + config: |- + { + "cniVersion": "0.3.1", + "name": "rdma-net-ipam", + "plugins": [ + { + "ipam": { + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "log_file": "/tmp/whereabouts.log", + "log_level": "debug", + "range": "192.168.112.0/24", + "type": "whereabouts" + }, + "type": "macvlan", + "master": "ens192f0" + }, + { + "mtu": 1500, + "type": "tuning" + } + ] + } +EOF +``` +`NOTE:` If you do not have VLAN-based networking on the high-performance side, please set "vlan": 0 + + +Execute the below command to install network definition on NVIDIA Cloud Native Stack from the control-plane node: + + ``` +kubectl apply -f networkdefinition.yaml + ``` + +Now create the pod YAML with the below content: + +``` +cat < ../../devices/virtual/net/eth0 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 lo -> ../../devices/virtual/net/lo +lrwxrwxrwx 1 root root 0 Jun 1 02:26 net1 -> ../../devices/virtual/net/net1 +lrwxrwxrwx 1 root root 0 Jun 1 02:26 tunl0 -> ../../devices/virtual/net/tunl0 +``` + +Execute the below command to list the Mellanox NIC's with the status: +``` +kubectl exec -it $(kubectl get pods -n nvidia-network-operator-resources | grep mofed | awk '{print $1}') -n nvidia-network-operator-resources -- ibdev2netdev +``` +Output: +``` +mlx5_0 port 1 ==> ens192f0 (Up) +mlx5_1 port 1 ==> ens192f1 (Down) +``` + +Update the above Mellanox NIC, for which status is `Up` in the below command: + +``` +kubectl exec -it rdma-test-pod-1 -- bash + +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 +************************************ +* Waiting for client to connect... 
* +************************************ +``` + +In a separate terminal, print the network address of the secondary interface on the `rdma-test-pod-1` pod: + +``` +$ kubectl exec rdma-test-pod-1 -- ip addr show dev net1 +5: net1@if24: mtu 9000 qdisc noqueue state UP group default + link/ether 62:51:fb:13:88:ce brd ff:ff:ff:ff:ff:ff link-netnsid 0 + inet 192.168.111.1/24 brd 192.168.111.255 scope global net1 + valid_lft forever preferred_lft forever +``` + +Execute the below command with the above inet address to verify the nv_peer_memory performance on NVIDIA Cloud Native Stack: +``` +$ kubectl exec -it rdma-test-pod-2 -- bash +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 192.168.111.2 +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 4 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. method : Ethernet +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01 + remote address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.080755 0.073090 4.568094 + 4 5000 0.16 0.15 4.588128 + 8 5000 0.31 0.29 4.567442 + 16 5000 0.66 0.59 4.647555 + 32 5000 1.35 1.22 4.776518 + 64 5000 2.50 2.29 4.481806 + 128 5000 5.34 4.73 4.621828 + 256 5000 10.53 9.11 4.448153 + 512 5000 21.03 17.05 4.162100 + 1024 5000 38.67 34.16 4.169397 + 2048 5000 47.11 43.50 2.655219 + 4096 5000 51.29 51.02 1.557094 + 8192 5000 52.00 51.98 0.793178 + 16384 5000 52.33 52.32 0.399164 + 32768 5000 52.47 52.47 0.200143 + 65536 5000 52.51 52.50 0.100143 + 131072 5000 52.51 52.51 0.050078 + 262144 5000 52.49 52.49 0.025029 + 524288 5000 52.50 52.50 0.012517 + 1048576 5000 52.51 52.51 0.006260 + 2097152 5000 52.51 52.51 0.003130 + 4194304 5000 52.51 52.51 0.001565 + 8388608 5000 52.52 52.52 0.000783 +--------------------------------------------------------------------------------------- +``` + +``` +[root@rdma-test-pod-1 /]# ib_write_bw -d mlx5_0 -a -F --report_gbits -q 1 + +************************************ +* Waiting for client to connect... * +************************************ +--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 8 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet
+---------------------------------------------------------------------------------------
+ local address: LID 0000 QPN 0x0136 PSN 0x475031 RKey 0x002c23 VAddr 0x007fd3d83cb000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:02
+ remote address: LID 0000 QPN 0x0137 PSN 0x3c5d65 RKey 0x00370e VAddr 0x007ff44bf1d000
+ GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:111:01
+---------------------------------------------------------------------------------------
+ #bytes     #iterations    BW peak[Gb/sec]    BW average[Gb/sec]    MsgRate[Mpps]
+ 8388608    5000           52.52              52.52                 0.000783
+---------------------------------------------------------------------------------------
+```
+The benchmark achieved approximately 52 Gbps throughput.
+
+Exit from the RDMA test pods and then delete them with the below command:
+
+```
+$ kubectl delete pod rdma-test-pod-1 rdma-test-pod-2
+```
+
+### Validating the GPU Operator
+
+The GPU Operator validates the stack through the nvidia-device-plugin-validation pod and the nvidia-driver-validation pod. If both complete successfully (see the output from `kubectl get pods --all-namespaces | grep -v kube-system`), NVIDIA Cloud Native Stack is working as expected. This section provides two examples of manually validating that the GPU is usable from within a pod.
+
+#### Example 1: nvidia-smi
+
+Execute the following:
+
+```
+cat < Open Network Stream > Entering the following URL:
+
+```
+rtsp://IPAddress of Node:31113/ds-test
+```
+
+You should see video output like the below, with the AI model detecting objects.
+
+![Deepstream_Video](screenshots/Deepstream.png)
+
+`NOTE:` Video stream in VLC will change if you provide an input RTSP camera.
+
+
+### Uninstalling the GPU Operator
+
+Execute the below commands to uninstall the GPU Operator:
+
+```
+$ helm ls
+NAME                      NAMESPACE             REVISION   UPDATED                                    STATUS     CHART                 APP VERSION
+gpu-operator-1606173805   nvidia-gpu-operator   1          2024-03-20 20:23:28.063421701 +0000 UTC    deployed   gpu-operator-24.6.0   24.6.0
+
+$ helm del gpu-operator-1606173805 -n nvidia-gpu-operator
+
+```
+
+### Uninstalling the Network Operator
+
+Execute the below commands to uninstall the Network Operator:
+
+```
+$ helm ls -n network-operator
+NAME               NAMESPACE          REVISION   UPDATED                                    STATUS     CHART                     APP VERSION
+network-operator   network-operator   1          2024-03-20 17:09:04.665593336 +0000 UTC    deployed   network-operator-24.4.1   v24.4.1
+
+$ helm del network-operator -n network-operator
+```
diff --git a/install-guides/DGX-6.0_Server_v10.2.md b/install-guides/older_versions/DGX-6.0_Server_v10.2.md
similarity index 100%
rename from install-guides/DGX-6.0_Server_v10.2.md
rename to install-guides/older_versions/DGX-6.0_Server_v10.2.md
diff --git a/install-guides/DGX-6.0_Server_v10.3.md b/install-guides/older_versions/DGX-6.0_Server_v10.3.md
similarity index 100%
rename from install-guides/DGX-6.0_Server_v10.3.md
rename to install-guides/older_versions/DGX-6.0_Server_v10.3.md
diff --git a/install-guides/DGX-6.0_Server_v10.4.md b/install-guides/older_versions/DGX-6.0_Server_v10.4.md
similarity index 100%
rename from install-guides/DGX-6.0_Server_v10.4.md
rename to install-guides/older_versions/DGX-6.0_Server_v10.4.md
diff --git a/install-guides/DGX-6.0_Server_v10.5.md b/install-guides/older_versions/DGX-6.0_Server_v10.5.md
similarity index 100%
rename from install-guides/DGX-6.0_Server_v10.5.md
rename to install-guides/older_versions/DGX-6.0_Server_v10.5.md
diff --git a/install-guides/Jetson_Xavier_v10.0.md b/install-guides/older_versions/Jetson_Xavier_v10.0.md
similarity
index 100% rename from install-guides/Jetson_Xavier_v10.0.md rename to install-guides/older_versions/Jetson_Xavier_v10.0.md diff --git a/install-guides/Jetson_Xavier_v10.1.md b/install-guides/older_versions/Jetson_Xavier_v10.1.md similarity index 100% rename from install-guides/Jetson_Xavier_v10.1.md rename to install-guides/older_versions/Jetson_Xavier_v10.1.md diff --git a/install-guides/Jetson_Xavier_v10.2.md b/install-guides/older_versions/Jetson_Xavier_v10.2.md similarity index 100% rename from install-guides/Jetson_Xavier_v10.2.md rename to install-guides/older_versions/Jetson_Xavier_v10.2.md diff --git a/install-guides/Jetson_Xavier_v10.3.md b/install-guides/older_versions/Jetson_Xavier_v10.3.md similarity index 100% rename from install-guides/Jetson_Xavier_v10.3.md rename to install-guides/older_versions/Jetson_Xavier_v10.3.md diff --git a/install-guides/Jetson_Xavier_v10.4.md b/install-guides/older_versions/Jetson_Xavier_v10.4.md similarity index 100% rename from install-guides/Jetson_Xavier_v10.4.md rename to install-guides/older_versions/Jetson_Xavier_v10.4.md diff --git a/install-guides/Jetson_Xavier_v10.5.md b/install-guides/older_versions/Jetson_Xavier_v10.5.md similarity index 100% rename from install-guides/Jetson_Xavier_v10.5.md rename to install-guides/older_versions/Jetson_Xavier_v10.5.md diff --git a/install-guides/RHEL-8-7_Server_x86-arm64_v10.0.md b/install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.0.md similarity index 100% rename from install-guides/RHEL-8-7_Server_x86-arm64_v10.0.md rename to install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.0.md diff --git a/install-guides/RHEL-8-7_Server_x86-arm64_v10.1.md b/install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.1.md similarity index 100% rename from install-guides/RHEL-8-7_Server_x86-arm64_v10.1.md rename to install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.1.md diff --git a/install-guides/RHEL-8-7_Server_x86-arm64_v10.2.md b/install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.2.md similarity index 100% rename from install-guides/RHEL-8-7_Server_x86-arm64_v10.2.md rename to install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.2.md diff --git a/install-guides/RHEL-8-7_Server_x86-arm64_v10.3.md b/install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.3.md similarity index 100% rename from install-guides/RHEL-8-7_Server_x86-arm64_v10.3.md rename to install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.3.md diff --git a/install-guides/RHEL-8-7_Server_x86-arm64_v10.4.md b/install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.4.md similarity index 100% rename from install-guides/RHEL-8-7_Server_x86-arm64_v10.4.md rename to install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.4.md diff --git a/install-guides/RHEL-8-7_Server_x86-arm64_v10.5.md b/install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.5.md similarity index 100% rename from install-guides/RHEL-8-7_Server_x86-arm64_v10.5.md rename to install-guides/older_versions/RHEL-8-7_Server_x86-arm64_v10.5.md diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.0.md b/install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.0.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.0.md rename to install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.0.md diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.1.md 
b/install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.1.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.1.md rename to install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.1.md diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.2.md b/install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.2.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.2.md rename to install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.2.md diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.3.md b/install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.3.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.3.md rename to install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.3.md diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.4.md b/install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.4.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.4.md rename to install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.4.md diff --git a/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.5.md b/install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.5.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v10.5.md rename to install-guides/older_versions/Ubuntu-22-04_Server_Developer-x86-arm64_v10.5.md diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.0.md b/install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.0.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_x86-arm64_v10.0.md rename to install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.0.md diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.1.md b/install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.1.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_x86-arm64_v10.1.md rename to install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.1.md diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.2.md b/install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.2.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_x86-arm64_v10.2.md rename to install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.2.md diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.3.md b/install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.3.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_x86-arm64_v10.3.md rename to install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.3.md diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.4.md b/install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.4.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_x86-arm64_v10.4.md rename to install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.4.md diff --git a/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.5.md b/install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.5.md similarity index 100% rename from install-guides/Ubuntu-22-04_Server_x86-arm64_v10.5.md rename to install-guides/older_versions/Ubuntu-22-04_Server_x86-arm64_v10.5.md diff --git a/install-guides/readme.md b/install-guides/readme.md index 
b4140aa..c5dad41 100755 --- a/install-guides/readme.md +++ b/install-guides/readme.md @@ -2,73 +2,59 @@ The following NVIDIA Cloud Native Stack Install Guides are available. ### Ubuntu Systems -- [Ubuntu 22.04 Server x86 & arm64 v10.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.0.md) -- [Ubuntu 22.04 Server x86 & arm64 v10.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.1.md) -- [Ubuntu 22.04 Server x86 & arm64 v10.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.2.md) -- [Ubuntu 22.04 Server x86 & arm64 v10.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.3.md) -- [Ubuntu 22.04 Server x86 & arm64 v10.4](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.4.md) -- [Ubuntu 22.04 Server x86 & arm64 v10.5](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v10.5.md) - [Ubuntu 22.04 Server x86 & arm64 v11.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.0.md) - [Ubuntu 22.04 Server x86 & arm64 v11.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.1.md) - [Ubuntu 22.04 Server x86 & arm64 v11.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.2.md) -- [Ubuntu 22.04 Server x86 & arm64 v12.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.0.md) +- [Ubuntu 22.04 Server x86 & arm64 v11.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v11.3.md) - [Ubuntu 22.04 Server x86 & arm64 v12.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.0.md) - [Ubuntu 22.04 Server x86 & arm64 v12.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.1.md) +- [Ubuntu 22.04 Server x86 & arm64 v12.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v12.2.md) - [Ubuntu 22.04 Server x86 & arm64 v13.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.0.md) +- [Ubuntu 22.04 Server x86 & arm64 v13.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_x86-arm64_v13.1.md) ### RedHat Enterprise Linux(RHEL) Systems -- [RHEL 8.7 Server x86 & arm64 v10.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v10.0.md) -- [RHEL 8.7 Server x86 & arm64 v10.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v10.1.md) -- [RHEL 8.7 Server x86 & arm64 v10.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v10.2.md) -- [RHEL 8.7 Server x86 & arm64 v10.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v10.3.md) -- [RHEL 8.7 Server x86 & arm64 v10.4](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v10.4.md) -- [RHEL 8.7 Server x86 & arm64 
v10.5](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v10.5.md) - [RHEL 8.7 Server x86 & arm64 v11.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v11.0.md) - [RHEL 8.7 Server x86 & arm64 v11.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v11.1.md) - [RHEL 8.7 Server x86 & arm64 v11.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v11.2.md) +- [RHEL 8.8 Server x86 & arm64 v11.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-8_Server_x86-arm64_v11.3.md) - [RHEL 8.7 Server x86 & arm64 v12.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v12.0.md) - [RHEL 8.7 Server x86 & arm64 v12.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v12.1.md) +- [RHEL 8.8 Server x86 & arm64 v12.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-8_Server_x86-arm64_v12.2.md) - [RHEL 8.7 Server x86 & arm64 v13.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-7_Server_x86-arm64_v13.0.md) +- [RHEL 8.8 Server x86 & arm64 v13.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/RHEL-8-8_Server_x86-arm64_v13.1.md) ### Jetson Systems -- [Jetson Xavier v10.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v10.0.md) -- [Jetson Xavier v10.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v10.1.md) -- [Jetson Xavier v10.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v10.2.md) -- [Jetson Xavier v10.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v10.3.md) -- [Jetson Xavier v10.4](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v10.4.md) -- [Jetson Xavier v10.5](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v10.5.md) - [Jetson Xavier v11.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v11.0.md) - [Jetson Xavier v11.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v11.1.md) - [Jetson Xavier v11.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v11.2.md) +- [Jetson Xavier v11.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v11.3.md) - [Jetson Xavier v12.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v12.0.md) - [Jetson Xavier v12.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v12.1.md) +- [Jetson Xavier v12.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v12.2.md) - [Jetson Xavier v13.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v13.0.md) +- [Jetson Xavier v13.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Jetson_Xavier_v13.1.md) ### Ubuntu Server for Developers -- [Ubuntu 22.04 Server Developer x86 & arm64 v10.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v10.0.md) -- [Ubuntu 22.04 Server Developer x86 & arm64 
v10.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v10.1.md) -- [Ubuntu 22.04 Server Developer x86 & arm64 v10.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v10.2.md) -- [Ubuntu 22.04 Server Developer x86 & arm64 v10.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v10.3.md) -- [Ubuntu 22.04 Server Developer x86 & arm64 v10.4](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v10.4.md) -- [Ubuntu 22.04 Server Developer x86 & arm64 v10.5](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v10.5.md) - [Ubuntu 22.04 Server Developer x86 & arm64 v11.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v11.0.md) - [Ubuntu 22.04 Server Developer x86 & arm64 v11.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v11.1.md) - [Ubuntu 22.04 Server Developer x86 & arm64 v11.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v11.2.md) +- [Ubuntu 22.04 Server Developer x86 & arm64 v11.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v11.3.md) - [Ubuntu 22.04 Server Developer x86 & arm64 v12.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v12.0.md) - [Ubuntu 22.04 Server Developer x86 & arm64 v12.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v12.1.md) +- [Ubuntu 22.04 Server Developer x86 & arm64 v12.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v12.2.md) - [Ubuntu 22.04 Server Developer x86 & arm64 v13.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer_x86-arm64_v13.0.md) +- [Ubuntu 22.04 Server Developer x86 & arm64 v13.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/Ubuntu-22-04_Server_Developer-x86-arm64_v13.1.md) ### DGX Systems -- [DGX 6.0 Server v10.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v10.2.md) -- [DGX 6.0 Server v10.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v10.3.md) -- [DGX 6.0 Server v10.4](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v10.4.md) -- [DGX 6.0 Server v10.5](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v10.5.md) - [DGX 6.0 Server v11.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v11.0.md) - [DGX 6.0 Server v11.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v11.1.md) - [DGX 6.0 Server v11.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v11.2.md) +- [DGX 6.2 Server v11.3](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.2_Server_v11.3.md) - [DGX 6.0 Server v12.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v12.0.md) - [DGX 6.0 Server
v12.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v12.1.md) +- [DGX 6.2 Server v12.2](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.2_Server_v12.2.md) - [DGX 6.0 Server v13.0](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.0_Server_v13.0.md) +- [DGX 6.2 Server v13.1](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/DGX-6.2_Server_v13.1.md) `NOTE` A list of older NVIDIA Cloud Native Stack versions (formerly known as Cloud Native Core) can be found [here](https://github.com/NVIDIA/cloud-native-stack/blob/master/install-guides/older_versions/readme.md) diff --git a/playbooks/cns-installation.yaml b/playbooks/cns-installation.yaml index 9fb476b..0188d2f 100755 --- a/playbooks/cns-installation.yaml +++ b/playbooks/cns-installation.yaml @@ -158,7 +158,75 @@ - name: Check CNS Version support for RHEL shell: "echo 'Not a Valid Installation please use CNS Version 10.0 above and retry'" failed_when: "cns_version < 10.0 and ansible_distribution in ['RedHat', 'CentOS']" - + + - name: check dgx + stat: + path: /etc/dgx-release + register: dgx + + - name: check l4t + stat: + path: /etc/l4t-release + register: l4t + + - name: NVIDIA Driver Clean Up on DGX + when: dgx.stat.exists == True and ansible_distribution == 'Ubuntu' and enable_rdma == true or dgx.stat.exists == True and ansible_distribution == 'Ubuntu' and enable_gds == true or l4t.stat.exists == True and ansible_distribution == 'Ubuntu' and enable_rdma == true or l4t.stat.exists == True and ansible_distribution == 'Ubuntu' and enable_gds == true + become: true + block: + - name: Remove Ubuntu unattended upgrades to prevent apt lock + ansible.builtin.apt: + name: unattended-upgrades + state: absent + purge: yes + register: apt_cleanup + retries: 10 + until: apt_cleanup is success + + - name: Remove OLD Apt Repository + apt_repository: + repo: ppa:graphics-drivers/ppa + state: absent + register: ppa_clean + retries: 10 + until: ppa_clean is success + + - name: Remove NVIDIA packages + apt: + name: + - "*cuda*" + - "libnvidia-cfg1-*" + - "libnvidia-common-*" + - "libnvidia-compute-*" + - "libnvidia-decode-*" + - "libnvidia-encode-*" + - "libnvidia-extra-*" + - "libnvidia-fbc1-*" + - "libnvidia-gl-*" + - "nvidia-compute-utils-*" + - "nvidia-dkms-*" + - "nvidia-driver-*" + - "nvidia-kernel-common-*" + - "nvidia-kernel-source-*" + - "nvidia-modprobe" + - "nvidia-prime" + - "nvidia-settings" + - "nvidia-utils-*" + - "nvidia-fabricmanager-*" + - "screen-resolution-extra" + - "xserver-xorg-video-nvidia-*" + - "gdm*" + - "xserver-xorg-*" + autoremove: yes + purge: yes + state: absent + register: nvidia_cleanup + retries: 10 + until: nvidia_cleanup is success + + - name: unload NVIDIA + shell: /usr/bin/nvidia-uninstall --silent; kill -9 $(lsof /dev/nvidia* | awk '{print $2}' | grep -v PID | uniq); rmmod -f nvidia_uvm; rmmod -f nvidia_drm; rmmod -f nvidia_modeset; rmmod -f nvidia + ignore_errors: yes + failed_when: false # - name: Add DHCP Mac to netplan # become: true # args: @@ -216,5 +284,5 @@ register: status when: "cns_version > 4.1 and cns_validation == true and release != 'tegra'" -- when: "cns_version > 4.1 and cns_validation == true and release != 'tegra' and install_k8s == true" +- when: "cns_version > 4.1 and cns_validation == true and release != 'tegra'" import_playbook: cns-validation.yaml \ No newline at end of file
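The four-clause `and`/`or` gate on the "NVIDIA Driver Clean Up on DGX" block above is correct (Jinja binds `and` tighter than `or`) but hard to audit. A logically equivalent, parenthesized sketch — readability only, not a change this patch makes:

```yaml
# Equivalent cleanup gate: Ubuntu hosts that are DGX or L4T, with RDMA or GDS enabled.
# List items under `when:` are implicitly ANDed together.
when:
  - dgx.stat.exists or l4t.stat.exists
  - ansible_distribution == 'Ubuntu'
  - enable_rdma == true or enable_gds == true
```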
diff --git a/playbooks/cns-uninstall.yaml b/playbooks/cns-uninstall.yaml index ad8c2b0..b02f7e2 100755 --- a/playbooks/cns-uninstall.yaml +++ b/playbooks/cns-uninstall.yaml @@ -30,7 +30,17 @@ purge: true autoremove: yes become: true - when: microk8s == true + when: microk8s == true and ansible_distribution == 'Ubuntu' + + - name: Remove Snapd on RHEL + when: ansible_distribution == 'RedHat' and microk8s == true + shell: "{{ item }}" + become: true + ignore_errors: true + with_items: + - yum remove snapd -y + - yum autoremove -y + - rm -rf /var/lib/snapd/snap /snap - name: add kubectl alias for Microk8s when: microk8s == true diff --git a/playbooks/cns-upgrade.yaml b/playbooks/cns-upgrade.yaml index 39e314d..ed65929 100644 --- a/playbooks/cns-upgrade.yaml +++ b/playbooks/cns-upgrade.yaml @@ -168,6 +168,13 @@ content: | cns_version: 11.2 + - name: Create Cloud Native Stack cns_version.yaml + when: "k8sversion.stdout == 'v1.28.8'" + copy: + dest: "/tmp/cns_version.yaml" + content: | + cns_version: 11.3 + - name: Create Cloud Native Stack cns_version.yaml when: "k8sversion.stdout == 'v1.29.2'" copy: @@ -175,6 +182,20 @@ content: | cns_version: 12.1 + - name: Create Cloud Native Stack cns_version.yaml + when: "k8sversion.stdout == 'v1.29.4'" + copy: + dest: "/tmp/cns_version.yaml" + content: | + cns_version: 12.2 + + - name: Create Cloud Native Stack cns_version.yaml + when: "k8sversion.stdout == 'v1.30.0'" + copy: + dest: "/tmp/cns_version.yaml" + content: | + cns_version: 13.1 + - name: Fetch cns_version.yaml ansible.builtin.fetch: src: "/tmp/cns_version.yaml" @@ -219,11 +240,26 @@ shell: kubectl version -o json | jq .serverVersion.gitVersion | sed 's/\"//g' register: k8sversion + - name: Upgrade the Cloud Native Stack from 13.0 to 13.1 + shell: kubeadm upgrade apply v1.30.2 --force + when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.30.0'" + become: true + + - name: Upgrade the Cloud Native Stack from 12.1 to 12.2 + shell: kubeadm upgrade apply v1.29.6 --force + when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.29.4'" + become: true + - name: Upgrade the Cloud Native Stack from 12.0 to 12.1 shell: kubeadm upgrade apply v1.29.4 --force when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.29.2'" become: true + - name: Upgrade the Cloud Native Stack from 11.2 to 11.3 + shell: kubeadm upgrade apply v1.28.12 --force + when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.28.8'" + become: true + - name: Upgrade the Cloud Native Stack from 11.1 to 11.2 shell: kubeadm upgrade apply v1.28.8 --force when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.28.6'" @@ -329,7 +365,11 @@ when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.24.2'" become: true - - name: Install networking plugin to kubernetes cluster on NVIDIA Cloud Native Stack 12.1 or 11.2 or 10.5 + - name: Install networking plugin to kubernetes cluster on NVIDIA Cloud Native Stack 11.3 or 12.2 or 13.1 + when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.29.4' and release != 'tegra' or 'running' in k8sup.stdout and k8sversion.stdout == 'v1.28.8' and release != 'tegra' or 'running' in k8sup.stdout and k8sversion.stdout == 'v1.30.0' and release != 'tegra'" + command: kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.27.4/manifests/calico.yaml +
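The new Calico apply for 11.3/12.2/13.1 returns as soon as the manifest is accepted, not when the CNI is actually up. A hedged follow-up of this shape could confirm the rollout before the upgrade proceeds (a sketch — `calico-node` is Calico's default DaemonSet name in `kube-system`, not something these playbooks define):

```yaml
- name: Wait for the Calico node DaemonSet to finish rolling out (sketch)
  shell: kubectl rollout status daemonset/calico-node -n kube-system --timeout=300s
  register: calico_rollout
  changed_when: false
```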
+ - name: Install networking plugin to kubernetes cluster on NVIDIA Cloud Native Stack 12.1 or 11.2 or 10.5 when: "'running' in k8sup.stdout and k8sversion.stdout == 'v1.29.2' and release != 'tegra' or 'running' in k8sup.stdout and k8sversion.stdout == 'v1.28.6' and release != 'tegra' or 'running' in k8sup.stdout and k8sversion.stdout == 'v1.27.10' and release != 'tegra'" command: kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.27.3/manifests/calico.yaml @@ -405,6 +445,10 @@ when: "k8sversion.stdout == 'v1.29.2' and release == 'tegra' or k8sversion.stdout == 'v1.28.6' and release == 'tegra' or k8sversion.stdout == 'v1.27.10' and release == 'tegra'" command: kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/v0.25.1/Documentation/kube-flannel.yml + - name: Install networking plugin to kubernetes cluster on NVIDIA Cloud Native Stack + when: "k8sversion.stdout == 'v1.29.4' and release == 'tegra' or k8sversion.stdout == 'v1.28.8' and release == 'tegra' or k8sversion.stdout == 'v1.30.0' and release == 'tegra'" + command: kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/v0.25.5/Documentation/kube-flannel.yml + - pause: seconds: 15 diff --git a/playbooks/cns-validation.yaml b/playbooks/cns-validation.yaml index 8b54804..e9affb5 100755 --- a/playbooks/cns-validation.yaml +++ b/playbooks/cns-validation.yaml @@ -202,7 +202,7 @@ dest: nvidia-smi.yaml - name: Create NVIDIA-SMI yaml - when: "cns_version == 10.5 and ansible_distribution == 'Ubuntu' or cns_version == 11.2 and ansible_distribution == 'Ubuntu' or cns_version == 12.1 and ansible_distribution == 'Ubuntu' or cns_version == 13.0 and ansible_distribution == 'Ubuntu'" + when: "cns_version == 10.5 and ansible_distribution == 'Ubuntu' or cns_version == 11.2 and ansible_distribution == 'Ubuntu' or cns_version == 12.1 and ansible_distribution == 'Ubuntu' or cns_version == 13.0 and ansible_distribution == 'Ubuntu' or cns_version == 11.3 and ansible_distribution == 'Ubuntu' or cns_version == 12.2 and ansible_distribution == 'Ubuntu' or cns_version == 13.1 and ansible_distribution == 'Ubuntu'" copy: content: | apiVersion: v1 @@ -504,6 +504,29 @@ echo -e echo -e "Please validate between Target Version and Installed Version listed above" + - name: Components Target Versions Vs Installed Versions + when: cns_docker == false and cns_version == 11.3 or cns_docker == false and cns_version == 12.2 or cns_docker == false and cns_version == 13.1 + register: compare_113 + args: + executable: /bin/bash + shell: | + echo -e "===========================================================================================" + echo -e " Components Target Version || Installed Version " + echo -e "===========================================================================================" + echo -e "GPU Operator Version {{ release_24_6_0['gpu_operator_version'] }} || {{ gpuoperator.stdout }}" + echo -e "NVIDIA Container Driver Version {{ release_24_6_0['gpu_driver_version'] }} || {{ nvcdriver.stdout }}" + echo -e "GPU Operator NV Toolkit Driver {{ release_24_6_0['container_toolkit'] }} || {{ nvtoolkit.stdout }}" + echo -e "K8sDevice Plugin Version {{ release_24_6_0['device_plugin'] }} || {{ k8sdevice.stdout }}" + echo -e "Data Center GPU Manager(DCGM) Version {{ release_24_6_0['dcgm_exporter_version'] }} || {{ dcgm_exporter.stdout }}" + echo -e "Node Feature Discovery Version {{ release_24_6_0['nfd_version'] }} || {{ nodediscover.stdout }}" + echo -e "GPU Feature Discovery Version {{ release_24_6_0['gfd_version'] }} || {{ gpudiscover.stdout }}" + echo -e "NVIDIA validator version {{ release_24_6_0['validator_version'] }} || {{ validator.stdout }}" + echo -e "NVIDIA MIG Manager version {{ release_24_6_0['mig_manager_version'] }} || {{ mig_manager.stdout }}" + echo -e + echo -e "NOTE: NVIDIA MIG Manager is valid only for A100, A30 and H100 GPUs" + echo -e + echo -e "Please validate between Target Version and Installed Version listed above" +
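The gate on the `compare_113` task above repeats `cns_docker == false` once per release. Since 11.3, 12.2 and 13.1 all validate against the same `release_24_6_0` map, an equivalent and shorter condition would use Jinja's `in` operator — a sketch, not a change the patch makes:

```yaml
- name: Components Target Versions Vs Installed Versions
  when: cns_docker == false and cns_version in [11.3, 12.2, 13.1]
  register: compare_113
```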
- name: Components Target Versions Vs Installed Versions when: cns_version == 9.4 and cns_docker == true or cns_version == 10.3 and cns_docker == true or cns_version == 11.0 and cns_docker == true register: compare_94docker @@ -574,6 +597,29 @@ echo -e echo -e "Please validate between Target Version and Installed Version listed above" + - name: Components Target Versions Vs Installed Versions + when: cns_version == 11.3 and cns_docker == true or cns_version == 12.2 and cns_docker == true or cns_version == 13.1 and cns_docker == true + register: compare_113docker + args: + executable: /bin/bash + shell: | + echo -e "===========================================================================================" + echo -e " Components Target Version || Installed Version " + echo -e "===========================================================================================" + echo -e "GPU Operator Version {{ release_24_6_0['gpu_operator_version'] }} || {{ gpuoperator.stdout }}" + echo -e "NVIDIA Container Driver Version {{ release_24_6_0['gpu_driver_version'] }} || {{ nvidia_driver.stdout }}" + echo -e "NVIDIA Toolkit Driver {{ release_24_6_0['container_toolkit'] }} || {{ nvidia_ct.stdout }}" + echo -e "K8sDevice Plugin Version {{ release_24_6_0['device_plugin'] }} || {{ k8sdevice.stdout }}" + echo -e "Data Center GPU Manager(DCGM) Version {{ release_24_6_0['dcgm_exporter_version'] }} || {{ dcgm_exporter.stdout }}" + echo -e "Node Feature Discovery Version {{ release_24_6_0['nfd_version'] }} || {{ nodediscover.stdout }}" + echo -e "GPU Feature Discovery Version {{ release_24_6_0['gfd_version'] }} || {{ gpudiscover.stdout }}" + echo -e "NVIDIA validator version {{ release_24_6_0['validator_version'] }} || {{ validator.stdout }}" + echo -e "NVIDIA MIG Manager version {{ release_24_6_0['mig_manager_version'] }} || {{ mig_manager.stdout }}" + echo -e + echo -e "NOTE: NVIDIA MIG Manager is valid only for A100, A30 and H100 GPUs" + echo -e + echo -e "Please validate between Target Version and Installed Version listed above" + - name: Components Target Versions Vs Installed Versions ignore_errors: yes register: stack_versions @@ -677,6 +723,16 @@ debug: msg: "{{ compare_105.stdout_lines }}" + - name: Report Versions + when: cns_version == 11.3 and cns_docker == false or cns_version == 12.2 and cns_docker == false or cns_version == 13.1 and cns_docker == false + debug: + msg: "{{ compare_113.stdout_lines }}" + + - name: Report Versions + when: cns_version == 11.3 and cns_docker == true or cns_version == 12.2 and cns_docker == true or cns_version == 13.1 and cns_docker == true + debug: + msg: "{{ compare_113docker.stdout_lines }}" + - name: Report Stack Version debug: msg: "{{ stack_versions.stdout_lines }}" diff --git a/playbooks/cns_values_11.3.yaml b/playbooks/cns_values_11.3.yaml new file mode 100644 index 0000000..1c4c3f0 --- /dev/null +++ b/playbooks/cns_values_11.3.yaml @@ -0,0 +1,121 @@ +cns_version: 11.3 + +## MicroK8s cluster +microk8s: no +## Kubernetes Install with Kubeadm +install_k8s: yes + +## Components Versions +# Container Runtime options are containerd, cri-o, cri-dockerd +container_runtime: "containerd" +containerd_version: "1.7.20" +runc_version: "1.1.13" +cni_plugins_version: "1.5.1" +containerd_max_concurrent_downloads: "5" +nvidia_container_toolkit_version: "1.16.1" +crio_version: "1.28.8" +cri_dockerd_version: "0.3.15" +k8s_version: "1.28.12"
"1.28.12" +calico_version: "3.27.4" +flannel_version: "0.25.5" +helm_version: "3.15.3" +gpu_operator_version: "24.6.1" +network_operator_version: "24.4.1" +local_path_provisioner: "0.0.26" +nfs_provisioner: "4.0.18" +metallb_version: "0.14.5" +kserve_version: "0.13" +prometheus_stack: "61.3.0" +elastic_stack: "8.14.1" + +# GPU Operator Values +enable_gpu_operator: yes +confidential_computing: no +gpu_driver_version: "550.90.07" +use_open_kernel_module: no +enable_mig: no +mig_profile: all-disabled +mig_strategy: single +# To use GDS, use_open_kernel_module needs to be enabled +enable_gds: no +#Secure Boot for only Ubuntu +enable_secure_boot: no +enable_vgpu: no +vgpu_license_server: "" +# URL of Helm repo to be added. If using NGC get this from the fetch command in the console +helm_repository: https://helm.ngc.nvidia.com/nvidia +# Name of the helm chart to be deployed +gpu_operator_helm_chart: nvidia/gpu-operator +## If using a private/protected registry. NGC API Key. Leave blank for public registries +gpu_operator_registry_password: "" +## This is most likely an NGC email +gpu_operator_registry_email: "" +## This is most likely GPU Operator Driver Registry +gpu_operator_driver_registry: "nvcr.io/nvidia" +gpu_operator_registry_username: "$oauthtoken" + +# Network Operator Values +## If the Network Operator is yes then make sure enable_rdma as well yes +enable_network_operator: no +## Enable RDMA yes for NVIDIA Certification +enable_rdma: no + +# Prxoy Configuration +proxy: no +http_proxy: "" +https_proxy: "" + +# Cloud Native Stack for Developers Values +## Enable for Cloud Native Stack Developers +cns_docker: no +## Enable For Cloud Native Stack Developers with TRD Driver +cns_nvidia_driver: no + +## Kubernetes resources +k8s_apt_key: "https://pkgs.k8s.io/core:/stable:/v1.28/deb/Release.key" +k8s_gpg_key: "https://pkgs.k8s.io/core:/stable:/v1.28/rpm/repodata/repomd.xml.key" +k8s_apt_ring: "/etc/apt/keyrings/kubernetes-apt-keyring.gpg" +k8s_registry: "registry.k8s.io" + +# Local Path Provisioner and NFS Provisoner as Storage option +storage: no + +# Monitoring Stack Prometheus/Grafana with GPU Metrics and Elastic Logging stack +monitoring: no + +# Enable Kserve on Cloud Native Stack with Istio and Cert-Manager +kserve: no + +# Install MetalLB +loadbalancer: no +# Example input loadbalancer_ip: "10.117.20.50/32", it could be node/host IP +loadbalancer_ip: "" + +## Cloud Native Stack Validation +cns_validation: no + +# BMC Details for Confidential Computing +bmc_ip: +bmc_username: +bmc_password: + +# CSP values +## AWS EKS values +aws_region: us-east-2 +aws_cluster_name: cns-cluster-1 +aws_gpu_instance_type: g4dn.2xlarge + +## Google Cloud GKE Values +#https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects +gke_project_id: +#https://cloud.google.com/compute/docs/regions-zones#available +gke_region: us-west1 +gke_node_zones: ["us-west1-b"] +gke_cluster_name: gke-cluster-1 + +## Azure AKS Values +aks_cluster_name: aks-cluster-1 +#https://azure.microsoft.com/en-us/explore/global-infrastructure/geographies/#geographies +aks_cluster_location: "West US 2" +#https://learn.microsoft.com/en-us/partner-center/marketplace/find-tenant-object-id +azure_object_id: [""] \ No newline at end of file diff --git a/playbooks/cns_values_12.2.yaml b/playbooks/cns_values_12.2.yaml new file mode 100644 index 0000000..fe50437 --- /dev/null +++ b/playbooks/cns_values_12.2.yaml @@ -0,0 +1,121 @@ +cns_version: 12.2 + +## MicroK8s cluster +microk8s: no +## Kubernetes Install with 
diff --git a/playbooks/cns_values_12.2.yaml b/playbooks/cns_values_12.2.yaml new file mode 100644 index 0000000..fe50437 --- /dev/null +++ b/playbooks/cns_values_12.2.yaml @@ -0,0 +1,121 @@ +cns_version: 12.2 + +## MicroK8s cluster +microk8s: no +## Kubernetes Install with Kubeadm +install_k8s: yes + +## Components Versions +# Container Runtime options are containerd, cri-o, cri-dockerd +container_runtime: "containerd" +containerd_version: "1.7.20" +runc_version: "1.1.13" +cni_plugins_version: "1.5.1" +containerd_max_concurrent_downloads: "5" +nvidia_container_toolkit_version: "1.16.1" +crio_version: "1.29.6" +cri_dockerd_version: "0.3.15" +k8s_version: "1.29.6" +calico_version: "3.27.4" +flannel_version: "0.25.5" +helm_version: "3.15.3" +gpu_operator_version: "24.6.1" +network_operator_version: "24.4.1" +local_path_provisioner: "0.0.26" +nfs_provisioner: "4.0.18" +metallb_version: "0.14.5" +kserve_version: "0.13" +prometheus_stack: "61.3.0" +elastic_stack: "8.14.1" + +# GPU Operator Values +enable_gpu_operator: yes +confidential_computing: no +gpu_driver_version: "550.90.07" +use_open_kernel_module: no +enable_mig: no +mig_profile: all-disabled +mig_strategy: single +# To use GDS, use_open_kernel_module needs to be enabled +enable_gds: no +#Secure Boot for only Ubuntu +enable_secure_boot: no +enable_vgpu: no +vgpu_license_server: "" +# URL of Helm repo to be added. If using NGC get this from the fetch command in the console +helm_repository: https://helm.ngc.nvidia.com/nvidia +# Name of the helm chart to be deployed +gpu_operator_helm_chart: nvidia/gpu-operator +## If using a private/protected registry. NGC API Key. Leave blank for public registries +gpu_operator_registry_password: "" +## This is most likely an NGC email +gpu_operator_registry_email: "" +## This is most likely GPU Operator Driver Registry +gpu_operator_driver_registry: "nvcr.io/nvidia" +gpu_operator_registry_username: "$oauthtoken" + +# Network Operator Values +## If the Network Operator is yes then make sure enable_rdma is yes as well +enable_network_operator: no +## Enable RDMA yes for NVIDIA Certification +enable_rdma: no + +# Proxy Configuration +proxy: no +http_proxy: "" +https_proxy: "" + +# Cloud Native Stack for Developers Values +## Enable for Cloud Native Stack Developers +cns_docker: no +## Enable For Cloud Native Stack Developers with TRD Driver +cns_nvidia_driver: no + +## Kubernetes resources +k8s_apt_key: "https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key" +k8s_gpg_key: "https://pkgs.k8s.io/core:/stable:/v1.29/rpm/repodata/repomd.xml.key" +k8s_apt_ring: "/etc/apt/keyrings/kubernetes-apt-keyring.gpg" +k8s_registry: "registry.k8s.io" + +# Local Path Provisioner and NFS Provisioner as Storage option +storage: no + +# Monitoring Stack Prometheus/Grafana with GPU Metrics and Elastic Logging stack +monitoring: no + +# Enable Kserve on Cloud Native Stack with Istio and Cert-Manager +kserve: no + +# Install MetalLB +loadbalancer: no +# Example input loadbalancer_ip: "10.117.20.50/32", it could be node/host IP +loadbalancer_ip: "" + +## Cloud Native Stack Validation +cns_validation: no + +# BMC Details for Confidential Computing +bmc_ip: +bmc_username: +bmc_password: + +# CSP values +## AWS EKS values +aws_region: us-east-2 +aws_cluster_name: cns-cluster-1 +aws_gpu_instance_type: g4dn.2xlarge + +## Google Cloud GKE Values +#https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects +gke_project_id: +#https://cloud.google.com/compute/docs/regions-zones#available +gke_region: us-west1 +gke_node_zones: ["us-west1-b"] +gke_cluster_name: gke-cluster-1 + +## Azure AKS Values +aks_cluster_name: aks-cluster-1 +#https://azure.microsoft.com/en-us/explore/global-infrastructure/geographies/#geographies +aks_cluster_location: "West US 2"
+#https://learn.microsoft.com/en-us/partner-center/marketplace/find-tenant-object-id +azure_object_id: [""] \ No newline at end of file diff --git a/playbooks/cns_values_13.1.yaml b/playbooks/cns_values_13.1.yaml new file mode 100644 index 0000000..602d477 --- /dev/null +++ b/playbooks/cns_values_13.1.yaml @@ -0,0 +1,121 @@ +cns_version: 13.1 + +## MicroK8s cluster +microk8s: no +## Kubernetes Install with Kubeadm +install_k8s: yes + +## Components Versions +# Container Runtime options are containerd, cri-o, cri-dockerd +container_runtime: "containerd" +containerd_version: "1.7.20" +runc_version: "1.1.13" +cni_plugins_version: "1.5.1" +containerd_max_concurrent_downloads: "5" +nvidia_container_toolkit_version: "1.16.1" +crio_version: "1.30.2" +cri_dockerd_version: "0.3.15" +k8s_version: "1.30.2" +calico_version: "3.27.4" +flannel_version: "0.25.5" +helm_version: "3.15.3" +gpu_operator_version: "24.6.1" +network_operator_version: "24.4.1" +local_path_provisioner: "0.0.26" +nfs_provisioner: "4.0.18" +metallb_version: "0.14.5" +kserve_version: "0.13" +prometheus_stack: "61.3.0" +elastic_stack: "8.14.1" + +# GPU Operator Values +enable_gpu_operator: yes +confidential_computing: no +gpu_driver_version: "550.90.07" +use_open_kernel_module: no +enable_mig: no +mig_profile: all-disabled +mig_strategy: single +# To use GDS, use_open_kernel_module needs to be enabled +enable_gds: no +#Secure Boot for only Ubuntu +enable_secure_boot: no +enable_vgpu: no +vgpu_license_server: "" +# URL of Helm repo to be added. If using NGC get this from the fetch command in the console +helm_repository: https://helm.ngc.nvidia.com/nvidia +# Name of the helm chart to be deployed +gpu_operator_helm_chart: nvidia/gpu-operator +## If using a private/protected registry. NGC API Key. 
Leave blank for public registries +gpu_operator_registry_password: "" +## This is most likely an NGC email +gpu_operator_registry_email: "" +## This is most likely GPU Operator Driver Registry +gpu_operator_driver_registry: "nvcr.io/nvidia" +gpu_operator_registry_username: "$oauthtoken" + +# Network Operator Values +## If the Network Operator is yes then make sure enable_rdma is yes as well +enable_network_operator: no +## Enable RDMA yes for NVIDIA Certification +enable_rdma: no + +# Proxy Configuration +proxy: no +http_proxy: "" +https_proxy: "" + +# Cloud Native Stack for Developers Values +## Enable for Cloud Native Stack Developers +cns_docker: no +## Enable For Cloud Native Stack Developers with TRD Driver +cns_nvidia_driver: no + +## Kubernetes resources +k8s_apt_key: "https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key" +k8s_gpg_key: "https://pkgs.k8s.io/core:/stable:/v1.30/rpm/repodata/repomd.xml.key" +k8s_apt_ring: "/etc/apt/keyrings/kubernetes-apt-keyring.gpg" +k8s_registry: "registry.k8s.io" + +# Local Path Provisioner and NFS Provisioner as Storage option +storage: no + +# Monitoring Stack Prometheus/Grafana with GPU Metrics and Elastic Logging stack +monitoring: no + +# Enable Kserve on Cloud Native Stack with Istio and Cert-Manager +kserve: no + +# Install MetalLB +loadbalancer: no +# Example input loadbalancer_ip: "10.117.20.50/32", it could be node/host IP +loadbalancer_ip: "" + +## Cloud Native Stack Validation +cns_validation: no + +# BMC Details for Confidential Computing +bmc_ip: +bmc_username: +bmc_password: + +# CSP values +## AWS EKS values +aws_region: us-east-2 +aws_cluster_name: cns-cluster-1 +aws_gpu_instance_type: g4dn.2xlarge + +## Google Cloud GKE Values +#https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects +gke_project_id: +#https://cloud.google.com/compute/docs/regions-zones#available +gke_region: us-west1 +gke_node_zones: ["us-west1-b"] +gke_cluster_name: gke-cluster-1 + +## Azure AKS Values +aks_cluster_name: aks-cluster-1 +#https://azure.microsoft.com/en-us/explore/global-infrastructure/geographies/#geographies +aks_cluster_location: "West US 2" +#https://learn.microsoft.com/en-us/partner-center/marketplace/find-tenant-object-id +azure_object_id: [""] \ No newline at end of file diff --git a/playbooks/gpu_operator.yaml b/playbooks/gpu_operator.yaml index 5a65350..70d6581 100644 --- a/playbooks/gpu_operator.yaml +++ b/playbooks/gpu_operator.yaml @@ -1,3 +1,16 @@ +release_24_6_0: + gpu_operator_version: v24.6.0 + gpu_driver_version: 550.90.07 + driver_manager_version: 0.6.8 + container_toolkit: v1.16.1 + device_plugin: v0.16.1 + dcgm_exporter_version: 3.3.7-3.5.0 + nfd_version: v0.16.3 + gfd_version: v0.16.1 + mig_manager_version: v0.8.0 + dcgm_version: 3.3.7-1 + validator_version: v24.6.0 + gds_driver: 2.17.5 release_24_3_0: gpu_operator_version: v24.3.0 gpu_driver_version: 550.54.14 diff --git a/playbooks/guides/Cloud_Guide.md b/playbooks/guides/Cloud_Guide.md index 46b8218..ef4438e 100644 --- a/playbooks/guides/Cloud_Guide.md +++ b/playbooks/guides/Cloud_Guide.md @@ -4,7 +4,7 @@ This page describes the steps required to use Ansible Playbooks ## Following supported cloud environments -- EKS(Elastic Kubernetes Service) +- EKS (Elastic Kubernetes Service) - GKE(Google Kubernetes Engine) - AKS(Azure Kubernetes Service) - In Progress
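The `release_24_6_0` map added to gpu_operator.yaml above is the lookup table the new validation tasks read via `{{ release_24_6_0['...'] }}`. A minimal sketch of inspecting one entry (a hypothetical debug task, not part of the patch):

```yaml
- name: Show the target GPU Operator version for this release
  debug:
    msg: "Target GPU Operator: {{ release_24_6_0['gpu_operator_version'] }}"
```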
diff --git a/playbooks/hosts b/playbooks/hosts index 982d417..0e607f8 100755 --- a/playbooks/hosts +++ b/playbooks/hosts @@ -1,3 +1,3 @@ [master] -#localhost ansible_ssh_user=nvidia ansible_ssh_pass=nvidiapass ansible_sudo_pass=nvidiapass ansible_ssh_common_args='-o StrictHostKeyChecking=no' +#localhost ansible_ssh_user=nvidia ansible_ssh_pass=nvidia3D! ansible_sudo_pass=nvidia3D! ansible_ssh_common_args='-o StrictHostKeyChecking=no' [nodes] \ No newline at end of file diff --git a/playbooks/k8s-install.yaml b/playbooks/k8s-install.yaml index 63b5802..6a2a08a 100755 --- a/playbooks/k8s-install.yaml +++ b/playbooks/k8s-install.yaml @@ -251,7 +251,16 @@ register: kubeadm_join_cmd - set_fact: - kubeadm_join: "{{ kubeadm_join_cmd.stdout }}" + kubeadm_join: "{{ kubeadm_join_cmd.stdout }} --cri-socket=unix:///var/run/cri-dockerd.sock" + when: container_runtime == 'cri-dockerd' + + - set_fact: + kubeadm_join: "{{ kubeadm_join_cmd.stdout }} --cri-socket=unix:///var/run/crio/crio.sock" + when: container_runtime == 'cri-o' + + - set_fact: + kubeadm_join: "{{ kubeadm_join_cmd.stdout }} --cri-socket=unix:///run/containerd/containerd.sock" + when: container_runtime == 'containerd' - name: Store join command become: true diff --git a/playbooks/microk8s.yaml b/playbooks/microk8s.yaml index 8f07e85..931371a 100644 --- a/playbooks/microk8s.yaml +++ b/playbooks/microk8s.yaml @@ -64,13 +64,29 @@ autoremove: yes become: true - - name: Install snap - when: ansible_distribution == 'RedHat' and snap_exists.rc != 0 + + - name: Update Snap Repo + when: ansible_distribution == 'RedHat' and snap_exists.rc != 0 and ansible_distribution_major_version == '8' shell: "{{ item }}" become: true with_items: - dnf install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm -y - - subscription-manager release --set 8.9 + - subscription-manager release --set 8.8 + + - name: Update Snap Repo + when: ansible_distribution == 'RedHat' and snap_exists.rc != 0 and ansible_distribution_major_version == '9' + shell: "{{ item }}" + become: true + with_items: + - dnf install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm -y + - subscription-manager release --set 9.4 + + - name: Install Snapd on RHEL + when: ansible_distribution == 'RedHat' and snap_exists.rc != 0 + shell: "{{ item }}" + become: true + ignore_errors: true + with_items:
"BASE_URL=https://us.download.nvidia.com/tesla; curl -fSsl -O $BASE_URL/{{ gpu_driver_version }}/NVIDIA-Linux-{{ ansible_architecture }}-{{ gpu_driver_version }}.run; chmod +x ./NVIDIA-Linux-{{ ansible_architecture }}-{{ gpu_driver_version }}.run; sh ./NVIDIA-Linux-{{ ansible_architecture }}-{{ gpu_driver_version }}.run --silent" \ No newline at end of file + shell: "BASE_URL=https://us.download.nvidia.com/tesla; curl -fSsl -O $BASE_URL/{{ gpu_driver_version }}/NVIDIA-Linux-{{ ansible_architecture }}-{{ gpu_driver_version }}.run; chmod +x ./NVIDIA-Linux-{{ ansible_architecture }}-{{ gpu_driver_version }}.run; sh ./NVIDIA-Linux-{{ ansible_architecture }}-{{ gpu_driver_version }}.run --silent" + + - name: check dgx + stat: + path: /etc/dgx-release + register: dgx + + - name: Install NVIDIA Fabric Manager + become: true + when: "dgx.stat.exists == True and ansible_distribution == 'Ubuntu'" + ignore_errors: true + shell: "sudo apt install nvidia-fabricmanager-550 -y; sudo systemctl --now enable nvidia-fabricmanager; sudo systemctl daemon-reload" diff --git a/playbooks/cns_values_10.0.yaml b/playbooks/older_versions/cns_values_10.0.yaml similarity index 100% rename from playbooks/cns_values_10.0.yaml rename to playbooks/older_versions/cns_values_10.0.yaml diff --git a/playbooks/cns_values_10.1.yaml b/playbooks/older_versions/cns_values_10.1.yaml similarity index 100% rename from playbooks/cns_values_10.1.yaml rename to playbooks/older_versions/cns_values_10.1.yaml diff --git a/playbooks/cns_values_10.2.yaml b/playbooks/older_versions/cns_values_10.2.yaml similarity index 100% rename from playbooks/cns_values_10.2.yaml rename to playbooks/older_versions/cns_values_10.2.yaml diff --git a/playbooks/cns_values_10.3.yaml b/playbooks/older_versions/cns_values_10.3.yaml similarity index 100% rename from playbooks/cns_values_10.3.yaml rename to playbooks/older_versions/cns_values_10.3.yaml diff --git a/playbooks/cns_values_10.4.yaml b/playbooks/older_versions/cns_values_10.4.yaml similarity index 100% rename from playbooks/cns_values_10.4.yaml rename to playbooks/older_versions/cns_values_10.4.yaml diff --git a/playbooks/cns_values_10.5.yaml b/playbooks/older_versions/cns_values_10.5.yaml similarity index 100% rename from playbooks/cns_values_10.5.yaml rename to playbooks/older_versions/cns_values_10.5.yaml diff --git a/playbooks/operators-upgrade.yaml b/playbooks/operators-upgrade.yaml index dee39e2..97d979e 100644 --- a/playbooks/operators-upgrade.yaml +++ b/playbooks/operators-upgrade.yaml @@ -157,6 +157,16 @@ - kubectl apply -f https://gitlab.com/nvidia/kubernetes/gpu-operator/-/raw/v24.3.0/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml - helm show --version=v{{ gpu_operator_version }} values nvidia/gpu-operator > {{ ansible_user_dir }}/values.yaml + - name: Upgrade GPU Operator CRDs on Cloud Native Stack + when: "enable_gpu_operator == true and k8sversion.stdout == 'v1.29.6' or enable_gpu_operator == true and k8sversion.stdout == 'v1.28.12' or enable_gpu_operator == true and k8sversion.stdout == 'v1.30.2'" + shell: "{{ item }}" + with_items: + - kubectl delete crd clusterpolicies.nvidia.com nvidiadrivers.nvidia.com + - kubectl apply -f https://gitlab.com/nvidia/kubernetes/gpu-operator/-/raw/v24.6.0/deployments/gpu-operator/crds/nvidia.com_clusterpolicies_crd.yaml + - kubectl apply -f https://gitlab.com/nvidia/kubernetes/gpu-operator/-/raw/v24.6.0/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml + - kubectl apply -f 
https://gitlab.com/nvidia/kubernetes/gpu-operator/-/raw/v24.6.0/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml + - helm show --version=v{{ gpu_operator_version }} values nvidia/gpu-operator > {{ ansible_user_dir }}/values.yaml + - name: create GPU Custom Values for proxy when: proxy == true replace: diff --git a/playbooks/prerequisites.yaml b/playbooks/prerequisites.yaml index a9dafda..3423a31 100755 --- a/playbooks/prerequisites.yaml +++ b/playbooks/prerequisites.yaml @@ -174,6 +174,9 @@ - {name: "net.bridge.bridge-nf-call-ip6tables", value: "1", reload: no} - {name: "net.bridge.bridge-nf-call-iptables", value: "1", reload: no} - {name: "net.ipv4.ip_forward", value: "1", reload: yes} + - {name: "fs.inotify.max_user_watches", value: "2099999999", reload: no} + - {name: "fs.inotify.max_user_instances", value: "2099999999", reload: no} + - {name: "fs.inotify.max_queued_events", value: "2099999999", reload: yes} when: "cns_version >= 4.0" - name: Setup Containerd for Ubuntu @@ -355,7 +358,7 @@ when: "proxy == true and cns_version >= 6.1 and container_runtime == 'containerd'" - name: Install CRI-O on Ubuntu - when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and cns_version >= 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and cns_version >= 11.0" become: true block: - name: trim CRI-O version @@ -384,12 +387,12 @@ filename: cri-o.list - name: Install CRI-O on Ubuntu - when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and cns_version >= 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and cns_version >= 11.0" become: true block: - name: install CRI-O apt: - name: ['cri-o'] + name: ["cri-o={{ crio_version }}*"] state: present update_cache: true force: yes @@ -408,7 +411,7 @@ ] - name: Install CRI-O on Ubuntu 22.04 - when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '22' and cns_version < 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and ansible_distribution_major_version == '22' and cns_version < 11.0" become: true block: - name: trim CRI-O version @@ -449,7 +452,7 @@ filename: devel:kubic:libcontainers:stable:cri-o:{{ k8s_version }} - name: Install CRI-O on Ubuntu 20.04 - when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and ansible_distribution_major_version <= '20' and cns_version < 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and ansible_distribution_major_version <= '20' and cns_version < 11.0" become: true block: - name: trim CRI-O version @@ -499,7 +502,7 @@ create: yes - name: Install CRI-O on Ubuntu - when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and cns_version < 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'Ubuntu' and cns_version < 11.0" become: true block: - name: install CRI-O @@ -511,7 +514,7 @@ force: yes - name: Install CRI-O on RHEL - when: "container_runtime == 'cri-o' and ansible_distribution == 'RedHat' and cns_version < 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'RedHat' and cns_version < 11.0" become: true block: - name: trim CRI-O version @@ -541,13 +544,12 @@ description: CRIO Repo - name: install CRI-O - when: cns_version < 12.0 yum: name: ['cri-o', 'cri-tools'] state: present - name: Install CRI-O - when: "container_runtime == 'cri-o' and cns_version < 12.0" + when: "container_runtime == 'cri-o' and 
cns_version < 11.0" become: true block: - name: Create overlay-images directory @@ -568,7 +570,7 @@ ] - name: Install CRI-O on RHEL - when: "container_runtime == 'cri-o' and ansible_distribution == 'RedHat' and cns_version >= 12.0" + when: "container_runtime == 'cri-o' and ansible_distribution == 'RedHat' and cns_version >= 11.0" become: true block: - name: trim CRI-O version @@ -589,7 +591,6 @@ description: CRIO Repo - name: install CRI-O - when: cns_version >= 12.0 yum: name: ['container-selinux', 'cri-o'] state: present @@ -904,19 +905,23 @@ - name: "Install Helm on NVIDIA Cloud Native Stack" become: true command: "{{ item }}" + ignore_errors: yes with_items: - curl -O https://get.helm.sh/helm-v{{ helm_version }}-linux-amd64.tar.gz - tar -xvzf helm-v{{ helm_version }}-linux-amd64.tar.gz - cp linux-amd64/helm /usr/local/bin/ + - cp linux-arm64/helm /usr/bin/ - rm -rf helm-v{{ helm_version }}-linux-amd64.tar.gz linux-amd64 when: "ansible_architecture == 'x86_64'" - name: "Install Helm on NVIDIA Cloud Native Stack" become: true command: "{{ item }}" + ignore_errors: yes with_items: - curl -O https://get.helm.sh/helm-v{{ helm_version }}-linux-arm64.tar.gz - tar -xvzf helm-v{{ helm_version }}-linux-arm64.tar.gz - cp linux-arm64/helm /usr/local/bin/ + - cp linux-arm64/helm /usr/bin/ - rm -rf helm-v{{ helm_version }}-linux-arm64.tar.gz linux-arm64 when: "ansible_architecture == 'aarch64'" \ No newline at end of file diff --git a/playbooks/readme.md b/playbooks/readme.md index 92fb82a..228c1c7 100755 --- a/playbooks/readme.md +++ b/playbooks/readme.md @@ -787,4 +787,6 @@ The Ansible NVIDIA Cloud Native Stack uninstall playbook will do the following: ### Getting Help -Please [open an issue on the GitHub project](https://github.com/NVIDIA/cloud-native-stack/issues) for any questions. Your feedback is appreciated. \ No newline at end of file +Please [open an issue on the GitHub project](https://github.com/NVIDIA/cloud-native-stack/issues) for any questions. Your feedback is appreciated. + +