From 0dcb377a941086c0411f572f1b5803665b43aeb6 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Mon, 12 Aug 2024 14:04:02 -0700 Subject: [PATCH] add the v24.6.1 OLM bundle Signed-off-by: Tariq Ibrahim (cherry picked from commit fb7a882f5d9ac5278fb5afe457c922535b7ca4f4) --- ...rator-certified.clusterserviceversion.yaml | 921 +++++++ .../manifests/nvidia.com_clusterpolicies.yaml | 2404 +++++++++++++++++ .../manifests/nvidia.com_nvidiadrivers.yaml | 810 ++++++ bundle/v24.6.1/metadata/annotations.yaml | 17 + 4 files changed, 4152 insertions(+) create mode 100644 bundle/v24.6.1/manifests/gpu-operator-certified.clusterserviceversion.yaml create mode 100644 bundle/v24.6.1/manifests/nvidia.com_clusterpolicies.yaml create mode 100644 bundle/v24.6.1/manifests/nvidia.com_nvidiadrivers.yaml create mode 100644 bundle/v24.6.1/metadata/annotations.yaml diff --git a/bundle/v24.6.1/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/v24.6.1/manifests/gpu-operator-certified.clusterserviceversion.yaml new file mode 100644 index 000000000..a4c4aec27 --- /dev/null +++ b/bundle/v24.6.1/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -0,0 +1,921 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: ClusterServiceVersion +metadata: + labels: + operatorframework.io/arch.arm64: supported + operatorframework.io/arch.amd64: supported + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/warn: privileged + annotations: + features.operators.openshift.io/disconnected: "true" + features.operators.openshift.io/fips-compliant: "false" + features.operators.openshift.io/proxy-aware: "true" + features.operators.openshift.io/tls-profiles: "false" + features.operators.openshift.io/token-auth-aws: "false" + features.operators.openshift.io/token-auth-azure: "false" + features.operators.openshift.io/token-auth-gcp: "false" + features.operators.openshift.io/cnf: "false" + features.operators.openshift.io/cni: "false" + features.operators.openshift.io/csi: "false" + olm.skipRange: '>=1.9.0 <24.6.1' + alm-examples: |- + [ + { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": { + "name": "gpu-cluster-policy" + }, + "spec": { + "operator": { + "defaultRuntime": "crio", + "use_ocp_driver_toolkit": true, + "initContainer": { + } + }, + "sandboxWorkloads": { + "enabled": false, + "defaultWorkload": "container" + }, + "driver": { + "enabled": true, + "useNvidiaDriverCRD": false, + "useOpenKernelModules": false, + "upgradePolicy": { + "autoUpgrade": true, + "drain": { + "deleteEmptyDir": false, + "enable": false, + "force": false, + "timeoutSeconds": 300 + }, + "maxParallelUpgrades": 1, + "maxUnavailable": "25%", + "podDeletion": { + "deleteEmptyDir": false, + "force": false, + "timeoutSeconds": 300 + }, + "waitForCompletion": { + "timeoutSeconds": 0 + } + }, + "repoConfig": { + "configMapName": "" + }, + "certConfig": { + "name": "" + }, + "licensingConfig": { + "nlsEnabled": true, + "configMapName": "" + }, + "virtualTopology": { + "config": "" + }, + "kernelModuleConfig": { + "name": "" + } + }, + "dcgmExporter": { + "enabled": true, + "config": { + "name": "" + }, + "serviceMonitor": { + "enabled": true + } + }, + "dcgm": { + "enabled": true + }, + "daemonsets": { + "updateStrategy": "RollingUpdate", + "rollingUpdate": { + "maxUnavailable": "1" + } + }, + "devicePlugin": { + "enabled": true, + "config": { + "name": "", + "default": "" + }, + "mps": { + "root": "/run/nvidia/mps" + } + }, + "gfd": { + "enabled": true + }, + "migManager": { + "enabled": true + }, + "nodeStatusExporter": { + "enabled": true + }, + "mig": { + "strategy": "single" + }, + "toolkit": { + "enabled": true + }, + "validator": { + "plugin": { + "env": [ + { + "name": "WITH_WORKLOAD", + "value": "false" + } + ] + } + }, + "vgpuManager": { + "enabled": false + }, + "vgpuDeviceManager": { + "enabled": true + }, + "sandboxDevicePlugin": { + "enabled": true + }, + "vfioManager": { + "enabled": true + }, + "gds": { + "enabled": false + }, + "gdrcopy": { + "enabled": false + } + } + }, + { + "apiVersion": "nvidia.com/v1alpha1", + "kind": "NVIDIADriver", + "metadata": { + "name": "gpu-driver" + }, + "spec": { + "driverType": "gpu", + "repository": "nvcr.io/nvidia", + "image": "driver", + "version": "sha256:858de27c152669f5a3cf4287406405b16dd5bb70c0373324eb735511997bb415", + "nodeSelector": {}, + "manager": {}, + "repoConfig": { + "name": "" + }, + "certConfig": { + "name": "" + }, + "licensingConfig": { + "nlsEnabled": true, + "name": "" + }, + "virtualTopologyConfig": { + "name": "" + }, + "kernelModuleConfig": { + "name": "" + } + } + } + ] + operators.operatorframework.io/builder: operator-sdk-v1.4.0 + operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 + operatorframework.io/suggested-namespace: nvidia-gpu-operator + capabilities: Deep Insights + categories: AI/Machine Learning, OpenShift Optional + certified: "true" + containerImage: nvcr.io/nvidia/gpu-operator@sha256:d51c3a34aaa9a5dfbdd3b710ee18d9eaa50aa0fb3518bacd541053d77c5c1098 + createdAt: "Mon Aug 12 11:35:29 PDT 2024" + description: Automate the management and monitoring of NVIDIA GPUs. + provider: NVIDIA + repository: http://github.com/NVIDIA/gpu-operator + support: NVIDIA + name: gpu-operator-certified.v24.6.1 + namespace: placeholder +spec: + apiservicedefinitions: {} + relatedImages: + - name: gpu-operator-image + image: nvcr.io/nvidia/gpu-operator@sha256:d51c3a34aaa9a5dfbdd3b710ee18d9eaa50aa0fb3518bacd541053d77c5c1098 + - name: dcgm-exporter-image + image: nvcr.io/nvidia/k8s/dcgm-exporter@sha256:21f4c8b88716e8e6f732f9fb4c2efaef937c227491a8631c5e55036f80f39a4d + - name: dcgm-image + image: nvcr.io/nvidia/cloud-native/dcgm@sha256:15dab1273345df4a5844c4c761d064dbc4b592101251dc39174e597137123027 + - name: container-toolkit-image + image: nvcr.io/nvidia/k8s/container-toolkit@sha256:f95ef6a0c377e011bc0561c7d2c2bf32e45106fb0ba91ae9a10f97236ded0581 + - name: driver-image + image: nvcr.io/nvidia/driver@sha256:858de27c152669f5a3cf4287406405b16dd5bb70c0373324eb735511997bb415 + - name: driver-image-535 + image: nvcr.io/nvidia/driver@sha256:a6d12fb5753f267dda25dfd38910f972bc632c006a24107fa50e20bba3642d7c + - name: driver-image-470 + image: nvcr.io/nvidia/driver@sha256:07e11f85d54d49ec9648fb06e148b8d832ee1f9c3549a915eee853c9ef2949c2 + - name: device-plugin-image + image: nvcr.io/nvidia/k8s-device-plugin@sha256:7ad2c9f71fe06f9f7745ac8635f46740fbdff4f11edd468addfab81afcdfa534 + - name: gpu-feature-discovery-image + image: nvcr.io/nvidia/k8s-device-plugin@sha256:7ad2c9f71fe06f9f7745ac8635f46740fbdff4f11edd468addfab81afcdfa534 + - name: mig-manager-image + image: nvcr.io/nvidia/cloud-native/k8s-mig-manager@sha256:781fb47e264d9e0fbc8da5bd046e5e678316c866bc36ddd4b56d4eb0de682d5b + - name: init-container-image + image: nvcr.io/nvidia/cuda@sha256:b0b6c9286f20432ba9becb711aff2d1c1bd56e47b33e6d1cab04aba926c067fe + - name: gpu-operator-validator-image + image: nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:0a48b6c65148358ab792b3dc23bce5d3e660e9176670f62864502f68647704f0 + - name: k8s-driver-manager-image + image: nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:740abc3ff657545c10effd5354f09af525200ed9a1b7623f0c2e8c7bd9e4a4e2 + - name: vfio-manager-image + image: nvcr.io/nvidia/cuda@sha256:b0b6c9286f20432ba9becb711aff2d1c1bd56e47b33e6d1cab04aba926c067fe + - name: sandbox-device-plugin-image + image: nvcr.io/nvidia/kubevirt-gpu-device-plugin@sha256:969147c01d63be5d1fe458f32f1cc0c7408cf3062531db91408e2fc57b4d8a67 + - name: vgpu-device-manager-image + image: nvcr.io/nvidia/cloud-native/vgpu-device-manager@sha256:ae63fac9a4057a7646f0cf0ee0566e8928529adde05c4c0a017cda0599e381b2 + - name: gdrcopy-image + image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:33de74eb590f071403c17b6c210c02963245851971168bc0c07c06c100a9f376 + customresourcedefinitions: + owned: + - name: nvidiadrivers.nvidia.com + kind: NVIDIADriver + version: v1alpha1 + displayName: NVIDIADriver + description: NVIDIADriver allows you to deploy the NVIDIA driver + resources: + - kind: ServiceAccount + name: '' + version: v1 + - kind: DaemonSet + name: '' + version: apps/v1 + - kind: ConfigMap + name: '' + version: v1 + - kind: Pod + name: '' + version: v1 + - kind: clusterpolicies + name: '' + version: v1 + - kind: clusterversions + name: '' + version: v1 + - kind: nodes + name: '' + version: v1 + - kind: status + name: '' + version: v1 + specDescriptors: + - description: 'Optional: Set Node affinity' + displayName: Node affinity + path: affinity + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:com.tectonic.ui:nodeAffinity' + - description: Node selector to control the selection of nodes (optional) + displayName: Node Selector + path: nodeSelector + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:selector:Node' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - description: 'Optional: Set tolerations' + displayName: Tolerations + path: tolerations + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:io.kubernetes:Tolerations' + - description: Image pull secrets + displayName: Image pull secrets + path: imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + statusDescriptors: + - description: The current state of the driver. + displayName: State + path: state + x-descriptors: + - 'urn:alm:descriptor:text' + - name: clusterpolicies.nvidia.com + kind: ClusterPolicy + version: v1 + group: nvidia.com + displayName: ClusterPolicy + description: ClusterPolicy allows you to configure the GPU Operator + resources: + - kind: ServiceAccount + name: '' + version: v1 + - kind: Deployment + name: '' + version: apps/v1 + - kind: DaemonSet + name: '' + version: apps/v1 + - kind: ConfigMap + name: '' + version: v1 + - kind: Pod + name: '' + version: v1 + - kind: clusterpolicies + name: '' + version: v1 + - kind: clusterversions + name: '' + version: v1 + - kind: nodes + name: '' + version: v1 + - kind: status + name: '' + version: v1 + specDescriptors: + - description: GPU Operator config + displayName: GPU Operator config + path: operator + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: operator.validator.imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - description: Image pull secrets + displayName: Image pull secrets + path: operator.validator.imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - description: NVIDIA GPU/vGPU Driver config + displayName: NVIDIA GPU/vGPU Driver config + path: driver + - description: 'Optional: Set Node affinity' + displayName: Node affinity + path: driver.affinity + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:com.tectonic.ui:nodeAffinity' + - description: Node selector to control the selection of nodes (optional) + displayName: Node Selector + path: driver.nodeSelector + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:selector:Node' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - description: 'Optional: Set tolerations' + displayName: Tolerations + path: driver.tolerations + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:io.kubernetes:Tolerations' + - description: Image pull secrets + displayName: Image pull secrets + path: driver.imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: driver.imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - description: NVIDIA DCGM Exporter config + displayName: NVIDIA DCGM Exporter config + path: dcgmExporter + - description: 'Optional: Set Node affinity' + displayName: Node affinity + path: dcgmExporter.affinity + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:com.tectonic.ui:nodeAffinity' + - description: Node selector to control the selection of nodes (optional) + displayName: Node Selector + path: dcgmExporter.nodeSelector + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:selector:Node' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - description: 'Optional: Set tolerations' + displayName: Tolerations + path: dcgmExporter.tolerations + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:io.kubernetes:Tolerations' + - description: Image pull secrets + displayName: Image pull secrets + path: dcgmExporter.imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: dcgmExporter.imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - description: NVIDIA Device Plugin config + displayName: NVIDIA Device Plugin config + path: devicePlugin + - description: 'Optional: Set Node affinity' + displayName: Node affinity + path: devicePlugin.affinity + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:com.tectonic.ui:nodeAffinity' + - description: Node selector to control the selection of nodes (optional) + displayName: Node Selector + path: devicePlugin.nodeSelector + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:selector:Node' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - description: 'Optional: Set tolerations' + displayName: Tolerations + path: devicePlugin.tolerations + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:io.kubernetes:Tolerations' + - description: Image pull secrets + displayName: Image pull secrets + path: devicePlugin.imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: devicePlugin.imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - description: GPU Feature Discovery Plugin config + displayName: GPU Feature Discovery Plugin config + path: gfd + - description: 'Optional: Set Node affinity' + displayName: Node affinity + path: gfd.affinity + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:com.tectonic.ui:nodeAffinity' + - description: Node selector to control the selection of nodes (optional) + displayName: Node Selector + path: gfd.nodeSelector + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:selector:Node' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - description: 'Optional: Set tolerations' + displayName: Tolerations + path: gfd.tolerations + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:io.kubernetes:Tolerations' + - description: Image pull secrets + displayName: Image pull secrets + path: gfd.imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: gfd.imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - description: NVIDIA Container Toolkit config + displayName: NVIDIA Container Toolkit config + path: toolkit + - description: 'Optional: Set Node affinity' + displayName: Node affinity + path: toolkit.affinity + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:com.tectonic.ui:nodeAffinity' + - description: Node selector to control the selection of nodes (optional) + displayName: Node Selector + path: toolkit.nodeSelector + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:selector:Node' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - description: 'Optional: Set tolerations' + displayName: Tolerations + path: toolkit.tolerations + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - 'urn:alm:descriptor:io.kubernetes:Tolerations' + - description: Image pull secrets + displayName: Image pull secrets + path: toolkit.imagePullSecrets + x-descriptors: + - 'urn:alm:descriptor:io.kubernetes:Secret' + - 'urn:alm:descriptor:com.tectonic.ui:advanced' + - displayName: ImagePullPolicy + description: 'Image pull policy (default: IfNotPresent)' + path: toolkit.imagePullPolicy + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - displayName: NVIDIA DCGM config + description: NVIDIA DCGM config + path: dcgm + - displayName: Validator config + description: Validator config + path: validator + - displayName: Node Status Exporter config + description: Node Status Exporter config + path: nodeStatusExporter + - displayName: Daemonsets config + description: Daemonsets config + path: daemonsets + - displayName: MIG config + description: MIG config + path: mig + - displayName: NVIDIA MIG Manager config + description: NVIDIA MIG Manager config + path: migManager + - displayName: PodSecurityPolicy config + description: PodSecurityPolicy config + path: psp + - displayName: NVIDIA GPUDirect Storage config + description: NVIDIA GPUDirect Storage config + path: gds + - displayName: Sandbox Workloads config + description: Sandbox Workloads config + path: sandboxWorkloads + - displayName: NVIDIA vGPU Manager config + description: NVIDIA vGPU Manager config + path: vgpuManager + - displayName: NVIDIA vGPU Device Manager config + description: NVIDIA vGPU Device Manager config + path: vgpuDeviceManager + - displayName: VFIO Manager config + description: VFIO Manager config + path: vfioManager + - displayName: NVIDIA Sandbox Device Plugin config + description: NVIDIA Sandbox Device Plugin config + path: sandboxDevicePlugin + statusDescriptors: + - description: The current state of the operator. + displayName: State + path: state + x-descriptors: + - 'urn:alm:descriptor:text' + displayName: NVIDIA GPU Operator + description: > + Kubernetes provides access to special hardware resources such as NVIDIA + GPUs, NICs, Infiniband adapters and other devices through the [device plugin + framework](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/). + However, configuring and managing nodes with these hardware resources + requires configuration of multiple software components such as drivers, + container runtimes or other libraries which are difficult and prone to + errors. + + The NVIDIA GPU Operator uses the [operator + framework](https://cloud.redhat.com/blog/introducing-the-operator-framework) within + Kubernetes to automate the management of all NVIDIA software components + needed to provision and monitor GPUs. + These components include the NVIDIA drivers (to enable CUDA), Kubernetes + device plugin for GPUs, the NVIDIA Container Runtime, automatic node + labelling and NVIDIA DCGM exporter. + + Visit the official site of the [GPU Operator](https://github.com/NVIDIA/gpu-operator) for more information. + For getting started with using the GPU Operator with OpenShift, see the instructions + [here](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/openshift/contents.html). + icon: + - base64data: iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAMAAACdt4HsAAAB2lBMVEUAAAD///8EBAN3uQACAgIAAAQJDQUCAgB1tgAHCQf+/v5Ufg5Hagxxqwt+xgJ3uAB9wwB4vQBRUVEeLA3e3t5nZ2coKCgODg4FBwZ9wwR6wAJ4vADz8/MbGxt5tw1vpw1/wgoOFwkLDwh9xQH5+fny8vLw8PDFxcWysrKFhYVvb282NjYyMjIqKioXFxdikxRYgxNCYxJQdhFqoQ9xrg16ugxyqgyAxQkEBQj7+/v29vbIyMhjY2NbW1tHR0cvLy8kJCQdHR0ZGRlKbxJ8uhFNcxFVgBAxSBBgkg93tQ50sA4qPg4XIg18vwsbKQsSGgsLCwsMEwqCyQeByQFztADPz8+/v7+6urqWlpZra2tKSkogICASEhJmmRE8XBA5VRA2UBBonA9biA9GaQ4sQg4jMw4mOQ0aJw2GzgsUHgttpAqJ0Ql/wQWG0AJ8vwF0uQCtra2jo6OQkJB9fX1VVVVCQkI9PT0iIiIUFBRSfBNgjhA7WRBGZw+GywmFzgaAyASBxQN2twDb29u2traenp6Kiop+fn53d3dzc3NyqRV4sxM/YBNAXRElNhBjlQ+IzA00TQ16vgxJbgp6vAl4tgJ3vgDs7Ozn5+fa2trS0tJCXRY6VBV6thSL1gf4nFdFAAAD80lEQVRYw+zSOXPaQBgG4He0LJJmbGRGDUIzuvgBQiAEPfcdwC33DTbUtmOwSyc+4iRucvzXRImLFJmRShc8xXbfu+9+szg4OHjjAsH/iFD49q7rqM6xc/wPtWyBhS8sC94ObWRCZDksh1+RzmcEfI0DoPrjylEkSTgViMs9udjYTwMG4Gf51Z1BM81ioRwit+QvgYsdUQZeKFr3ladyKXvVr+pAM5uKcmRLXFzoCIxn+0i/8lSaBMHnfi7qowfQuZnm3PuFPwGs13zD3NlViozY/z4YD6/TCQORbPr2q78GLB0ou5IO40pd5AxQZnJ83m2y9Ju2JYKfgEhWC18aEIfrZLURHwQC0B87ySZwHxX8BNDWB1KfQfyxT2TA24uPQMt8yTWA3obz8wQGlhTN06Z900MkuJLrYu3u5LkK9LTtGRF8NEDLeSnXYLUdHUFVlpPqTa4IamlhJZ464biY1w4CKGrROOW7uwLlV+Q02lanCF6cbSoPVLzUfPwDll5I9T6WyXWhZre1yjiI6VCSzCWY3+FKaAwGHngzpEygx6+V6Uzk6TJR7yhWxJ1bFgTPJ7gMc58aUCq+n+qNT6Pn8y/xOcCiZZVjnJ+AAPhEuj0SKZ9bL9ZpNS9SgM6z9p5w3jt43cMvecfWBhm7dtfEpfhYMDBYpFd7mDZIAxPCFKgBhB0hkWbE2wVMyqycfhOMEiebSzFz5IMTEjw7E87UFj4GVR7GXqaSkoIcISEc/I38/PwhOTUMRBrADgwK09zgYGUBqbwcARiQyp3Eyk6kC4BloqtbJTcaSHIHShALWFmBSRuCWBGC+AtDMAAGIpAAc9mBiB0sCLSXHUSygxSxEIoE7IKEgbhopKgogC96x04QCMMw/H0cG6f0cEmBHaLc7FFQzApoTLwtQgWUWo26glx2mzGkyoHM1PPMO/NrnSH8e2QAiRsZ8S3ZuJoW5Udg5moGoMRLN2gAnkcUctueJ1gADsdtlZ2AgmSYoaDZBXwRctcwy6HN3XX/wfnTnA7Q5x0S0Gku4wHpe7Ql8Mbtu4TqC3qcADGtUl4O3eK0AkZdKH1mU/a6MFQGA7pQGoAVoAuuPYZlLJF2BawVLLjwac6Q8wUax61/CpKQAT6ZX3hFqoqqAFvuf4AzM+NgsoBS/wcSOD7SFzyf6CE9UQK9II1MRvIJm8QSgsLiBZuypsAWKyARElgx5FcLv1N4nFLbB45Sh6+TzsQRtn7bz/B3fS9GQ12bgUE2PKycQbwgXD0SWLwVhpZFq4eHhWloOjLoqGvoRYRGAR2vp2EtpNUaTUpiRAizMAEhKNXpYZNnAUlBCSgFYTIxQTlMMJNGwSgYBdQHAFsKs+/bUkeyAAAAAElFTkSuQmCC + mediatype: image/png + install: + spec: + clusterPermissions: + - serviceAccountName: gpu-operator + rules: + - apiGroups: + - nvidia.com + resources: + - clusterpolicies + - clusterpolicies/finalizers + - clusterpolicies/status + - nvidiadrivers + - nvidiadrivers/finalizers + - nvidiadrivers/status + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - deletecollection + - apiGroups: + - config.openshift.io + resources: + - clusterversions + - proxies + verbs: + - get + - list + - watch + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - use + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - use + resourceNames: + - hostmount-anyuid + - apiGroups: + - image.openshift.io + resources: + - imagestreams + verbs: + - get + - list + - watch + - apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + - prometheusrules + verbs: + - get + - list + - create + - watch + - update + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - clusterrolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - apiGroups: + - "" + resources: + - pods + - pods/eviction + - services + - services/finalizers + - events + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - create + - watch + - update + - patch + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + - apiGroups: + - apps + resources: + - controllerrevisions + verbs: + - get + - list + - watch + - apiGroups: + - node.k8s.io + resources: + - runtimeclasses + verbs: + - get + - list + - create + - update + - watch + - delete + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + permissions: + - serviceAccountName: gpu-operator + rules: + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - apiGroups: + - apps + resources: + - daemonsets + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + - endpoints + - secrets + - serviceaccounts + verbs: + - create + - get + - list + - watch + - update + - patch + - delete + deployments: + - name: gpu-operator + spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: gpu-operator + app: gpu-operator + strategy: {} + template: + metadata: + labels: + app.kubernetes.io/component: gpu-operator + app: gpu-operator + nvidia.com/gpu-driver-upgrade-drain.skip: "true" + spec: + priorityClassName: system-node-critical + containers: + - args: + - --leader-elect + - --leader-lease-renew-deadline + - "60s" + image: nvcr.io/nvidia/gpu-operator@sha256:d51c3a34aaa9a5dfbdd3b710ee18d9eaa50aa0fb3518bacd541053d77c5c1098 + command: + - gpu-operator + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: gpu-operator + ports: + - name: metrics + containerPort: 8080 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: 500m + memory: 1Gi + requests: + cpu: 200m + memory: 200Mi + securityContext: + allowPrivilegeEscalation: false + volumeMounts: + - mountPath: /host-etc/os-release + name: host-os-release + readOnly: true + env: + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: "VALIDATOR_IMAGE" + value: "nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:0a48b6c65148358ab792b3dc23bce5d3e660e9176670f62864502f68647704f0" + - name: "GFD_IMAGE" + value: "nvcr.io/nvidia/k8s-device-plugin@sha256:7ad2c9f71fe06f9f7745ac8635f46740fbdff4f11edd468addfab81afcdfa534" + - name: "CONTAINER_TOOLKIT_IMAGE" + value: "nvcr.io/nvidia/k8s/container-toolkit@sha256:f95ef6a0c377e011bc0561c7d2c2bf32e45106fb0ba91ae9a10f97236ded0581" + - name: "DCGM_IMAGE" + value: "nvcr.io/nvidia/cloud-native/dcgm@sha256:15dab1273345df4a5844c4c761d064dbc4b592101251dc39174e597137123027" + - name: "DCGM_EXPORTER_IMAGE" + value: "nvcr.io/nvidia/k8s/dcgm-exporter@sha256:21f4c8b88716e8e6f732f9fb4c2efaef937c227491a8631c5e55036f80f39a4d" + - name: "DEVICE_PLUGIN_IMAGE" + value: "nvcr.io/nvidia/k8s-device-plugin@sha256:7ad2c9f71fe06f9f7745ac8635f46740fbdff4f11edd468addfab81afcdfa534" + - name: "DRIVER_IMAGE" + value: "nvcr.io/nvidia/driver@sha256:858de27c152669f5a3cf4287406405b16dd5bb70c0373324eb735511997bb415" + - name: "DRIVER_IMAGE-535" + value: "nvcr.io/nvidia/driver@sha256:a6d12fb5753f267dda25dfd38910f972bc632c006a24107fa50e20bba3642d7c" + - name: "DRIVER_IMAGE-470" + value: "nvcr.io/nvidia/driver@sha256:07e11f85d54d49ec9648fb06e148b8d832ee1f9c3549a915eee853c9ef2949c2" + - name: "DRIVER_MANAGER_IMAGE" + value: "nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:740abc3ff657545c10effd5354f09af525200ed9a1b7623f0c2e8c7bd9e4a4e2" + - name: "MIG_MANAGER_IMAGE" + value: "nvcr.io/nvidia/cloud-native/k8s-mig-manager@sha256:781fb47e264d9e0fbc8da5bd046e5e678316c866bc36ddd4b56d4eb0de682d5b" + - name: "CUDA_BASE_IMAGE" + value: "nvcr.io/nvidia/cuda@sha256:b0b6c9286f20432ba9becb711aff2d1c1bd56e47b33e6d1cab04aba926c067fe" + - name: "VFIO_MANAGER_IMAGE" + value: "nvcr.io/nvidia/cuda@sha256:b0b6c9286f20432ba9becb711aff2d1c1bd56e47b33e6d1cab04aba926c067fe" + - name: "SANDBOX_DEVICE_PLUGIN_IMAGE" + value: "nvcr.io/nvidia/kubevirt-gpu-device-plugin@sha256:969147c01d63be5d1fe458f32f1cc0c7408cf3062531db91408e2fc57b4d8a67" + - name: "VGPU_DEVICE_MANAGER_IMAGE" + value: "nvcr.io/nvidia/cloud-native/vgpu-device-manager@sha256:ae63fac9a4057a7646f0cf0ee0566e8928529adde05c4c0a017cda0599e381b2" + - name: "GDRCOPY_IMAGE" + value: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:33de74eb590f071403c17b6c210c02963245851971168bc0c07c06c100a9f376" + terminationGracePeriodSeconds: 10 + volumes: + - hostPath: + path: /etc/os-release + name: host-os-release + serviceAccountName: gpu-operator + strategy: deployment + installModes: + - supported: true + type: OwnNamespace + - supported: true + type: SingleNamespace + - supported: false + type: MultiNamespace + - supported: false + type: AllNamespaces + keywords: + - gpu + - cuda + - compute + - operator + - deep learning + - monitoring + - tesla + maintainers: + - name: NVIDIA + email: operator_feedback@nvidia.com + maturity: stable + provider: + name: NVIDIA Corporation + version: 24.6.1 + replaces: gpu-operator-certified.v24.6.0 diff --git a/bundle/v24.6.1/manifests/nvidia.com_clusterpolicies.yaml b/bundle/v24.6.1/manifests/nvidia.com_clusterpolicies.yaml new file mode 100644 index 000000000..8e29cabf1 --- /dev/null +++ b/bundle/v24.6.1/manifests/nvidia.com_clusterpolicies.yaml @@ -0,0 +1,2404 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.15.0 + name: clusterpolicies.nvidia.com +spec: + group: nvidia.com + names: + kind: ClusterPolicy + listKind: ClusterPolicyList + plural: clusterpolicies + singular: clusterpolicy + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: string + name: v1 + schema: + openAPIV3Schema: + description: ClusterPolicy is the Schema for the clusterpolicies API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ClusterPolicySpec defines the desired state of ClusterPolicy + properties: + ccManager: + description: CCManager component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + defaultMode: + description: Default CC mode setting for compatible GPUs on the + node + enum: + - "on" + - "off" + - devtools + type: string + enabled: + description: Enabled indicates if deployment of CC Manager is + enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: CC Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: CC Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: CC Manager image tag + type: string + type: object + cdi: + description: CDI configures how the Container Device Interface is + used in the cluster + properties: + default: + default: false + description: Default indicates whether to use CDI as the default + mechanism for providing GPU access to containers. + type: boolean + enabled: + default: false + description: Enabled indicates whether CDI can be used to make + GPUs accessible to containers. + type: boolean + type: object + daemonsets: + description: Daemonset defines common configuration for all Daemonsets + properties: + annotations: + additionalProperties: + type: string + description: |- + Optional: Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + type: object + labels: + additionalProperties: + type: string + description: |- + Optional: Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. + type: object + priorityClassName: + type: string + rollingUpdate: + description: 'Optional: Configuration for rolling update of all + DaemonSet pods' + properties: + maxUnavailable: + type: string + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + updateStrategy: + default: RollingUpdate + enum: + - RollingUpdate + - OnDelete + type: string + type: object + dcgm: + description: DCGM component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA DCGM Hostengine + as a separate pod is enabled. + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + hostPort: + description: 'Deprecated: HostPort represents host port that needs + to be bound for DCGM engine (Default: 5555)' + format: int32 + type: integer + image: + description: NVIDIA DCGM image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DCGM image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA DCGM image tag + type: string + type: object + dcgmExporter: + description: DCGMExporter spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: 'Optional: Custom metrics configuration for NVIDIA + DCGM Exporter' + properties: + name: + description: ConfigMap name with file dcgm-metrics.csv for + metrics to be collected by NVIDIA DCGM Exporter + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA DCGM Exporter + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA DCGM Exporter image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA DCGM Exporter image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + serviceMonitor: + description: 'Optional: ServiceMonitor configuration for NVIDIA + DCGM Exporter' + properties: + additionalLabels: + additionalProperties: + type: string + description: AdditionalLabels to add to ServiceMonitor instance + for NVIDIA DCGM Exporter + type: object + enabled: + description: Enabled indicates if ServiceMonitor is deployed + for NVIDIA DCGM Exporter + type: boolean + honorLabels: + description: HonorLabels chooses the metric’s labels on collisions + with target labels. + type: boolean + interval: + description: |- + Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used. + Supported units: y, w, d, h, m, s, ms + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + relabelings: + description: Relabelings allows to rewrite labels on metric + sets for NVIDIA DCGM Exporter + items: + description: |- + RelabelConfig allows dynamic rewriting of the label set for targets, alerts, + scraped samples and remote write samples. + + + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + properties: + action: + default: replace + description: |- + Action to perform based on the regex matching. + + + `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0. + `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0. + + + Default: "Replace" + enum: + - replace + - Replace + - keep + - Keep + - drop + - Drop + - hashmod + - HashMod + - labelmap + - LabelMap + - labeldrop + - LabelDrop + - labelkeep + - LabelKeep + - lowercase + - Lowercase + - uppercase + - Uppercase + - keepequal + - KeepEqual + - dropequal + - DropEqual + type: string + modulus: + description: |- + Modulus to take of the hash of the source label values. + + + Only applicable when the action is `HashMod`. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. + type: string + replacement: + description: |- + Replacement value against which a Replace action is performed if the + regular expression matches. + + + Regex capture groups are available. + type: string + separator: + description: Separator is the string between concatenated + SourceLabels. + type: string + sourceLabels: + description: |- + The source labels select values from existing labels. Their content is + concatenated using the configured Separator and matched against the + configured regular expression. + items: + description: |- + LabelName is a valid Prometheus label name which may only contain ASCII + letters, numbers, as well as underscores. + pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$ + type: string + type: array + targetLabel: + description: |- + Label to which the resulting string is written in a replacement. + + + It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`, + `KeepEqual` and `DropEqual` actions. + + + Regex capture groups are available. + type: string + type: object + type: array + type: object + version: + description: NVIDIA DCGM Exporter image tag + type: string + type: object + devicePlugin: + description: DevicePlugin component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: 'Optional: Configuration for the NVIDIA Device Plugin + via the ConfigMap' + properties: + default: + description: Default config name within the ConfigMap for + the NVIDIA Device Plugin config + type: string + name: + description: ConfigMap name for NVIDIA Device Plugin config + including shared config between plugin and GFD + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA Device + Plugin through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Device Plugin image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + mps: + description: 'Optional: MPS related configuration for the NVIDIA + Device Plugin' + properties: + root: + default: /run/nvidia/mps + description: Root defines the MPS root path on the host + type: string + type: object + repository: + description: NVIDIA Device Plugin image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA Device Plugin image tag + type: string + type: object + driver: + description: Driver component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + certConfig: + description: 'Optional: Custom certificates configuration for + NVIDIA Driver container' + properties: + name: + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA Driver + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + kernelModuleConfig: + description: 'Optional: Kernel module configuration parameters + for the NVIDIA Driver' + properties: + name: + type: string + type: object + licensingConfig: + description: 'Optional: Licensing configuration for NVIDIA vGPU + licensing' + properties: + configMapName: + type: string + nlsEnabled: + description: NLSEnabled indicates if NVIDIA Licensing System + is used for licensing. + type: boolean + type: object + livenessProbe: + description: NVIDIA Driver container liveness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + manager: + description: Manager represents configuration for NVIDIA Driver + Manager initContainer + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image + name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Managerrepository + path + type: string + version: + description: Version represents NVIDIA Driver Manager image + tag(version) + type: string + type: object + rdma: + description: GPUDirectRDMASpec defines the properties for nvidia-peermem + deployment + properties: + enabled: + description: Enabled indicates if GPUDirect RDMA is enabled + through GPU operator + type: boolean + useHostMofed: + description: UseHostMOFED indicates to use MOFED drivers directly + installed on the host to enable GPUDirect RDMA + type: boolean + type: object + readinessProbe: + description: NVIDIA Driver container readiness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + repoConfig: + description: 'Optional: Custom repo configuration for NVIDIA Driver + container' + properties: + configMapName: + type: string + type: object + repository: + description: NVIDIA Driver image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + startupProbe: + description: NVIDIA Driver container startup probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + upgradePolicy: + description: Driver auto-upgrade settings + properties: + autoUpgrade: + default: false + description: |- + AutoUpgrade is a global switch for automatic upgrade feature + if set to false all other options are ignored + type: boolean + drain: + description: DrainSpec describes configuration for node drain + during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + enable: + default: false + description: Enable indicates if node draining is allowed + during upgrade + type: boolean + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time + in seconds to wait before giving up drain, zero means + infinite + minimum: 0 + type: integer + type: object + maxParallelUpgrades: + default: 1 + description: |- + MaxParallelUpgrades indicates how many nodes can be upgraded in parallel + 0 means no limit, all nodes will be upgraded in parallel + minimum: 0 + type: integer + maxUnavailable: + anyOf: + - type: integer + - type: string + default: 25% + description: |- + MaxUnavailable is the maximum number of nodes with the driver installed, that can be unavailable during the upgrade. + Value can be an absolute number (ex: 5) or a percentage of total nodes at the start of upgrade (ex: 10%). + Absolute number is calculated from percentage by rounding up. + By default, a fixed value of 25% is used. + x-kubernetes-int-or-string: true + podDeletion: + description: PodDeletionSpec describes configuration for deletion + of pods using special resources during automatic upgrade + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the pod is deleted) + type: boolean + force: + default: false + description: Force indicates if force deletion is allowed + type: boolean + timeoutSeconds: + default: 300 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + waitForCompletion: + description: WaitForCompletionSpec describes the configuration + for waiting on job completions + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds to wait before giving up on pod termination, zero means + infinite + minimum: 0 + type: integer + type: object + type: object + useNvidiaDriverCRD: + description: UseNvidiaDriverCRD indicates if the deployment of + NVIDIA Driver is managed by the NVIDIADriver CRD type + type: boolean + useOpenKernelModules: + description: UseOpenKernelModules indicates if the open GPU kernel + modules should be used + type: boolean + usePrecompiled: + description: UsePrecompiled indicates if deployment of NVIDIA + Driver using pre-compiled modules is enabled + type: boolean + version: + description: NVIDIA Driver image tag + type: string + virtualTopology: + description: 'Optional: Virtual Topology Daemon configuration + for NVIDIA vGPU drivers' + properties: + config: + description: 'Optional: Config name representing virtual topology + daemon configuration file nvidia-topologyd.conf' + type: string + type: object + type: object + gdrcopy: + description: GDRCopy component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GDRCopy is enabled through GPU + Operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA GDRCopy driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA GDRCopy driver image repository + type: string + version: + description: NVIDIA GDRCopy driver image tag + type: string + type: object + gds: + description: GPUDirectStorage defines the spec for GDS components(Experimental) + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GPUDirect Storage is enabled + through GPU operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA GPUDirect Storage Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA GPUDirect Storage Driver image repository + type: string + version: + description: NVIDIA GPUDirect Storage Driver image tag + type: string + type: object + gfd: + description: GPUFeatureDiscovery spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of GPU Feature Discovery + Plugin is enabled. + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: GFD image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: GFD image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: GFD image tag + type: string + type: object + hostPaths: + description: HostPaths defines various paths on the host needed by + GPU Operator components + properties: + driverInstallDir: + description: |- + DriverInstallDir represents the root at which driver files including libraries, + config files, and executables can be found. + type: string + rootFS: + description: |- + RootFS represents the path to the root filesystem of the host. + This is used by components that need to interact with the host filesystem + and as such this must be a chroot-able filesystem. + Examples include the MIG Manager and Toolkit Container which may need to + stop, start, or restart systemd services. + type: string + type: object + kataManager: + description: KataManager component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: Kata Manager config + properties: + artifactsDir: + default: /opt/nvidia-gpu-operator/artifacts/runtimeclasses + description: |- + ArtifactsDir is the directory where kata artifacts (e.g. kernel / guest images, configuration, etc.) + are placed on the local filesystem. + type: string + runtimeClasses: + description: RuntimeClasses is a list of kata runtime classes + to configure. + items: + description: RuntimeClass defines the configuration for + a kata RuntimeClass + properties: + artifacts: + description: Artifacts are the kata artifacts associated + with the runtime class. + properties: + pullSecret: + description: PullSecret is the secret used to pull + the OCI artifact. + type: string + url: + description: |- + URL is the path to the OCI artifact (payload) containing all artifacts + associated with a kata runtime class. + type: string + required: + - url + type: object + name: + description: Name is the name of the kata runtime class. + type: string + nodeSelector: + additionalProperties: + type: string + description: |- + NodeSelector specifies the nodeSelector for the RuntimeClass object. + This ensures pods running with the RuntimeClass only get scheduled + onto nodes which support it. + type: object + required: + - artifacts + - name + type: object + type: array + type: object + enabled: + description: Enabled indicates if deployment of Kata Manager is + enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Kata Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Kata Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: Kata Manager image tag + type: string + type: object + mig: + description: MIG spec + properties: + strategy: + description: 'Optional: MIGStrategy to apply for GFD and NVIDIA + Device Plugin' + enum: + - none + - single + - mixed + type: string + type: object + migManager: + description: MIGManager for configuration to deploy MIG Manager + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: 'Optional: Custom mig-parted configuration for NVIDIA + MIG Manager container' + properties: + default: + default: all-disabled + description: Default MIG config to be applied on the node, + when there is no config specified with the node label nvidia.com/mig.config + enum: + - all-disabled + - "" + type: string + name: + default: default-mig-parted-config + description: ConfigMap name + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA MIG Manager + is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + gpuClientsConfig: + description: 'Optional: Custom gpu-clients configuration for NVIDIA + MIG Manager container' + properties: + name: + description: ConfigMap name + type: string + type: object + image: + description: NVIDIA MIG Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA MIG Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA MIG Manager image tag + type: string + type: object + nodeStatusExporter: + description: NodeStatusExporter spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of Node Status Exporter + is enabled. + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Node Status Exporter image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Node Status Exporterimage repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: Node Status Exporterimage tag + type: string + type: object + operator: + description: Operator component spec + properties: + annotations: + additionalProperties: + type: string + description: |- + Optional: Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + type: object + defaultRuntime: + default: docker + description: Runtime defines container runtime type + enum: + - docker + - crio + - containerd + type: string + initContainer: + description: InitContainerSpec describes configuration for initContainer + image used with all components + properties: + image: + description: Image represents image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents image repository path + type: string + version: + description: Version represents image tag(version) + type: string + type: object + labels: + additionalProperties: + type: string + description: |- + Optional: Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. + type: object + runtimeClass: + default: nvidia + type: string + use_ocp_driver_toolkit: + description: UseOpenShiftDriverToolkit indicates if DriverToolkit + image should be used on OpenShift to build and install driver + modules + type: boolean + required: + - defaultRuntime + type: object + psa: + description: PSA defines spec for PodSecurityAdmission configuration + properties: + enabled: + description: Enabled indicates if PodSecurityAdmission configuration + needs to be enabled for all Pods + type: boolean + type: object + psp: + description: |- + Deprecated: Pod Security Policies are no longer supported. Please use PodSecurityAdmission instead + PSP defines spec for handling PodSecurityPolicies + properties: + enabled: + description: Enabled indicates if PodSecurityPolicies needs to + be enabled for all Pods + type: boolean + type: object + sandboxDevicePlugin: + description: SandboxDevicePlugin component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA Sandbox + Device Plugin through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Sandbox Device Plugin image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA Sandbox Device Plugin image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA Sandbox Device Plugin image tag + type: string + type: object + sandboxWorkloads: + description: SandboxWorkloads defines the spec for handling sandbox + workloads (i.e. Virtual Machines) + properties: + defaultWorkload: + default: container + description: |- + DefaultWorkload indicates the default GPU workload type to configure + worker nodes in the cluster for + enum: + - container + - vm-passthrough + - vm-vgpu + type: string + enabled: + description: |- + Enabled indicates if the GPU Operator should manage additional operands required + for sandbox workloads (i.e. VFIO Manager, vGPU Manager, and additional device plugins) + type: boolean + type: object + toolkit: + description: Toolkit component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA Container + Toolkit through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA Container Toolkit image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + installDir: + default: /usr/local/nvidia + description: Toolkit install directory on the host + type: string + repository: + description: NVIDIA Container Toolkit image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA Container Toolkit image tag + type: string + type: object + validator: + description: Validator defines the spec for operator-validator daemonset + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + cuda: + description: CUDA validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + driver: + description: Toolkit validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Validator image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + plugin: + description: Plugin validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + repository: + description: Validator image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + toolkit: + description: Toolkit validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + version: + description: Validator image tag + type: string + vfioPCI: + description: VfioPCI validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + vgpuDevices: + description: VGPUDevices validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + vgpuManager: + description: VGPUManager validator spec + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + type: object + type: object + vfioManager: + description: VFIOManager for configuration to deploy VFIO-PCI Manager + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + driverManager: + description: DriverManager represents configuration for NVIDIA + Driver Manager + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image + name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Managerrepository + path + type: string + version: + description: Version represents NVIDIA Driver Manager image + tag(version) + type: string + type: object + enabled: + description: Enabled indicates if deployment of VFIO Manager is + enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: VFIO Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: VFIO Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: VFIO Manager image tag + type: string + type: object + vgpuDeviceManager: + description: VGPUDeviceManager spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + config: + description: NVIDIA vGPU devices configuration for NVIDIA vGPU + Device Manager container + properties: + default: + default: default + description: Default config name within the ConfigMap + type: string + name: + description: ConfigMap name + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA vGPU Device + Manager is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA vGPU Device Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA vGPU Device Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA vGPU Device Manager image tag + type: string + type: object + vgpuManager: + description: VGPUManager component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + driverManager: + description: DriverManager represents configuration for NVIDIA + Driver Manager initContainer + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image + name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Managerrepository + path + type: string + version: + description: Version represents NVIDIA Driver Manager image + tag(version) + type: string + type: object + enabled: + description: Enabled indicates if deployment of NVIDIA vGPU Manager + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA vGPU Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA vGPU Manager image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA vGPU Manager image tag + type: string + type: object + required: + - daemonsets + - dcgm + - dcgmExporter + - devicePlugin + - driver + - gfd + - nodeStatusExporter + - operator + - toolkit + type: object + status: + description: ClusterPolicyStatus defines the observed state of ClusterPolicy + properties: + conditions: + description: Conditions is a list of conditions representing the ClusterPolicy's + current state. + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + namespace: + description: Namespace indicates a namespace in which the operator + is installed + type: string + state: + description: State indicates status of ClusterPolicy + enum: + - ignored + - ready + - notReady + type: string + required: + - state + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/bundle/v24.6.1/manifests/nvidia.com_nvidiadrivers.yaml b/bundle/v24.6.1/manifests/nvidia.com_nvidiadrivers.yaml new file mode 100644 index 000000000..665088edd --- /dev/null +++ b/bundle/v24.6.1/manifests/nvidia.com_nvidiadrivers.yaml @@ -0,0 +1,810 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.15.0 + name: nvidiadrivers.nvidia.com +spec: + group: nvidia.com + names: + kind: NVIDIADriver + listKind: NVIDIADriverList + plural: nvidiadrivers + shortNames: + - nvd + - nvdriver + - nvdrivers + singular: nvidiadriver + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.state + name: Status + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: NVIDIADriver is the Schema for the nvidiadrivers API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NVIDIADriverSpec defines the desired state of NVIDIADriver + properties: + annotations: + additionalProperties: + type: string + description: |- + Optional: Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + type: object + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + certConfig: + description: 'Optional: Custom certificates configuration for NVIDIA + Driver container' + properties: + name: + type: string + type: object + driverType: + default: gpu + description: DriverType defines NVIDIA driver type + enum: + - gpu + - vgpu + - vgpu-host-manager + type: string + x-kubernetes-validations: + - message: driverType is an immutable field. Please create a new NvidiaDriver + resource instead when you want to change this setting. + rule: self == oldSelf + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present in + a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + gdrcopy: + description: GDRCopy defines the spec for GDRCopy driver + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GDRCopy is enabled through GPU + operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: GDRCopy driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: GDRCopy diver image repository + type: string + version: + description: GDRCopy driver image tag + type: string + type: object + gds: + description: GPUDirectStorage defines the spec for GDS driver + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if GPUDirect Storage is enabled + through GPU operator + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA GPUDirect Storage Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA GPUDirect Storage Driver image repository + type: string + version: + description: NVIDIA GPUDirect Storage Driver image tag + type: string + type: object + image: + default: nvcr.io/nvidia/driver + description: NVIDIA Driver container image name + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + kernelModuleConfig: + description: 'Optional: Kernel module configuration parameters for + the NVIDIA Driver' + properties: + name: + type: string + type: object + labels: + additionalProperties: + type: string + description: |- + Optional: Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. + type: object + licensingConfig: + description: 'Optional: Licensing configuration for NVIDIA vGPU licensing' + properties: + name: + type: string + nlsEnabled: + description: NLSEnabled indicates if NVIDIA Licensing System is + used for licensing. + type: boolean + type: object + livenessProbe: + description: NVIDIA Driver container liveness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + manager: + description: Manager represents configuration for NVIDIA Driver Manager + initContainer + properties: + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: Image represents NVIDIA Driver Manager image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: Repository represents Driver Managerrepository path + type: string + version: + description: Version represents NVIDIA Driver Manager image tag(version) + type: string + type: object + nodeAffinity: + description: Affinity specifies node affinity rules for driver pods + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: |- + The scheduler will prefer to schedule pods to nodes that satisfy + the affinity expressions specified by this field, but it may choose + a node that violates one or more of the expressions. The node that is + most preferred is the one with the greatest sum of weights, i.e. + for each node that meets all of the scheduling requirements (resource + request, requiredDuringScheduling affinity expressions, etc.), + compute a sum by iterating through the elements of this field and adding + "weight" to the sum if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: |- + An empty preferred scheduling term matches all objects with implicit weight 0 + (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). + properties: + preference: + description: A node selector term, associated with the corresponding + weight. + properties: + matchExpressions: + description: A list of node selector requirements by + node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector requirements by + node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - preference + - weight + type: object + type: array + x-kubernetes-list-type: atomic + requiredDuringSchedulingIgnoredDuringExecution: + description: |- + If the affinity requirements specified by this field are not met at + scheduling time, the pod will not be scheduled onto the node. + If the affinity requirements specified by this field cease to be met + at some point during pod execution (e.g. due to an update), the system + may or may not try to eventually evict the pod from its node. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The + terms are ORed. + items: + description: |- + A null or empty node selector term matches no objects. The requirements of + them are ANDed. + The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements by + node's labels. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchFields: + description: A list of node selector requirements by + node's fields. + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies + to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + type: object + x-kubernetes-map-type: atomic + type: array + x-kubernetes-list-type: atomic + required: + - nodeSelectorTerms + type: object + x-kubernetes-map-type: atomic + type: object + nodeSelector: + additionalProperties: + type: string + description: NodeSelector specifies a selector for installation of + NVIDIA driver + type: object + priorityClassName: + description: 'Optional: Set priorityClassName' + type: string + rdma: + description: GPUDirectRDMA defines the spec for NVIDIA Peer Memory + driver + properties: + enabled: + description: Enabled indicates if GPUDirect RDMA is enabled through + GPU operator + type: boolean + useHostMofed: + description: UseHostMOFED indicates to use MOFED drivers directly + installed on the host to enable GPUDirect RDMA + type: boolean + type: object + readinessProbe: + description: NVIDIA Driver container readiness probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + repoConfig: + description: 'Optional: Custom repo configuration for NVIDIA Driver + container' + properties: + name: + type: string + type: object + repository: + description: NVIDIA Driver repository + type: string + resources: + description: 'Optional: Define resources requests and limits for each + pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + startupProbe: + description: NVIDIA Driver container startup probe settings + properties: + failureThreshold: + description: |- + Minimum consecutive failures for the probe to be considered failed after having succeeded. + Defaults to 3. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + initialDelaySeconds: + description: |- + Number of seconds after the container has started before liveness probes are initiated. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + type: integer + periodSeconds: + description: |- + How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + successThreshold: + description: |- + Minimum consecutive successes for the probe to be considered successful after having failed. + Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. + format: int32 + minimum: 1 + type: integer + timeoutSeconds: + description: |- + Number of seconds after which the probe times out. + Defaults to 1 second. Minimum value is 1. + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes + format: int32 + minimum: 1 + type: integer + type: object + tolerations: + description: 'Optional: Set tolerations' + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + useOpenKernelModules: + description: UseOpenKernelModules indicates if the open GPU kernel + modules should be used + type: boolean + usePrecompiled: + description: UsePrecompiled indicates if deployment of NVIDIA Driver + using pre-compiled modules is enabled + type: boolean + x-kubernetes-validations: + - message: usePrecompiled is an immutable field. Please create a new + NvidiaDriver resource instead when you want to change this setting. + rule: self == oldSelf + version: + description: NVIDIA Driver version (or just branch for precompiled + drivers) + type: string + virtualTopologyConfig: + description: 'Optional: Virtual Topology Daemon configuration for + NVIDIA vGPU drivers' + properties: + name: + description: 'Optional: Config name representing virtual topology + daemon configuration file nvidia-topologyd.conf' + type: string + type: object + required: + - driverType + - image + type: object + status: + description: NVIDIADriverStatus defines the observed state of NVIDIADriver + properties: + conditions: + description: Conditions is a list of conditions representing the NVIDIADriver's + current state. + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + namespace: + description: Namespace indicates a namespace in which the operator + and driver are installed + type: string + state: + description: |- + INSERT ADDITIONAL STATUS FIELD - define observed state of cluster + Important: Run "make" to regenerate code after modifying this file + State indicates status of NVIDIADriver instance + enum: + - ignored + - ready + - notReady + type: string + required: + - state + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/bundle/v24.6.1/metadata/annotations.yaml b/bundle/v24.6.1/metadata/annotations.yaml new file mode 100644 index 000000000..f7383d5c4 --- /dev/null +++ b/bundle/v24.6.1/metadata/annotations.yaml @@ -0,0 +1,17 @@ +annotations: + operators.operatorframework.io.bundle.channels.v1: stable,v24.6 + operators.operatorframework.io.bundle.channel.default.v1: v24.6 + operators.operatorframework.io.bundle.manifests.v1: manifests/ + operators.operatorframework.io.bundle.mediatype.v1: registry+v1 + operators.operatorframework.io.bundle.metadata.v1: metadata/ + operators.operatorframework.io.bundle.package.v1: gpu-operator-certified + operators.operatorframework.io.metrics.builder: operator-sdk-v1.4.0 + operators.operatorframework.io.metrics.mediatype.v1: metrics+v1 + operators.operatorframework.io.metrics.project_layout: go.kubebuilder.io/v3 + operators.operatorframework.io.test.config.v1: tests/scorecard/ + operators.operatorframework.io.test.mediatype.v1: scorecard+v1 + operatorframework.io/cluster-monitoring: "true" + operatorframework.io/suggested-namespace: nvidia-gpu-operator + + # Annotations to specify OCP versions compatibility. + com.redhat.openshift.versions: v4.12-v4.16