diff --git a/deployments/gpu-operator/Chart.lock b/deployments/gpu-operator/Chart.lock index 5d1a7d3dc..9c36f7956 100644 --- a/deployments/gpu-operator/Chart.lock +++ b/deployments/gpu-operator/Chart.lock @@ -1,6 +1,6 @@ dependencies: - name: node-feature-discovery repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts - version: 0.16.6 -digest: sha256:e7b02cbdf9daff49892c0b74c50da2ed11e18eff2105a1b1abc9a8f2ebd8be47 -generated: "2024-10-31T07:12:50.141904-07:00" + version: 0.17.0 +digest: sha256:21baa50c4947a80eb075f1db42f9521672dbbcbbea309b0f2d6d9c05fbdd8a65 +generated: "2024-12-24T11:04:04.635377-08:00" diff --git a/deployments/gpu-operator/Chart.yaml b/deployments/gpu-operator/Chart.yaml index 59f9e6904..c9dbcc5ba 100644 --- a/deployments/gpu-operator/Chart.yaml +++ b/deployments/gpu-operator/Chart.yaml @@ -19,6 +19,6 @@ keywords: dependencies: - name: node-feature-discovery - version: v0.16.6 + version: v0.17.0 repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts condition: nfd.enabled diff --git a/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml b/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml index 7656c732f..1f72fa8f7 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: v0.16.6 +appVersion: v0.17.0 description: 'Detects hardware features available on each node in a Kubernetes cluster, and advertises those features using node labels. 
' home: https://github.com/kubernetes-sigs/node-feature-discovery @@ -11,4 +11,4 @@ name: node-feature-discovery sources: - https://github.com/kubernetes-sigs/node-feature-discovery type: application -version: 0.16.6 +version: 0.17.0 diff --git a/deployments/gpu-operator/charts/node-feature-discovery/README.md b/deployments/gpu-operator/charts/node-feature-discovery/README.md index 93734f8b7..02f7b1707 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/README.md +++ b/deployments/gpu-operator/charts/node-feature-discovery/README.md @@ -6,5 +6,5 @@ labels. NFD provides flexible configuration and extension points for a wide range of vendor and application specific node labeling needs. See -[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html) +[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.17/deployment/helm.html) for deployment instructions. diff --git a/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml b/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml index 0a73c5dca..9f62da6f6 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.16.3 name: nodefeatures.nfd.k8s-sigs.io spec: group: nfd.k8s-sigs.io @@ -69,8 +69,9 @@ spec: properties: elements: additionalProperties: - description: Nil is a dummy empty struct for protobuf - compatibility + description: |- + Nil is a dummy empty struct for protobuf compatibility. + NOTE: protobuf definitions have been removed but this is kept for API compatibility. type: object description: Individual features of the feature set. 
type: object @@ -124,7 +125,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.16.3 name: nodefeaturegroups.nfd.k8s-sigs.io spec: group: nfd.k8s-sigs.io @@ -395,7 +396,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.14.0 + controller-gen.kubebuilder.io/version: v0.16.3 name: nodefeaturerules.nfd.k8s-sigs.io spec: group: nfd.k8s-sigs.io diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml deleted file mode 100644 index 2d1576022..000000000 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml +++ /dev/null @@ -1,80 +0,0 @@ -{{- if .Values.tls.certManager }} -{{- if .Values.master.enable }} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-master-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - secretName: nfd-master-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-master - dnsNames: - # must match the service name - - {{ include "node-feature-discovery.fullname" . }}-master - # first one is configured for use by the worker; below are for completeness - - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc - - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . 
}}.svc.cluster.local - issuerRef: - name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} - {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} - kind: {{ .Values.tls.certManagerCertificate.issuerKind }} - {{- else }} - kind: Issuer - {{- end }} - group: cert-manager.io -{{- end }} ---- -{{- if .Values.worker.enable }} -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-worker-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - secretName: nfd-worker-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-worker - dnsNames: - - {{ include "node-feature-discovery.fullname" . }}-worker.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local - issuerRef: - name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} - {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} - kind: {{ .Values.tls.certManagerCertificate.issuerKind }} - {{- else }} - kind: Issuer - {{- end }} - group: cert-manager.io -{{- end }} - -{{- if .Values.topologyUpdater.enable }} ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-topology-updater-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - secretName: nfd-topology-updater-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-topology-updater - dnsNames: - - {{ include "node-feature-discovery.fullname" . }}-topology-updater.{{ include "node-feature-discovery.namespace" . 
}}.svc.cluster.local - issuerRef: - name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} - {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} - kind: {{ .Values.tls.certManagerCertificate.issuerKind }} - {{- else }} - kind: Issuer - {{- end }} - group: cert-manager.io -{{- end }} - -{{- end }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml deleted file mode 100644 index 874468908..000000000 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml +++ /dev/null @@ -1,42 +0,0 @@ -{{- if and .Values.tls.certManager (not .Values.tls.certManagerCertificate.issuerName ) }} -# See https://cert-manager.io/docs/configuration/selfsigned/#bootstrapping-ca-issuers -# - Create a self signed issuer -# - Use this to create a CA cert -# - Use this to now create a CA issuer ---- -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: nfd-ca-bootstrap - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - selfSigned: {} - ---- -apiVersion: cert-manager.io/v1 -kind: Certificate -metadata: - name: nfd-ca-cert - namespace: {{ include "node-feature-discovery.namespace" . }} -spec: - isCA: true - secretName: nfd-ca-cert - subject: - organizations: - - node-feature-discovery - commonName: nfd-ca-cert - issuerRef: - name: nfd-ca-bootstrap - kind: Issuer - group: cert-manager.io - ---- -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: nfd-ca-issuer - namespace: {{ include "node-feature-discovery.namespace" . 
}} -spec: - ca: - secretName: nfd-ca-cert -{{- end }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml index f935cfe41..ea6e3e30f 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml @@ -6,6 +6,13 @@ metadata: labels: {{- include "node-feature-discovery.labels" . | nindent 4 }} rules: +- apiGroups: + - "" + resources: + - namespaces + verbs: + - watch + - list - apiGroups: - "" resources: @@ -94,7 +101,7 @@ rules: - update {{- end }} -{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +{{- if and .Values.gc.enable .Values.gc.rbac.create }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml index 3f717988b..8331019dd 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml @@ -33,7 +33,7 @@ subjects: namespace: {{ include "node-feature-discovery.namespace" . 
}} {{- end }} -{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +{{- if and .Values.gc.enable .Values.gc.rbac.create }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml index 733131a03..da3ca2408 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml @@ -23,10 +23,11 @@ spec: labels: {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} role: master - {{- with .Values.master.annotations }} annotations: + checksum/config: {{ include (print $.Template.BasePath "/nfd-master-conf.yaml") . | sha256sum }} + {{- with .Values.master.annotations }} {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} spec: {{- with .Values.priorityClassName }} priorityClassName: {{ . }} @@ -46,16 +47,58 @@ spec: {{- toYaml .Values.master.securityContext | nindent 12 }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + startupProbe: + grpc: + port: {{ .Values.master.healthPort | default "8082" }} + {{- with .Values.master.startupProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.master.startupProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.master.startupProbe.periodSeconds }} + periodSeconds: {{ . }} + {{- end }} + {{- with .Values.master.startupProbe.timeoutSeconds }} + timeoutSeconds: {{ . 
}} + {{- end }} livenessProbe: - {{- toYaml .Values.master.livenessProbe | nindent 12 }} + grpc: + port: {{ .Values.master.healthPort | default "8082" }} + {{- with .Values.master.livenessProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.master.livenessProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.master.livenessProbe.periodSeconds }} + periodSeconds: {{ . }} + {{- end }} + {{- with .Values.master.livenessProbe.timeoutSeconds }} + timeoutSeconds: {{ . }} + {{- end }} readinessProbe: - {{- toYaml .Values.master.readinessProbe | nindent 12 }} + grpc: + port: {{ .Values.master.healthPort | default "8082" }} + {{- with .Values.master.readinessProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.master.readinessProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.master.readinessProbe.periodSeconds }} + periodSeconds: {{ . }} + {{- end }} + {{- with .Values.master.readinessProbe.timeoutSeconds }} + timeoutSeconds: {{ . }} + {{- end }} + {{- with .Values.master.readinessProbe.successThreshold }} + successThreshold: {{ . 
}} + {{- end }} ports: - - containerPort: {{ .Values.master.port | default "8080" }} - name: grpc - containerPort: {{ .Values.master.metricsPort | default "8081" }} name: metrics - - containerPort: {{ .Values.master.healthPort | default "8082" }} + - containerPort: {{ .Values.master.healthPort | default "8082" }} name: health env: - name: NODE_NAME @@ -73,29 +116,16 @@ spec: {{- if .Values.master.instance | empty | not }} - "-instance={{ .Values.master.instance }}" {{- end }} - {{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} - - "-port={{ .Values.master.port | default "8080" }}" - {{- else if gt (int .Values.master.replicaCount) 1 }} - "-enable-leader-election" - {{- end }} {{- if .Values.master.extraLabelNs | empty | not }} - "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}" {{- end }} {{- if .Values.master.denyLabelNs | empty | not }} - "-deny-label-ns={{- join "," .Values.master.denyLabelNs }}" {{- end }} - {{- if .Values.master.resourceLabels | empty | not }} - - "-resource-labels={{- join "," .Values.master.resourceLabels }}" - {{- end }} {{- if .Values.master.enableTaints }} - "-enable-taints" {{- end }} - {{- if .Values.master.crdController | kindIs "invalid" | not }} - - "-crd-controller={{ .Values.master.crdController }}" - {{- else }} - ## By default, disable crd controller for other than the default instances - - "-crd-controller={{ .Values.master.instance | empty }}" - {{- end }} {{- if .Values.master.featureRulesController | kindIs "invalid" | not }} - "-featurerules-controller={{ .Values.master.featureRulesController }}" {{- end }} @@ -105,32 +135,20 @@ spec: {{- if .Values.master.nfdApiParallelism | empty | not }} - "-nfd-api-parallelism={{ .Values.master.nfdApiParallelism }}" {{- end }} - {{- if .Values.tls.enable }} - - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - 
"-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" - {{- end }} # Go over featureGates and add the feature-gate flag {{- range $key, $value := .Values.featureGates }} - "-feature-gates={{ $key }}={{ $value }}" {{- end }} - "-metrics={{ .Values.master.metricsPort | default "8081" }}" - - "-grpc-health={{ .Values.master.healthPort | default "8082" }}" - volumeMounts: - {{- if .Values.tls.enable }} - - name: nfd-master-cert - mountPath: "/etc/kubernetes/node-feature-discovery/certs" - readOnly: true + - "-grpc-health={{ .Values.master.healthPort | default "8082" }}" + {{- with .Values.master.extraArgs }} + {{- toYaml . | nindent 12 }} {{- end }} + volumeMounts: - name: nfd-master-conf mountPath: "/etc/kubernetes/node-feature-discovery" readOnly: true volumes: - {{- if .Values.tls.enable }} - - name: nfd-master-cert - secret: - secretName: nfd-master-cert - {{- end }} - name: nfd-master-conf configMap: name: {{ include "node-feature-discovery.fullname" . }}-master-conf diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml index 375f93827..3642aa642 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.gc.enable (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) -}} +{{- if and .Values.gc.enable -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -58,6 +58,9 @@ spec: {{- if .Values.gc.interval | empty | not }} - "-gc-interval={{ .Values.gc.interval }}" {{- end }} + {{- with .Values.gc.extraArgs }} + {{- toYaml . 
| nindent 10 }} + {{- end }} resources: {{- toYaml .Values.gc.resources | nindent 12 }} securityContext: diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml deleted file mode 100644 index 7191dca70..000000000 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{{- if and (not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi)) .Values.master.enable }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "node-feature-discovery.fullname" . }}-master - namespace: {{ include "node-feature-discovery.namespace" . }} - labels: - {{- include "node-feature-discovery.labels" . | nindent 4 }} - role: master -spec: - type: {{ .Values.master.service.type }} - ports: - - port: {{ .Values.master.service.port | default "8080" }} - targetPort: grpc - protocol: TCP - name: grpc - selector: - {{- include "node-feature-discovery.selectorLabels" . 
| nindent 4 }} - role: master -{{- end}} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml index 59edc5e6c..47c75a7e5 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml @@ -27,7 +27,7 @@ metadata: {{- end }} {{- end }} -{{- if and .Values.gc.enable .Values.gc.serviceAccount.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +{{- if and .Values.gc.enable .Values.gc.serviceAccount.create }} --- apiVersion: v1 kind: ServiceAccount diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml index ba0214c88..9a466f88e 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml @@ -22,10 +22,11 @@ spec: labels: {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} role: topology-updater - {{- with .Values.topologyUpdater.annotations }} annotations: + checksum/config: {{ include (print $.Template.BasePath "/nfd-topologyupdater-conf.yaml") . | sha256sum }} + {{- with .Values.topologyUpdater.annotations }} {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} spec: serviceAccountName: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . 
}} dnsPolicy: ClusterFirstWithHostNet @@ -44,9 +45,38 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" livenessProbe: - {{- toYaml .Values.topologyUpdater.livenessProbe | nindent 10 }} + grpc: + port: {{ .Values.topologyUpdater.healthPort | default "8082" }} + {{- with .Values.topologyUpdater.livenessProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.livenessProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.livenessProbe.periodSeconds }} + periodSeconds: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.livenessProbe.timeoutSeconds }} + timeoutSeconds: {{ . }} + {{- end }} readinessProbe: - {{- toYaml .Values.topologyUpdater.readinessProbe | nindent 10 }} + grpc: + port: {{ .Values.topologyUpdater.healthPort | default "8082" }} + {{- with .Values.topologyUpdater.readinessProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.readinessProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.readinessProbe.periodSeconds }} + periodSeconds: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.readinessProbe.timeoutSeconds }} + timeoutSeconds: {{ . }} + {{- end }} + {{- with .Values.topologyUpdater.readinessProbe.successThreshold }} + successThreshold: {{ . 
}} + {{- end }} env: - name: NODE_NAME valueFrom: @@ -73,11 +103,6 @@ spec: {{- else }} - "-watch-namespace=*" {{- end }} - {{- if .Values.tls.enable }} - - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" - {{- end }} {{- if not .Values.topologyUpdater.podSetFingerprint }} - "-pods-fingerprint=false" {{- end }} @@ -88,12 +113,15 @@ spec: # Disable kubelet state tracking by giving an empty path - "-kubelet-state-dir=" {{- end }} - - -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}} - - "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}" + - "-metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}" + - "-grpc-health={{ .Values.topologyUpdater.healthPort | default "8082" }}" + {{- with .Values.topologyUpdater.extraArgs }} + {{- toYaml . | nindent 10 }} + {{- end }} ports: - containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}} name: metrics - - containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }} + - containerPort: {{ .Values.topologyUpdater.healthPort | default "8082" }} name: health volumeMounts: {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} @@ -109,11 +137,6 @@ spec: mountPath: /host-var/lib/kubelet readOnly: true {{- end }} - {{- if .Values.tls.enable }} - - name: nfd-topology-updater-cert - mountPath: "/etc/kubernetes/node-feature-discovery/certs" - readOnly: true - {{- end }} - name: nfd-topology-updater-conf mountPath: "/etc/kubernetes/node-feature-discovery" readOnly: true @@ -149,12 +172,6 @@ spec: items: - key: nfd-topology-updater.conf path: nfd-topology-updater.conf - {{- if .Values.tls.enable }} - - name: nfd-topology-updater-cert - secret: - secretName: nfd-topology-updater-cert - {{- end }} - {{- with .Values.topologyUpdater.nodeSelector }} nodeSelector: diff --git 
a/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml index 755466c75..5fd1ab744 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml @@ -22,10 +22,11 @@ spec: labels: {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} role: worker - {{- with .Values.worker.annotations }} annotations: + checksum/config: {{ include (print $.Template.BasePath "/nfd-worker-conf.yaml") . | sha256sum }} + {{- with .Values.worker.annotations }} {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} spec: dnsPolicy: ClusterFirstWithHostNet {{- with .Values.priorityClassName }} @@ -46,9 +47,38 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: - {{- toYaml .Values.worker.livenessProbe | nindent 12 }} + grpc: + port: {{ .Values.worker.healthPort | default "8082" }} + {{- with .Values.worker.livenessProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.worker.livenessProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.worker.livenessProbe.periodSeconds }} + periodSeconds: {{ . }} + {{- end }} + {{- with .Values.worker.livenessProbe.timeoutSeconds }} + timeoutSeconds: {{ . }} + {{- end }} readinessProbe: - {{- toYaml .Values.worker.readinessProbe | nindent 12 }} + grpc: + port: {{ .Values.worker.healthPort | default "8082" }} + {{- with .Values.worker.readinessProbe.initialDelaySeconds }} + initialDelaySeconds: {{ . }} + {{- end }} + {{- with .Values.worker.readinessProbe.failureThreshold }} + failureThreshold: {{ . }} + {{- end }} + {{- with .Values.worker.readinessProbe.periodSeconds }} + periodSeconds: {{ . 
}} + {{- end }} + {{- with .Values.worker.readinessProbe.timeoutSeconds }} + timeoutSeconds: {{ . }} + {{- end }} + {{- with .Values.worker.readinessProbe.successThreshold }} + successThreshold: {{ . }} + {{- end }} env: - name: NODE_NAME valueFrom: @@ -70,24 +100,19 @@ spec: command: - "nfd-worker" args: -{{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} - - "-server={{ include "node-feature-discovery.fullname" . }}-master:{{ .Values.master.service.port }}" -{{- end }} -{{- if .Values.tls.enable }} - - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" -{{- end }} -# Go over featureGate and add the feature-gate flag -{{- range $key, $value := .Values.featureGates }} + # Go over featureGate and add the feature-gate flag + {{- range $key, $value := .Values.featureGates }} - "-feature-gates={{ $key }}={{ $value }}" -{{- end }} + {{- end }} - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" - - "-grpc-health={{ .Values.worker.healthPort | default "8082" }}" + - "-grpc-health={{ .Values.worker.healthPort | default "8082" }}" + {{- with .Values.worker.extraArgs }} + {{- toYaml . 
| nindent 8 }} + {{- end }} ports: - containerPort: {{ .Values.worker.metricsPort | default "8081"}} name: metrics - - containerPort: {{ .Values.worker.healthPort | default "8082" }} + - containerPort: {{ .Values.worker.healthPort | default "8082" }} name: health volumeMounts: - name: host-boot @@ -113,20 +138,12 @@ spec: mountPath: "/host-usr/src" readOnly: true {{- end }} - - name: source-d - mountPath: "/etc/kubernetes/node-feature-discovery/source.d/" - readOnly: true - name: features-d mountPath: "/etc/kubernetes/node-feature-discovery/features.d/" readOnly: true - name: nfd-worker-conf mountPath: "/etc/kubernetes/node-feature-discovery" readOnly: true -{{- if .Values.tls.enable }} - - name: nfd-worker-cert - mountPath: "/etc/kubernetes/node-feature-discovery/certs" - readOnly: true -{{- end }} volumes: - name: host-boot hostPath: @@ -151,9 +168,6 @@ spec: hostPath: path: "/usr/src" {{- end }} - - name: source-d - hostPath: - path: "/etc/kubernetes/node-feature-discovery/source.d/" - name: features-d hostPath: path: "/etc/kubernetes/node-feature-discovery/features.d/" @@ -163,12 +177,7 @@ spec: items: - key: nfd-worker.conf path: nfd-worker.conf -{{- if .Values.tls.enable }} - - name: nfd-worker-cert - secret: - secretName: nfd-worker-cert -{{- end }} - {{- with .Values.worker.nodeSelector }} + {{- with .Values.worker.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} {{- end }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/values.yaml b/deployments/gpu-operator/charts/node-feature-discovery/values.yaml index 2d24983db..18aa7bcb5 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/values.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/values.yaml @@ -10,16 +10,14 @@ nameOverride: "" fullnameOverride: "" namespaceOverride: "" -enableNodeFeatureApi: true - featureGates: - NodeFeatureAPI: true NodeFeatureGroupAPI: false priorityClassName: "" master: enable: true + extraArgs: [] extraEnvs: [] hostNetwork: false config: ### @@ -27,10 +25,24 @@ master: # autoDefaultNs: true # extraLabelNs: ["added.ns.io","added.kubernets.io"] # denyLabelNs: ["denied.ns.io","denied.kubernetes.io"] - # resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"] # enableTaints: false # labelWhiteList: "foo" # resyncPeriod: "2h" + # restrictions: + # disableLabels: true + # disableTaints: true + # disableExtendedResources: true + # disableAnnotations: true + # allowOverwrite: false + # denyNodeFeatureLabels: true + # nodeFeatureNamespaceSelector: + # matchLabels: + # kubernetes.io/metadata.name: "node-feature-discovery" + # matchExpressions: + # - key: "kubernetes.io/metadata.name" + # operator: "In" + # values: + # - "node-feature-discovery" # klog: # addDirHeader: false # alsologtostderr: false @@ -54,10 +66,6 @@ master: # retryPeriod: 2s # nfdApiParallelism: 10 ### - # The TCP port that nfd-master listens for incoming requests. 
Default: 8080 - # Deprecated this parameter is related to the deprecated gRPC API and will - # be removed with it in a future release - port: 8080 metricsPort: 8081 healthPort: 8082 instance: @@ -65,9 +73,7 @@ master: resyncPeriod: denyLabelNs: [] extraLabelNs: [] - resourceLabels: [] enableTaints: false - crdController: null featureRulesController: null nfdApiParallelism: null deploymentAnnotations: {} @@ -92,17 +98,13 @@ master: # The name of the service account to use. # If not set and create is true, a name is generated using the fullname template name: - + # specify how many old ReplicaSets for the Deployment to retain. - revisionHistoryLimit: + revisionHistoryLimit: rbac: create: true - service: - type: ClusterIP - port: 8080 - resources: limits: memory: 4Gi @@ -145,27 +147,37 @@ master: operator: In values: [""] + startupProbe: + grpc: + port: 8082 + failureThreshold: 30 + # periodSeconds: 10 livenessProbe: grpc: port: 8082 - initialDelaySeconds: 10 # failureThreshold: 3 + # initialDelaySeconds: 0 # periodSeconds: 10 + # timeoutSeconds: 1 readinessProbe: grpc: port: 8082 - initialDelaySeconds: 5 failureThreshold: 10 + # initialDelaySeconds: 0 # periodSeconds: 10 + # timeoutSeconds: 1 + # successThreshold: 1 worker: enable: true + extraArgs: [] extraEnvs: [] hostNetwork: false config: ### #core: # labelWhiteList: # noPublish: false + # noOwnerRefs: false # sleepInterval: 60s # featureSources: [all] # labelSources: [all] @@ -242,8 +254,6 @@ worker: # - "class" # - "vendor" # - "device" - # local: - # hooksEnabled: false # custom: # # The following feature demonstrates the capabilities of the matchFeatures # - name: "my custom rule" @@ -426,12 +436,15 @@ worker: initialDelaySeconds: 10 # failureThreshold: 3 # periodSeconds: 10 + # timeoutSeconds: 1 readinessProbe: grpc: port: 8082 initialDelaySeconds: 5 failureThreshold: 10 # periodSeconds: 10 + # timeoutSeconds: 1 + # successThreshold: 1 serviceAccount: # Specifies whether a service account should be created. 
@@ -483,6 +496,7 @@ topologyUpdater: enable: false createCRDs: false + extraArgs: [] extraEnvs: [] hostNetwork: false @@ -519,12 +533,15 @@ topologyUpdater: initialDelaySeconds: 10 # failureThreshold: 3 # periodSeconds: 10 + # timeoutSeconds: 1 readinessProbe: grpc: port: 8082 initialDelaySeconds: 5 failureThreshold: 10 # periodSeconds: 10 + # timeoutSeconds: 1 + # successThreshold: 1 resources: limits: @@ -542,6 +559,7 @@ topologyUpdater: gc: enable: true + extraArgs: [] extraEnvs: [] hostNetwork: false replicaCount: 1 @@ -573,19 +591,7 @@ gc: affinity: {} # specify how many old ReplicaSets for the Deployment to retain. - revisionHistoryLimit: - -# Optionally use encryption for worker <--> master comms -# TODO: verify hostname is not yet supported -# -# If you do not enable certManager (and have it installed) you will -# need to manually, or otherwise, provision the TLS certs as secrets -tls: - enable: false - certManager: false - certManagerCertificate: - issuerKind: - issuerName: + revisionHistoryLimit: prometheus: enable: false