diff --git a/deployments/gpu-operator/Chart.lock b/deployments/gpu-operator/Chart.lock index 497649e71..81e3b7e3a 100644 --- a/deployments/gpu-operator/Chart.lock +++ b/deployments/gpu-operator/Chart.lock @@ -1,6 +1,6 @@ dependencies: - name: node-feature-discovery repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts - version: 0.15.4 -digest: sha256:025d011a09907e4b274e550b36689c47ed12722543624debd1e7d709d70462d8 -generated: "2024-04-05T09:13:06.247476-07:00" + version: 0.16.0 +digest: sha256:a906ecd78195a74662c8df45db70ff4093fd89104a345baf40f9a5c0ae6f29c2 +generated: "2024-05-29T09:57:11.533324-07:00" diff --git a/deployments/gpu-operator/Chart.yaml b/deployments/gpu-operator/Chart.yaml index a6083cdbe..b3310b871 100644 --- a/deployments/gpu-operator/Chart.yaml +++ b/deployments/gpu-operator/Chart.yaml @@ -19,6 +19,6 @@ keywords: dependencies: - name: node-feature-discovery - version: v0.15.4 + version: v0.16.0 repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts condition: nfd.enabled diff --git a/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml b/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml index a57e62eda..547351297 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: v0.15.4 +appVersion: v0.16.0 description: 'Detects hardware features available on each node in a Kubernetes cluster, and advertises those features using node labels. 
' home: https://github.com/kubernetes-sigs/node-feature-discovery @@ -11,4 +11,4 @@ name: node-feature-discovery sources: - https://github.com/kubernetes-sigs/node-feature-discovery type: application -version: 0.15.4 +version: 0.16.0 diff --git a/deployments/gpu-operator/charts/node-feature-discovery/README.md b/deployments/gpu-operator/charts/node-feature-discovery/README.md index b8b7d90ca..93734f8b7 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/README.md +++ b/deployments/gpu-operator/charts/node-feature-discovery/README.md @@ -6,5 +6,5 @@ labels. NFD provides flexible configuration and extension points for a wide range of vendor and application specific node labeling needs. See -[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.15/deployment/helm.html) +[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html) for deployment instructions. diff --git a/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml b/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml index 4e6304163..0a73c5dca 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.12.1 + controller-gen.kubebuilder.io/version: v0.14.0 name: nodefeatures.nfd.k8s-sigs.io spec: group: nfd.k8s-sigs.io @@ -17,23 +17,30 @@ spec: - name: v1alpha1 schema: openAPIV3Schema: - description: NodeFeature resource holds the features discovered for one node - in the cluster. + description: |- + NodeFeature resource holds the features discovered for one node in the + cluster. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. 
Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object spec: - description: NodeFeatureSpec describes a NodeFeature object. + description: Specification of the NodeFeature, containing features discovered + for a node. properties: features: description: Features is the full "raw" features data that has been @@ -47,6 +54,7 @@ spec: elements: additionalProperties: type: string + description: Individual features of the feature set. type: object required: - elements @@ -64,6 +72,7 @@ spec: description: Nil is a dummy empty struct for protobuf compatibility type: object + description: Individual features of the feature set. type: object required: - elements @@ -77,6 +86,7 @@ spec: which is an instance having multiple attributes. properties: elements: + description: Individual features of the feature set. 
items: description: InstanceFeature represents one instance of a complex features, e.g. a device. @@ -84,6 +94,7 @@ spec: attributes: additionalProperties: type: string + description: Attributes of the instance feature. type: object required: - attributes @@ -113,7 +124,278 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.12.1 + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeaturegroups.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureGroup + listKind: NodeFeatureGroupList + plural: nodefeaturegroups + shortNames: + - nfg + singular: nodefeaturegroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeFeatureGroup resource holds Node pools by featureGroup + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + featureGroupRules: + description: List of rules to evaluate to determine nodes that belong + in this group. + items: + description: GroupRule defines a rule for nodegroup filtering. + properties: + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. 
+ items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. + properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. 
+ enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. 
+ items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + required: + - name + type: object + type: array + required: + - featureGroupRules + type: object + status: + description: |- + Status of the NodeFeatureGroup after the most recent evaluation of the + specification. + properties: + nodes: + description: Nodes is a list of FeatureGroupNode in the cluster that + match the featureGroupRules + items: + properties: + name: + description: Name of the node. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 name: nodefeaturerules.nfd.k8s-sigs.io spec: group: nfd.k8s-sigs.io @@ -129,23 +411,29 @@ spec: - name: v1alpha1 schema: openAPIV3Schema: - description: NodeFeatureRule resource specifies a configuration for feature-based + description: |- + NodeFeatureRule resource specifies a configuration for feature-based customization of node objects, such as node labeling. properties: apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object spec: - description: NodeFeatureRuleSpec describes a NodeFeatureRule. + description: Spec defines the rules to be evaluated. properties: rules: description: Rules is a list of node customization rules. @@ -169,10 +457,10 @@ spec: description: Labels to create if the rule matches. type: object labelsTemplate: - description: LabelsTemplate specifies a template to expand for - dynamically generating multiple labels. Data (after template - expansion) must be keys with an optional value ([=]) - separated by newlines. + description: |- + LabelsTemplate specifies a template to expand for dynamically generating + multiple labels. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. type: string matchAny: description: MatchAny specifies a list of matchers one of which @@ -184,10 +472,10 @@ spec: description: MatchFeatures specifies a set of matcher terms all of which must match. items: - description: FeatureMatcherTerm defines requirements - against one feature set. All requirements (specified - as MatchExpressions) are evaluated against each element - in the feature set. + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. properties: feature: description: Feature is the name of the feature @@ -195,11 +483,10 @@ spec: type: string matchExpressions: additionalProperties: - description: MatchExpression specifies an expression - to evaluate against a set of input values. It - contains an operator that is applied when matching - the input and an array of values that the operator - evaluates the input against. + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. 
It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. properties: op: description: Op is the operator to be applied. @@ -216,29 +503,26 @@ spec: - IsFalse type: string value: - description: Value is the list of values that - the operand evaluates the input against. - Value should be empty if the operator is - Exists, DoesNotExist, IsTrue or IsFalse. - Value should contain exactly one element - if the operator is Gt or Lt and exactly - two elements if the operator is GtLt. In - other cases Value should contain at least - one element. + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. items: type: string type: array required: - op type: object - description: MatchExpressions is the set of per-element - expressions evaluated. These match against the - value of the specified elements. + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. type: object matchName: - description: MatchName in an expression that is - matched against the name of each element in the - feature set. + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. properties: op: description: Op is the operator to be applied. @@ -255,14 +539,12 @@ spec: - IsFalse type: string value: - description: Value is the list of values that - the operand evaluates the input against. Value - should be empty if the operator is Exists, - DoesNotExist, IsTrue or IsFalse. 
Value should - contain exactly one element if the operator - is Gt or Lt and exactly two elements if the - operator is GtLt. In other cases Value should - contain at least one element. + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. items: type: string type: array @@ -281,9 +563,10 @@ spec: description: MatchFeatures specifies a set of matcher terms all of which must match. items: - description: FeatureMatcherTerm defines requirements against - one feature set. All requirements (specified as MatchExpressions) - are evaluated against each element in the feature set. + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. properties: feature: description: Feature is the name of the feature set to @@ -291,11 +574,10 @@ spec: type: string matchExpressions: additionalProperties: - description: MatchExpression specifies an expression - to evaluate against a set of input values. It contains - an operator that is applied when matching the input - and an array of values that the operator evaluates - the input against. + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. properties: op: description: Op is the operator to be applied. @@ -312,26 +594,26 @@ spec: - IsFalse type: string value: - description: Value is the list of values that the - operand evaluates the input against. 
Value should - be empty if the operator is Exists, DoesNotExist, - IsTrue or IsFalse. Value should contain exactly - one element if the operator is Gt or Lt and exactly - two elements if the operator is GtLt. In other - cases Value should contain at least one element. + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. items: type: string type: array required: - op type: object - description: MatchExpressions is the set of per-element - expressions evaluated. These match against the value - of the specified elements. + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. type: object matchName: - description: MatchName in an expression that is matched - against the name of each element in the feature set. + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. properties: op: description: Op is the operator to be applied. @@ -348,13 +630,12 @@ spec: - IsFalse type: string value: - description: Value is the list of values that the - operand evaluates the input against. Value should - be empty if the operator is Exists, DoesNotExist, - IsTrue or IsFalse. Value should contain exactly - one element if the operator is Gt or Lt and exactly - two elements if the operator is GtLt. In other cases - Value should contain at least one element. + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. 
Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. items: type: string type: array @@ -371,21 +652,24 @@ spec: taints: description: Taints to create if the rule matches. items: - description: The node this Taint is attached to has the "effect" - on any pod that does not tolerate the Taint. + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. properties: effect: - description: Required. The effect of the taint on pods - that do not tolerate the taint. Valid effects are NoSchedule, - PreferNoSchedule and NoExecute. + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string key: description: Required. The taint key to be applied to a node. type: string timeAdded: - description: TimeAdded represents the time at which the - taint was added. It is only written for NoExecute taints. + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. format: date-time type: string value: @@ -400,17 +684,17 @@ spec: vars: additionalProperties: type: string - description: Vars is the variables to store if the rule matches. - Variables do not directly inflict any changes in the node - object. However, they can be referenced from other rules enabling - more complex rule hierarchies, without exposing intermediary - output values as labels. + description: |- + Vars is the variables to store if the rule matches. Variables do not + directly inflict any changes in the node object. However, they can be + referenced from other rules enabling more complex rule hierarchies, + without exposing intermediary output values as labels. 
type: object varsTemplate: - description: VarsTemplate specifies a template to expand for - dynamically generating multiple variables. Data (after template - expansion) must be keys with an optional value ([=]) - separated by newlines. + description: |- + VarsTemplate specifies a template to expand for dynamically generating + multiple variables. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. type: string required: - name diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml index 8af115316..2d1576022 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-certs.yaml @@ -19,8 +19,12 @@ spec: - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local issuerRef: - name: nfd-ca-issuer + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} kind: Issuer + {{- end }} group: cert-manager.io {{- end }} --- @@ -39,8 +43,12 @@ spec: dnsNames: - {{ include "node-feature-discovery.fullname" . }}-worker.{{ include "node-feature-discovery.namespace" . 
}}.svc.cluster.local issuerRef: - name: nfd-ca-issuer + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} kind: Issuer + {{- end }} group: cert-manager.io {{- end }} @@ -60,8 +68,12 @@ spec: dnsNames: - {{ include "node-feature-discovery.fullname" . }}-topology-updater.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local issuerRef: - name: nfd-ca-issuer + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} kind: Issuer + {{- end }} group: cert-manager.io {{- end }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml index f3c57acea..874468908 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/cert-manager-issuer.yaml @@ -1,4 +1,4 @@ -{{- if .Values.tls.certManager }} +{{- if and .Values.tls.certManager (not .Values.tls.certManagerCertificate.issuerName ) }} # See https://cert-manager.io/docs/configuration/selfsigned/#bootstrapping-ca-issuers # - Create a self signed issuer # - Use this to create a CA cert diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml index e652e1df8..f935cfe41 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml +++ 
b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrole.yaml @@ -21,10 +21,18 @@ rules: resources: - nodefeatures - nodefeaturerules + - nodefeaturegroups verbs: - get - list - watch +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeaturegroups/status + verbs: + - patch + - update - apiGroups: - coordination.k8s.io resources: @@ -58,6 +66,12 @@ rules: verbs: - get - list +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get - apiGroups: - "" resources: @@ -80,7 +94,7 @@ rules: - update {{- end }} -{{- if and .Values.gc.enable .Values.gc.rbac.create (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) }} +{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml index 99134a1c5..3f717988b 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/clusterrolebinding.yaml @@ -33,7 +33,7 @@ subjects: namespace: {{ include "node-feature-discovery.namespace" . 
}} {{- end }} -{{- if and .Values.gc.enable .Values.gc.rbac.create (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) }} +{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml index 53a291e0f..3a584209e 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/master.yaml @@ -27,6 +27,9 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} spec: + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -43,12 +46,12 @@ spec: imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: grpc: - port: 8080 + port: 8082 initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: grpc: - port: 8080 + port: 8082 initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 @@ -70,9 +73,8 @@ spec: {{- if .Values.master.instance | empty | not }} - "-instance={{ .Values.master.instance }}" {{- end }} - {{- if not .Values.enableNodeFeatureApi }} + {{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} - "-port={{ .Values.master.port | default "8080" }}" - - "-enable-nodefeature-api=false" {{- else if gt (int .Values.master.replicaCount) 1 }} - "-enable-leader-election" {{- end }} @@ -108,6 +110,10 @@ spec: - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" {{- end }} + # Go over featureGates and add the feature-gate flag + {{- range $key, $value := .Values.featureGates }} + - "-feature-gates={{ $key }}={{ 
$value }}" + {{- end }} - "-metrics={{ .Values.master.metricsPort | default "8081" }}" volumeMounts: {{- if .Values.tls.enable }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml index 1e0e12327..4f4ac76c7 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-gc.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.gc.enable (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) -}} +{{- if and .Values.gc.enable (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -29,6 +29,9 @@ spec: spec: serviceAccountName: {{ include "node-feature-discovery.gc.serviceAccountName" . }} dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . 
| nindent 8 }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml index 9867f5089..8d03aa2d8 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml @@ -1,3 +1,4 @@ +{{- if .Values.topologyUpdater.enable -}} apiVersion: v1 kind: ConfigMap metadata: @@ -8,3 +9,4 @@ metadata: data: nfd-topology-updater.conf: |- {{- .Values.topologyUpdater.config | toYaml | nindent 4 }} +{{- end }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml new file mode 100644 index 000000000..23467ea0d --- /dev/null +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/post-delete-job.yaml @@ -0,0 +1,94 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/status + verbs: + - get + - patch + - update + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }}-prune +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.fullname" . }}-prune + namespace: {{ include "node-feature-discovery.namespace" . }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-prune + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + template: + metadata: + labels: + {{- include "node-feature-discovery.labels" . | nindent 8 }} + role: prune + spec: + serviceAccountName: {{ include "node-feature-discovery.fullname" . }}-prune + containers: + - name: nfd-master + securityContext: + {{- toYaml .Values.master.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - "nfd-master" + args: + - "-prune" + {{- if .Values.master.instance | empty | not }} + - "-instance={{ .Values.master.instance }}" + {{- end }} + restartPolicy: Never + {{- with .Values.master.nodeSelector }} + nodeSelector: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.master.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.master.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml index b9f4b4640..3d680e24e 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/prometheus.yaml @@ -12,7 +12,7 @@ metadata: spec: podMetricsEndpoints: - honorLabels: true - interval: 10s + interval: {{ .Values.prometheus.scrapeInterval }} path: /metrics port: metrics scheme: http diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml index d71d1555f..7191dca70 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/service.yaml @@ -1,4 +1,4 @@ -{{- if and (not .Values.enableNodeFeatureApi) .Values.master.enable }} +{{- if and (not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi)) .Values.master.enable }} apiVersion: v1 kind: Service metadata: diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml index 7da2c877e..59edc5e6c 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/serviceaccount.yaml @@ -27,7 +27,7 @@ metadata: {{- end }} {{- end }} -{{- if and .Values.gc.enable .Values.gc.serviceAccount.create (or .Values.enableNodeFeatureApi .Values.topologyUpdater.enable) }} +{{- if and 
.Values.gc.enable .Values.gc.serviceAccount.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} --- apiVersion: v1 kind: ServiceAccount diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml index f51c10e6d..1221cfd2d 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/topologyupdater.yaml @@ -28,6 +28,9 @@ spec: spec: serviceAccountName: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -38,6 +41,17 @@ spec: - name: topology-updater image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 10 env: - name: NODE_NAME valueFrom: @@ -66,8 +80,8 @@ spec: - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" {{- end }} - {{- if .Values.topologyUpdater.podSetFingerprint }} - - "-pods-fingerprint" + {{- if not .Values.topologyUpdater.podSetFingerprint }} + - "-pods-fingerprint=false" {{- end }} {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} - "-kubelet-config-uri=file:///host-var/kubelet-config" diff --git a/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml b/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml index 
f49f9bd64..f2a2419fc 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/templates/worker.yaml @@ -27,6 +27,9 @@ spec: {{- end }} spec: dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -40,6 +43,17 @@ spec: {{- toYaml .Values.worker.securityContext | nindent 12 }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 10 env: - name: NODE_NAME valueFrom: @@ -58,14 +72,17 @@ spec: command: - "nfd-worker" args: - {{- if not .Values.enableNodeFeatureApi }} +{{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} - "-server={{ include "node-feature-discovery.fullname" . 
}}-master:{{ .Values.master.service.port }}" - - "-enable-nodefeature-api=false" - {{- end }} +{{- end }} {{- if .Values.tls.enable }} - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" +{{- end }} +# Go over featureGate and add the feature-gate flag +{{- range $key, $value := .Values.featureGates }} + - "-feature-gates={{ $key }}={{ $value }}" {{- end }} - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" ports: @@ -87,6 +104,9 @@ spec: - name: host-lib mountPath: "/host-lib" readOnly: true + - name: host-proc-swaps + mountPath: "/host-proc/swaps" + readOnly: true {{- if .Values.worker.mountUsrSrc }} - name: host-usr-src mountPath: "/host-usr/src" @@ -122,6 +142,9 @@ spec: - name: host-lib hostPath: path: "/lib" + - name: host-proc-swaps + hostPath: + path: "/proc/swaps" {{- if .Values.worker.mountUsrSrc }} - name: host-usr-src hostPath: diff --git a/deployments/gpu-operator/charts/node-feature-discovery/values.yaml b/deployments/gpu-operator/charts/node-feature-discovery/values.yaml index d4919bca8..57feca0b1 100644 --- a/deployments/gpu-operator/charts/node-feature-discovery/values.yaml +++ b/deployments/gpu-operator/charts/node-feature-discovery/values.yaml @@ -12,6 +12,12 @@ namespaceOverride: "" enableNodeFeatureApi: true +featureGates: + NodeFeatureAPI: true + NodeFeatureGroupAPI: false + +priorityClassName: "" + master: enable: true config: ### @@ -91,17 +97,18 @@ master: type: ClusterIP port: 8080 - resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
- # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi + resources: + limits: + cpu: 300m + memory: 4Gi + requests: + cpu: 100m + # You may want to use the same value for `requests.memory` and `limits.memory`. The “requests” value affects scheduling to accommodate pods on nodes. + # If there is a large difference between “requests” and “limits” and nodes experience memory pressure, the kernel may invoke + # the OOM Killer, even if the memory does not exceed the “limits” threshold. This can cause unexpected pod evictions. Memory + # cannot be compressed and once allocated to a pod, it can only be reclaimed by killing the pod. + # Natan Yellin 22/09/2022 https://home.robusta.dev/blog/kubernetes-memory-limit + memory: 128Mi nodeSelector: {} @@ -162,6 +169,7 @@ worker: # cpuid: ## NOTE: whitelist has priority over blacklist # attributeBlacklist: + # - "AVX10" # - "BMI1" # - "BMI2" # - "CLMUL" @@ -391,6 +399,20 @@ worker: runAsNonRoot: true # runAsUser: 1000 + # livenessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation. + # grpc: + # port: 8082 + # initialDelaySeconds: 10 + # periodSeconds: 10 + # readinessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation. + # grpc: + # port: 8082 + # initialDelaySeconds: 5 + # periodSeconds: 10 + # failureThreshold: 10 + serviceAccount: # Specifies whether a service account should be created. # We create this by default to make it easier for downstream users to apply PodSecurityPolicies. @@ -408,17 +430,13 @@ worker: # Does not work on systems without /usr/src AND a read-only /usr, such as Talos mountUsrSrc: false - resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. 
If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi + resources: + limits: + cpu: 200m + memory: 512Mi + requests: + cpu: 5m + memory: 64Mi nodeSelector: {} @@ -466,17 +484,27 @@ topologyUpdater: readOnlyRootFilesystem: true runAsUser: 0 - resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi + # livenessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation. + # grpc: + # port: 8082 + # initialDelaySeconds: 10 + # periodSeconds: 10 + # readinessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation. + # grpc: + # port: 8082 + # initialDelaySeconds: 5 + # periodSeconds: 10 + # failureThreshold: 10 + + resources: + limits: + cpu: 100m + memory: 60Mi + requests: + cpu: 50m + memory: 40Mi nodeSelector: {} tolerations: [] @@ -500,17 +528,13 @@ gc: podSecurityContext: {} - resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
- # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi + resources: + limits: + cpu: 20m + memory: 1Gi + requests: + cpu: 10m + memory: 128Mi metricsPort: 8081 @@ -528,7 +552,11 @@ gc: tls: enable: false certManager: false + certManagerCertificate: + issuerKind: + issuerName: prometheus: enable: false + scrapeInterval: 10s labels: {}