From 5d73d7fb8fe574df643f0034a68b172c7c472ba1 Mon Sep 17 00:00:00 2001 From: chipzoller Date: Thu, 22 Aug 2024 08:34:56 -0400 Subject: [PATCH] config custom metrics Signed-off-by: chipzoller finalize Signed-off-by: Chip Zoller fix nil pointer Signed-off-by: Chip Zoller newline Signed-off-by: chipzoller empty Signed-off-by: chipzoller change if Signed-off-by: chipzoller fix Signed-off-by: chipzoller --- .../gpu-operator/templates/clusterpolicy.yaml | 5 +++-- .../templates/dcgm_exporter_config.yaml | 14 ++++++++++++++ deployments/gpu-operator/values.yaml | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 deployments/gpu-operator/templates/dcgm_exporter_config.yaml diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 259082773..af9e87c38 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -511,8 +511,9 @@ spec: {{- if .Values.dcgmExporter.args }} args: {{ toYaml .Values.dcgmExporter.args | nindent 6 }} {{- end }} - {{- if .Values.dcgmExporter.config }} - config: {{ toYaml .Values.dcgmExporter.config | nindent 6 }} + {{- if and (.Values.dcgmExporter.config) (.Values.dcgmExporter.config.name) }} + config: + name: {{ .Values.dcgmExporter.config.name }} {{- end }} {{- if .Values.dcgmExporter.serviceMonitor }} serviceMonitor: {{ toYaml .Values.dcgmExporter.serviceMonitor | nindent 6 }} diff --git a/deployments/gpu-operator/templates/dcgm_exporter_config.yaml b/deployments/gpu-operator/templates/dcgm_exporter_config.yaml new file mode 100644 index 000000000..c4bf6dcc8 --- /dev/null +++ b/deployments/gpu-operator/templates/dcgm_exporter_config.yaml @@ -0,0 +1,14 @@ +{{- if .Values.dcgmExporter.config }} +{{- if and (.Values.dcgmExporter.config.create) (not (empty .Values.dcgmExporter.config.data)) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ 
.Values.dcgmExporter.config.name }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gpu-operator.labels" . | nindent 4 }} +data: + dcgm-metrics.csv: | +{{- .Values.dcgmExporter.config.data | nindent 4 }} +{{- end }} +{{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index abb5e958b..e04dd760b 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -332,7 +332,26 @@ dcgmExporter: # target_label: instance # replacement: $1 # action: replace + # DCGM Exporter configuration + # This block is used to configure DCGM Exporter to emit a customized list of metrics. + # Use "name" to either point to an existing ConfigMap or to create a new one with a + # list of configurations (i.e., with create=true). + # When pointing to an existing ConfigMap, the ConfigMap must exist in the same namespace as the release. + # The metrics are expected to be listed under a key called `dcgm-metrics.csv`. + # Use "data" to build an integrated ConfigMap from a set of custom metrics as + # part of the chart. An example of some custom metrics is shown below. Note that + # the contents of "data" must be in CSV format and be valid DCGM Exporter metric configurations. + # config: + # name: custom-dcgm-exporter-metrics + # create: true + # data: |- + # Format + # If line starts with a '#' it is considered a comment + # DCGM FIELD, Prometheus metric type, help message + # Clocks + # DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). + # DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). gfd: enabled: true repository: nvcr.io/nvidia