Skip to content

Commit e6e80a2

Browse files
authored
Merge pull request #107 from frezes/fix
[wiztelemetry-monitoring-helper] fix npu recording rules
2 parents 6faefb4 + b28efc4 commit e6e80a2

File tree

4 files changed: 16 additions and 16 deletions

charts/wiztelemetry-monitoring-helper/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.10.1
18+
version: 0.10.2
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/wiztelemetry-monitoring-helper/templates/wiztelemetry-rules/wiztelemetry-ascend-npu.rules.yaml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ spec:
126126
{{- end }}
127127
- expr: |-
128128
label_replace(
129-
label_replace(npu_chip_info_hbm_used_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
129+
label_replace(npu_chip_info_used_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
130130
"device_name",
131131
"$1",
132132
"model_name",
@@ -144,7 +144,7 @@ spec:
144144
{{- end }}
145145
- expr: |-
146146
label_replace(
147-
label_replace(npu_chip_info_hbm_total_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
147+
label_replace(npu_chip_info_total_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
148148
"device_name",
149149
"$1",
150150
"model_name",
@@ -163,7 +163,7 @@ spec:
163163
- expr: |-
164164
label_replace(
165165
label_replace(
166-
npu_chip_info_hbm_used_memory{job="npu-exporter"} / npu_chip_info_hbm_total_memory{job="npu-exporter"},
166+
npu_chip_info_used_memory{job="npu-exporter"} / npu_chip_info_total_memory{job="npu-exporter"},
167167
"device_num",
168168
"npu${1}",
169169
"id",
@@ -204,7 +204,7 @@ spec:
204204
{{- end }}
205205
- expr: |-
206206
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
207-
kube_pod_container_resource_requests{job="kube-state-metrics",resource="huawei_com_Ascend910"}
207+
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"huawei_com_Ascend(.*)"}
208208
)
209209
record: node:node_gpu_allocated_num:sum
210210
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.ascendNPU }}
@@ -218,7 +218,7 @@ spec:
218218
{{- end }}
219219
- expr: |-
220220
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
221-
kube_node_status_allocatable{job="kube-state-metrics",resource="huawei_com_Ascend910"}
221+
kube_node_status_allocatable{job="kube-state-metrics",resource=~"huawei_com_Ascend(.*)"}
222222
)
223223
record: node:node_gpu_num:sum
224224
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.gpuDevice.ascendNPU }}

ks-prometheus/components/wiztelemetry-mixin/rules/gpu.libsonnet

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -426,7 +426,7 @@
426426
record: 'node:gpu_device:gpu_memory_used_bytes',
427427
expr: |||
428428
label_replace(
429-
label_replace(npu_chip_info_hbm_used_memory{%(ascendNPUMonitoringSelector)s}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
429+
label_replace(npu_chip_info_used_memory{%(ascendNPUMonitoringSelector)s}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
430430
"device_name",
431431
"$1",
432432
"model_name",
@@ -438,7 +438,7 @@
438438
record: 'node:gpu_device:gpu_memory_total_bytes',
439439
expr: |||
440440
label_replace(
441-
label_replace(npu_chip_info_hbm_total_memory{%(ascendNPUMonitoringSelector)s}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
441+
label_replace(npu_chip_info_total_memory{%(ascendNPUMonitoringSelector)s}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
442442
"device_name",
443443
"$1",
444444
"model_name",
@@ -451,7 +451,7 @@
451451
expr: |||
452452
label_replace(
453453
label_replace(
454-
npu_chip_info_hbm_used_memory{%(ascendNPUMonitoringSelector)s} / npu_chip_info_hbm_total_memory{%(ascendNPUMonitoringSelector)s},
454+
npu_chip_info_used_memory{%(ascendNPUMonitoringSelector)s} / npu_chip_info_total_memory{%(ascendNPUMonitoringSelector)s},
455455
"device_num",
456456
"npu${1}",
457457
"id",
@@ -480,15 +480,15 @@
480480
record: 'node:node_gpu_allocated_num:sum',
481481
expr: |||
482482
sum by (%(clusterLabel)s, node) (
483-
kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s,resource="huawei_com_Ascend910"}
483+
kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s,resource=~"huawei_com_Ascend(.*)"}
484484
)
485485
||| % $._config,
486486
},
487487
{
488488
record: 'node:node_gpu_num:sum',
489489
expr: |||
490490
sum by(%(clusterLabel)s, node) (
491-
kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="huawei_com_Ascend910"}
491+
kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource=~"huawei_com_Ascend(.*)"}
492492
)
493493
||| % $._config,
494494
},

ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -910,7 +910,7 @@ spec:
910910
record: node:gpu_device:gpu_power_usage
911911
- expr: |
912912
label_replace(
913-
label_replace(npu_chip_info_hbm_used_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
913+
label_replace(npu_chip_info_used_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
914914
"device_name",
915915
"$1",
916916
"model_name",
@@ -919,7 +919,7 @@ spec:
919919
record: node:gpu_device:gpu_memory_used_bytes
920920
- expr: |
921921
label_replace(
922-
label_replace(npu_chip_info_hbm_total_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
922+
label_replace(npu_chip_info_total_memory{job="npu-exporter"}, "device_num", "npu${1}", "id", "(.*)") * 1024 * 1024,
923923
"device_name",
924924
"$1",
925925
"model_name",
@@ -929,7 +929,7 @@ spec:
929929
- expr: |
930930
label_replace(
931931
label_replace(
932-
npu_chip_info_hbm_used_memory{job="npu-exporter"} / npu_chip_info_hbm_total_memory{job="npu-exporter"},
932+
npu_chip_info_used_memory{job="npu-exporter"} / npu_chip_info_total_memory{job="npu-exporter"},
933933
"device_num",
934934
"npu${1}",
935935
"id",
@@ -952,12 +952,12 @@ spec:
952952
record: node:gpu_device:gpu_utilization
953953
- expr: |
954954
sum by (cluster, node) (
955-
kube_pod_container_resource_requests{job="kube-state-metrics",resource="huawei_com_Ascend910"}
955+
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"huawei_com_Ascend(.*)"}
956956
)
957957
record: node:node_gpu_allocated_num:sum
958958
- expr: |
959959
sum by(cluster, node) (
960-
kube_node_status_allocatable{job="kube-state-metrics",resource="huawei_com_Ascend910"}
960+
kube_node_status_allocatable{job="kube-state-metrics",resource=~"huawei_com_Ascend(.*)"}
961961
)
962962
record: node:node_gpu_num:sum
963963
- name: wiztelemetry-kubelet.rules

0 commit comments

Comments
 (0)