Skip to content

Commit 520f392

Browse files
authored
Merge pull request #99 from frezes/feat/updateRecordingRules
[wiztelemetry-monitoring-helper] add kubelet rules
2 parents 507df3b + d6d8728 commit 520f392

File tree

9 files changed

+135
-11
lines changed

9 files changed

+135
-11
lines changed

charts/wiztelemetry-monitoring-helper/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.8.0
18+
version: 0.9.0
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.16.0"
24+
appVersion: "1.2.0"

charts/wiztelemetry-monitoring-helper/hack/sync_prometheus_rules.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,13 @@ def new_representer(dumper, data):
132132
# Additional conditions map
133133
condition_map = {
134134
# wiztelemetry rules
135-
'wiztelemetry-apiserver.rules': ' .Values.defaultRules.rules.wiztelemetry.apiserver',
136135
'wiztelemetry-cluster.rules': ' .Values.defaultRules.rules.wiztelemetry.cluster',
137136
'wiztelemetry-workload.rules': ' .Values.defaultRules.rules.wiztelemetry.workload',
138137
'wiztelemetry-pod.rules': ' .Values.defaultRules.rules.wiztelemetry.pod',
139138
'wiztelemetry-node.rules': ' .Values.defaultRules.rules.wiztelemetry.node',
139+
'wiztelemetry-kubelet.rules': ' .Values.defaultRules.rules.wiztelemetry.kubelet',
140140
'wiztelemetry-etcd.rules': ' .Values.defaultRules.rules.wiztelemetry.etcd',
141+
'wiztelemetry-apiserver.rules': ' .Values.defaultRules.rules.wiztelemetry.apiserver',
141142
'wiztelemetry-kube-scheduler.rules': ' .Values.defaultRules.rules.wiztelemetry.scheduler',
142143
'wiztelemetry-ascend-npu.rules': ' .Values.defaultRules.rules.gpuDevice.ascendNPU',
143144
'wiztelemetry-cambricon-mlu.rules': ' .Values.defaultRules.rules.gpuDevice.cambriconMLU',

charts/wiztelemetry-monitoring-helper/templates/wiztelemetry-rules/wiztelemetry-kubelet.rules.yaml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
{{- /*
2+
Generated from 'wiztelemetry-kubelet.rules' group from file://../../../ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml
3+
Do not change in-place! In order to change this file first read following link:
4+
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
5+
*/ -}}
6+
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
7+
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.wiztelemetry.kubelet }}
8+
apiVersion: monitoring.coreos.com/v1
9+
kind: PrometheusRule
10+
metadata:
11+
name: {{ printf "%s-%s" (include "wiztelemetry-monitoring-helper.fullname" .) "wiztelemetry-kubelet.rules" | trunc 63 | trimSuffix "-" }}
12+
namespace: {{ template "wiztelemetry-monitoring-helper.namespace" . }}
13+
labels:
14+
app: {{ template "wiztelemetry-monitoring-helper.name" . }}
15+
{{ include "wiztelemetry-monitoring-helper.labels" . | indent 4 }}
16+
{{- if .Values.defaultRules.labels }}
17+
{{ toYaml .Values.defaultRules.labels | indent 4 }}
18+
{{- end }}
19+
{{- if .Values.defaultRules.annotations }}
20+
annotations:
21+
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
22+
{{- end }}
23+
spec:
24+
groups:
25+
- name: wiztelemetry-kubelet.rules
26+
rules:
27+
- expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le)(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
28+
labels:
29+
quantile: '0.99'
30+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.kubelet }}
31+
{{- with .Values.defaultRules.additionalRuleLabels }}
32+
{{- toYaml . | nindent 8 }}
33+
{{- end }}
34+
{{- with .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.kubelet }}
35+
{{- toYaml . | nindent 8 }}
36+
{{- end }}
37+
{{- end }}
38+
record: node_quantile:kubelet_pod_worker_duration_seconds:histogram_quantile
39+
- expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, operation_type, le)(rate(kubelet_runtime_operations_duration_seconds_bucket{job="kubelet"}[5m])) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
40+
labels:
41+
quantile: '0.99'
42+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.kubelet }}
43+
{{- with .Values.defaultRules.additionalRuleLabels }}
44+
{{- toYaml . | nindent 8 }}
45+
{{- end }}
46+
{{- with .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.kubelet }}
47+
{{- toYaml . | nindent 8 }}
48+
{{- end }}
49+
{{- end }}
50+
record: node_quantile:kubelet_runtime_operations_duration_seconds:histogram_quantile
51+
- expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, operation_name, volume_plugin, le)(rate(storage_operation_duration_seconds_bucket{job="kubelet"}[5m])) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
52+
labels:
53+
quantile: '0.99'
54+
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.kubelet }}
55+
{{- with .Values.defaultRules.additionalRuleLabels }}
56+
{{- toYaml . | nindent 8 }}
57+
{{- end }}
58+
{{- with .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.kubelet }}
59+
{{- toYaml . | nindent 8 }}
60+
{{- end }}
61+
{{- end }}
62+
record: node_quantile:storage_operation_duration_seconds:histogram_quantile
63+
{{- end }}

charts/wiztelemetry-monitoring-helper/templates/wiztelemetry-rules/wiztelemetry-workload.rules.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ spec:
162162
{{- end }}
163163
{{- end }}
164164
record: namespace_workload:workload_created:relabel
165-
- expr: label_replace(kube_deployment_status_replicas{job="kube-state-metrics"},"workload", "$1", "deployment", "(.*)")
165+
- expr: label_replace(kube_deployment_spec_replicas{job="kube-state-metrics"},"workload", "$1", "deployment", "(.*)")
166166
labels:
167167
workload_type: deployment
168168
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.workload }}
@@ -198,7 +198,7 @@ spec:
198198
{{- end }}
199199
{{- end }}
200200
record: namespace_workload:workload_created:relabel
201-
- expr: label_replace(kube_statefulset_status_replicas{job="kube-state-metrics"},"workload", "$1", "statefulset", "(.*)")
201+
- expr: label_replace(kube_statefulset_replicas{job="kube-state-metrics"},"workload", "$1", "statefulset", "(.*)")
202202
labels:
203203
workload_type: statefulset
204204
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.wiztelemetry.workload }}

ks-prometheus/components/wiztelemetry-mixin/rules/custom.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@
536536
{
537537
record: 'namespace_workload:workload_replicas:relabel',
538538
expr: |||
539-
label_replace(kube_deployment_status_replicas{job="kube-state-metrics"},"workload", "$1", "deployment", "(.*)")
539+
label_replace(kube_deployment_spec_replicas{job="kube-state-metrics"},"workload", "$1", "deployment", "(.*)")
540540
||| % $._config,
541541
labels: {
542542
workload_type: 'deployment',
@@ -563,7 +563,7 @@
563563
{
564564
record: 'namespace_workload:workload_replicas:relabel',
565565
expr: |||
566-
label_replace(kube_statefulset_status_replicas{job="kube-state-metrics"},"workload", "$1", "statefulset", "(.*)")
566+
label_replace(kube_statefulset_replicas{job="kube-state-metrics"},"workload", "$1", "statefulset", "(.*)")
567567
||| % $._config,
568568
labels: {
569569
workload_type: 'statefulset',

ks-prometheus/components/wiztelemetry-mixin/rules/kubelet.libsonnet

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{  // Recording rules: p99 kubelet latency histograms, joined with kubelet_node_name to attach the `node` label.
2+
_config+:: {
3+
kubeletSelector: 'job="kubelet"',  // label matcher applied to every kubelet series selected below
4+
quantileP99: "0.99",  // single source for the quantile used in each expr and in each `quantile` label
5+
},
6+
7+
prometheusRules+:: {
8+
groups+: [
9+
{
10+
name: 'wiztelemetry-kubelet.rules',
11+
rules: [
12+
{
13+
record: 'node_quantile:kubelet_pod_worker_duration_seconds:histogram_quantile',
14+
expr: |||
15+
histogram_quantile(%(quantileP99)s, sum by (%(clusterLabel)s, instance, le)(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s})
16+
||| % $._config,
17+
labels: {
18+
quantile: $._config.quantileP99,  // read from config so the label cannot drift from the quantile in expr
19+
}
20+
},
21+
{
22+
record: 'node_quantile:kubelet_runtime_operations_duration_seconds:histogram_quantile',
23+
expr: |||
24+
histogram_quantile(%(quantileP99)s, sum by (%(clusterLabel)s, instance, operation_type, le)(rate(kubelet_runtime_operations_duration_seconds_bucket{%(kubeletSelector)s}[5m])) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s})
25+
||| % $._config,
26+
labels: {
27+
quantile: $._config.quantileP99,  // read from config so the label cannot drift from the quantile in expr
28+
}
29+
},
30+
{
31+
record: 'node_quantile:storage_operation_duration_seconds:histogram_quantile',
32+
expr: |||
33+
histogram_quantile(%(quantileP99)s, sum by (%(clusterLabel)s, instance, operation_name, volume_plugin, le)(rate(storage_operation_duration_seconds_bucket{%(kubeletSelector)s}[5m])) * on(%(clusterLabel)s, instance) group_left(node) kubelet_node_name{%(kubeletSelector)s})
34+
||| % $._config,
35+
labels: {
36+
quantile: $._config.quantileP99,  // read from config so the label cannot drift from the quantile in expr
37+
}
38+
},
39+
],
40+
},
41+
],
42+
},
43+
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
(import 'custom.libsonnet') + (import 'gpu.libsonnet')
1+
(import 'custom.libsonnet') + (import 'gpu.libsonnet') + (import 'kubelet.libsonnet')

ks-prometheus/jsonnetfile.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"subdir": "jsonnet/kube-prometheus"
99
}
1010
},
11-
"version": "v0.13.0"
11+
"version": "v0.14.0"
1212
}
1313
],
1414
"legacyImports": true

ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ spec:
381381
workload_type: deployment
382382
record: namespace_workload:workload_created:relabel
383383
- expr: |
384-
label_replace(kube_deployment_status_replicas{job="kube-state-metrics"},"workload", "$1", "deployment", "(.*)")
384+
label_replace(kube_deployment_spec_replicas{job="kube-state-metrics"},"workload", "$1", "deployment", "(.*)")
385385
labels:
386386
workload_type: deployment
387387
record: namespace_workload:workload_replicas:relabel
@@ -396,7 +396,7 @@ spec:
396396
workload_type: statefulset
397397
record: namespace_workload:workload_created:relabel
398398
- expr: |
399-
label_replace(kube_statefulset_status_replicas{job="kube-state-metrics"},"workload", "$1", "statefulset", "(.*)")
399+
label_replace(kube_statefulset_replicas{job="kube-state-metrics"},"workload", "$1", "statefulset", "(.*)")
400400
labels:
401401
workload_type: statefulset
402402
record: namespace_workload:workload_replicas:relabel
@@ -960,3 +960,20 @@ spec:
960960
kube_node_status_allocatable{job="kube-state-metrics",resource="huawei_com_Ascend910"}
961961
)
962962
record: node:node_gpu_num:sum
963+
- name: wiztelemetry-kubelet.rules
964+
rules:
965+
- expr: |
966+
histogram_quantile(0.99, sum by (cluster, instance, le)(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
967+
labels:
968+
quantile: "0.99"
969+
record: node_quantile:kubelet_pod_worker_duration_seconds:histogram_quantile
970+
- expr: |
971+
histogram_quantile(0.99, sum by (cluster, instance, operation_type, le)(rate(kubelet_runtime_operations_duration_seconds_bucket{job="kubelet"}[5m])) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
972+
labels:
973+
quantile: "0.99"
974+
record: node_quantile:kubelet_runtime_operations_duration_seconds:histogram_quantile
975+
- expr: |
976+
histogram_quantile(0.99, sum by (cluster, instance, operation_name, volume_plugin, le)(rate(storage_operation_duration_seconds_bucket{job="kubelet"}[5m])) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"})
977+
labels:
978+
quantile: "0.99"
979+
record: node_quantile:storage_operation_duration_seconds:histogram_quantile

0 commit comments

Comments
 (0)