Skip to content

Commit 5da0355

Browse files
authored
Merge pull request kruize#1269 from shreyabiradar07/metricProfile-genRecommendations
Generate recommendations using MetricProfile Queries
2 parents cc2da86 + 499a252 commit 5da0355

File tree

3 files changed

+298
-30
lines changed

3 files changed

+298
-30
lines changed

manifests/autotune/performance-profiles/resource_optimization_local_monitoring.yaml

Lines changed: 64 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,17 @@ slo:
2323

2424
aggregation_functions:
2525
- function: 'avg'
26-
query: 'avg(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})'
26+
query: 'avg by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core" ,namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
2727

2828
# Show sum of cpu requests in bytes for a container in a deployment
2929
- function: 'sum'
30-
query: 'sum(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})'
30+
query: 'sum by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core" ,namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
3131

32+
- function: 'min'
33+
query: 'min by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core" ,namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
34+
35+
- function: 'max'
36+
query: 'max by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core" ,namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
3237

3338
# CPU Limit
3439
# Show cpu limits in bytes for a container in a deployment
@@ -39,11 +44,17 @@ slo:
3944

4045
aggregation_functions:
4146
- function: avg
42-
query: 'avg(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="cpu", unit="core"})'
47+
query: 'avg by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
4348

4449
# Show sum of cpu limits in bytes for a container in a deployment
4550
- function: sum
46-
query: 'sum(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE$", resource="cpu", unit="core"})'
51+
query: 'sum by(container,namespace) (kube_pod_container_resource_limits{container!='',container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
52+
53+
- function: 'max'
54+
query: 'max by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
55+
56+
- function: 'max'
57+
query: 'min by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="", container!="POD", pod!="", resource="cpu", unit="core",namespace="$NAMESPACE$",container="$CONTAINER_NAME$"})'
4758

4859

4960
# CPU Usage
@@ -65,45 +76,45 @@ slo:
6576
# For openshift versions <=4.8
6677
aggregation_functions:
6778
- function: avg
68-
query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=$CONTAINER_NAME$”}[15m]))'
79+
query: 'avg by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!='', container!="POD", pod!="",namespace="$NAMESPACE$",container="$CONTAINER_NAME$" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
6980
versions: "<=4.8"
7081

7182
# For openshift versions >=4.9
7283
- function: avg
73-
query: 'avg(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=$CONTAINER_NAME$”}[15m]))'
84+
query: 'avg by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!="POD", pod!="",namespace="$NAMESPACE$",container="$CONTAINER_NAME$" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
7485
versions: ">4.9"
7586

7687
# Approx minimum CPU per container in a deployment
7788
# For openshift versions <=4.8
7889
- function: min
79-
query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
90+
query: 'min by(container, namespace)(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
8091
versions: "<=4.8"
8192

8293
# For openshift versions >=4.9
8394
- function: min
84-
query: 'min(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
95+
query: 'min by(container, namespace)(min_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
8596
versions: ">4.9"
8697

8798
# Approx maximum CPU per container in a deployment
8899
# For openshift versions <=4.8
89100
- function: max
90-
query: 'max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
101+
query: 'max by(container, namespace)(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
91102
versions: "<=4.8"
92103

93104
# For openshift versions >=4.9
94105
- function: max
95-
query: 'max(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
106+
query: 'max by(container, namespace)(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
96107
versions: ">4.9"
97108

98109
# Sum of CPU usage for a container in all pods of a deployment
99110
# For openshift versions <=4.8
100111
- function: sum
101-
query: 'sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
112+
query: 'sum by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
102113
versions: "<=4.8"
103114

104115
# For openshift versions >=4.9
105116
- function: sum
106-
query: 'sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container="$CONTAINER_NAME$"}[15m]))'
117+
query: 'sum by(container, namespace)(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m]))'
107118
versions: ">4.9"
108119

109120

@@ -116,15 +127,19 @@ slo:
116127
aggregation_functions:
117128
# Average CPU throttling per container in a deployment
118129
- function: avg
119-
query: 'avg(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=$CONTAINER_NAME$”}[15m]))'
130+
query: 'avg by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
120131

121132
# Maximum CPU throttling per container in a deployment
122133
- function: max
123-
query: 'max(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=”$CONTAINER_NAME$”}[15m]))'
134+
query: 'max by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m])))'
135+
136+
# Min of CPU throttling for a container in all pods of a deployment
137+
- function: min
138+
query: 'min by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m])'
124139

125140
# Sum of CPU throttling for a container in all pods of a deployment
126141
- function: sum
127-
query: 'sum(rate(container_cpu_cfs_throttled_seconds_total{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace="$NAMESPACE$", container=$CONTAINER_NAME$”}[15m]))'
142+
query: 'sum by(container,namespace) (rate(container_cpu_cfs_throttled_seconds_total{container!='', container!="POD", pod!="",namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
128143

129144

130145

@@ -139,11 +154,17 @@ slo:
139154

140155
aggregation_functions:
141156
- function: avg
142-
query: 'avg(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})'
157+
query: 'avg by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
143158

144159
# Show sum of memory requests in bytes for a container in a deployment
145160
- function: sum
146-
query: 'sum(kube_pod_container_resource_requests{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})'
161+
query: 'sum by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
162+
163+
- function: max
164+
query: 'max by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
165+
166+
- function: min
167+
query: 'min by(container, namespace) (kube_pod_container_resource_requests{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
147168

148169

149170
# Memory Limit
@@ -155,12 +176,17 @@ slo:
155176

156177
aggregation_functions:
157178
- function: avg
158-
query: 'avg(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container="$CONTAINER_NAME$", namespace="$NAMESPACE", resource="memory", unit="byte"})'
179+
query: 'avg by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
159180

160181
# Show sum of memory limits in bytes for a container in a deployment
161182
- function: sum
162-
query: 'sum(kube_pod_container_resource_limits{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", container=”$CONTAINER_NAME$”, namespace=”$NAMESPACE”, resource="memory", unit="byte"})'
183+
query: 'sum by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
184+
185+
- function: max
186+
query: 'max by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
163187

188+
- function: min
189+
query: 'min by(container,namespace) (kube_pod_container_resource_limits{container!='', container!="POD", pod!="", resource="memory", unit="byte" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"})'
164190

165191
# Memory Usage
166192
# Average memory per container in a deployment
@@ -171,19 +197,19 @@ slo:
171197

172198
aggregation_functions:
173199
- function: avg
174-
query: 'avg(avg_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=$CONTAINER_NAME$”}[15m]))'
200+
query: 'avg by(container, namespace) (avg_over_time(container_memory_working_set_bytes{container!='', container!="POD", pod!="", namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
175201

176202
# Approx minimum memory per container in a deployment
177203
- function: min
178-
query: 'min(min_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
204+
query: 'min by(container, namespace) (min_over_time(container_memory_working_set_bytes{container!='', container!="POD", pod!="", namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\" }[$MEASUREMENT_DURATION_IN_MIN$m])'
179205

180206
# Approx maximum memory per container in a deployment
181207
- function: max
182-
query: 'max(max_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
208+
query: 'max by(container, namespace) (max_over_time(container_memory_working_set_bytes{container!='', container!="POD", pod!="", namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
183209

184210
# Sum of memory usage for a contianer in all pods of a deployment
185211
- function: sum
186-
query: 'sum(avg_over_time(container_memory_working_set_bytes{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
212+
query: 'sum by(container, namespace) (avg_over_time(container_memory_working_set_bytes{container!='', container!="POD", pod!="", namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
187213

188214

189215
# 2.4 Memory RSS
@@ -195,17 +221,28 @@ slo:
195221
aggregation_functions:
196222
# Average memory RSS per container in a deployment
197223
- function: avg
198-
query: 'avg(avg_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=$CONTAINER_NAME$”}[15m]))'
224+
query: 'avg by(container, namespace) (avg_over_time(container_memory_rss{container!='', container!="POD", pod!="" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
199225

200226
# Approx minimum memory RSS per container in a deployment
201227
- function: min
202-
query: 'min(min_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
228+
query: 'min by(container, namespace) (min_over_time(container_memory_rss{container!='', container!="POD", pod!="" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m])'
203229

204230

205231
# Approx maximum memory RSS per container in a deployment
206232
- function: max
207-
query: 'max(max_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container="$CONTAINER_NAME$"}[15m]))'
233+
query: 'max by(container, namespace) (max_over_time(container_memory_rss{container!='', container!="POD", pod!="" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
208234

209235
# Sum of memory RSS for a contianer in all pods of a deployment
210236
- function: sum
211-
query: 'sum(avg_over_time(container_memory_rss{pod=~"$DEPLOYMENT_NAME$-[^-]*-[^-]*$", namespace=$NAMESPACE$, container=”$CONTAINER_NAME$”}[15m]))'
237+
query: 'sum by(container, namespace) (avg_over_time(container_memory_rss{container!='', container!="POD", pod!="" ,namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"}[$MEASUREMENT_DURATION_IN_MIN$m]))'
238+
239+
240+
# Container Last Active Timestamp
241+
- name: maxDate
242+
datasource: prometheus
243+
value_type: "double"
244+
kubernetes_object: "container"
245+
246+
aggregation_functions:
247+
- function: max
248+
query: 'max by(namespace,container) (last_over_time((timestamp(container_cpu_usage_seconds_total{namespace=\"$NAMESPACE$\",container=\"$CONTAINER_NAME$\"} > 0))[15d:]))'

0 commit comments

Comments
 (0)