Skip to content

Commit

Permalink
Add metrics-server panels for monitoring stack ... (#121)
Browse files Browse the repository at this point in the history
... also resizes some other panels that were aggregiously large
  • Loading branch information
afcollins authored Jul 15, 2024
1 parent b85b7ae commit fd68ef1
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 6 deletions.
2 changes: 1 addition & 1 deletion assets/ocp-performance/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
'max',
'min',
])
+ options.legend.withSortBy('max')
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true)
+ options.legend.withPlacement('bottom'),
},
Expand Down
17 changes: 16 additions & 1 deletion assets/ocp-performance/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,26 @@ local generateTimeSeriesQuery(query, legend) = [
query():
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}')
},
promReplCpuUsage: {
query():
generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{pod=~"prometheus-k8s-0",namespace!="",name!="",container="prometheus"}[$interval])) by (pod,container) * 100', '{{pod}}')
+ generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{pod=~"prometheus-k8s-1",namespace!="",name!="",container="prometheus"}[$interval])) by (pod,container) * 100', '{{pod}}')
},
promReplMemUsage: {
query():
generateTimeSeriesQuery('sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)', '{{pod}}')
+ generateTimeSeriesQuery('sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)', '{{pod}}')
},
metricsServerCpuUsage: {
query():
generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{pod=~"metrics-server-.*",namespace!="",name!=""}[$interval])) by (pod,container) * 100', '{{pod}}')
+ generateTimeSeriesQuery('sum(irate(container_cpu_usage_seconds_total{pod=~"prometheus-adapter-.*",namespace="openshift-monitoring",name!=""}[$interval])) by (pod,container) * 100', '{{pod}}')
},
metricsServerMemUsage: {
query():
generateTimeSeriesQuery('sum(container_memory_rss{pod=~"metrics-server-.*",namespace!="",name!=""}) by (pod)', '{{pod}}')
+ generateTimeSeriesQuery('sum(container_memory_rss{pod=~"prometheus-adapter-.*",namespace="openshift-monitoring",name!=""}) by (pod)', '{{pod}}')
},
kubeletCPU: {
query():
generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100)', 'kubelet - {{node}}')
Expand Down Expand Up @@ -202,4 +217,4 @@ local generateTimeSeriesQuery(query, legend) = [
query():
generateTimeSeriesQuery('cluster_operator_conditions{condition="Degraded",name!="",reason!=""}', '{{name}} - {{reason}}')
},
}
}
11 changes: 7 additions & 4 deletions templates/General/ocp-performance-v2.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ g.dashboard.new('Openshift Performance')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 12 }),
panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 4 }),
panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }),
panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 21, w: 12, h: 8 }),
Expand All @@ -42,14 +42,17 @@ g.dashboard.new('Openshift Performance')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
panels.timeSeries.genericLegend('Prometheus Replica Memory usage', 'bytes', queries.promReplMemUsage.query(), { x: 0, y: 2, w: 24, h: 12 }),
panels.timeSeries.genericLegend('Prometheus Replica CPU', 'percent', queries.promReplCpuUsage.query(), { x: 0, y: 2, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Prometheus Replica RSS', 'bytes', queries.promReplMemUsage.query(), { x: 12, y: 2, w: 12, h: 8 }),
panels.timeSeries.genericLegend('metrics-server/prom-adapter CPU', 'percent', queries.metricsServerCpuUsage.query(), { x: 0, y: 10, w: 12, h: 8 }),
panels.timeSeries.genericLegend('metrics-server/prom-adapter RSS', 'bytes', queries.metricsServerMemUsage.query(), { x: 12, y: 10, w: 12, h: 8 }),
]),
g.panel.row.new('Stackrox')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
panels.timeSeries.genericLegend('Top 25 stackrox container RSS bytes', 'bytes', queries.stackroxMem.query(), { x: 0, y: 2, w: 24, h: 12 }),
panels.timeSeries.genericLegend('Top 25 stackrox container CPU percent', 'percent', queries.stackroxCPU.query(), { x: 0, y: 2, w: 24, h: 12 }),
panels.timeSeries.genericLegend('Top 25 stackrox container RSS bytes', 'bytes', queries.stackroxMem.query(), { x: 0, y: 2, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 25 stackrox container CPU percent', 'percent', queries.stackroxCPU.query(), { x: 12, y: 2, w: 12, h: 8 }),
]),
g.panel.row.new('Cluster Kubelet')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
Expand Down

0 comments on commit fd68ef1

Please sign in to comment.