From 47743c697f970563d11cee30299fc03b06970a39 Mon Sep 17 00:00:00 2001 From: Andrew Collins Date: Fri, 8 Nov 2024 11:18:49 -0600 Subject: [PATCH] Add missing cgroup memory panels ... Also adds a sum line on right axis for memory available. --- assets/ocp-performance/panels.libsonnet | 12 ++++++++++++ assets/ocp-performance/queries.libsonnet | 14 ++++++++++++-- assets/ocp-performance/variables.libsonnet | 2 +- templates/General/ocp-performance.jsonnet | 14 ++++++++------ 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/assets/ocp-performance/panels.libsonnet b/assets/ocp-performance/panels.libsonnet index 0c47f93..b37cb0c 100644 --- a/assets/ocp-performance/panels.libsonnet +++ b/assets/ocp-performance/panels.libsonnet @@ -6,6 +6,8 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn local fieldOverride = g.panel.timeSeries.fieldOverride, local custom = timeSeries.fieldConfig.defaults.custom, local options = timeSeries.options, + local standardOptions = timeSeries.standardOptions, + local byRegexp = timeSeries.standardOptions.override.byRegexp, generic(title, unit, targets, gridPos): timeSeries.new(title) @@ -45,6 +47,16 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn + options.legend.withSortBy('Max') + options.legend.withSortDesc(true) + options.legend.withPlacement('bottom'), + + genericLegendCounterSumRightHand(title, unit, targets, gridPos): + self.genericLegendCounter(title, unit, targets, gridPos) + + options.legend.withDisplayMode('table') + + options.legend.withSortBy('Max') + + standardOptions.withOverrides([ + byRegexp.new('sum') + + byRegexp.withProperty('custom.axisPlacement', 'right') + + byRegexp.withProperty('custom.axisLabel', 'sum'), + ]), }, stat: { local stat = g.panel.stat, diff --git a/assets/ocp-performance/queries.libsonnet b/assets/ocp-performance/queries.libsonnet index 9c442ae..2ec9141 100644 --- a/assets/ocp-performance/queries.libsonnet +++ 
b/assets/ocp-performance/queries.libsonnet @@ -37,13 +37,23 @@ local generateTimeSeriesQuery(query, legend) = [ query(): generateTimeSeriesQuery('sum by (id) (( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) * 100 * on (node) group_left kube_node_role{ role = "control-plane" } )', '{{instance}}'), }, + workersCGroupMemoryRSS: { + query(): + generateTimeSeriesQuery('sum by (id) ( container_memory_rss{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"} * on (node) group_left kube_node_role{ role = "worker" } )', '{{id}}'), + }, + controlPlaneCGroupMemoryRSS: { + query(): + generateTimeSeriesQuery('sum by (id) ( container_memory_rss{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"} * on (node) group_left kube_node_role{ role = "control-plane" } )', '{{id}}'), + }, workersMemoryAvailable: { query(): - generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}'), + generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}') + + generateTimeSeriesQuery('sum( node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , 
"instance" , "$1" , "node" ,"(.*)") )', 'sum'), }, controlPlaneMemoryAvailable: { query(): - generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}'), + generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}') + + generateTimeSeriesQuery('sum( node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)") )', 'sum'), }, workersContainerThreads: { query(): diff --git a/assets/ocp-performance/variables.libsonnet b/assets/ocp-performance/variables.libsonnet index 3bd02d4..41f1b91 100644 --- a/assets/ocp-performance/variables.libsonnet +++ b/assets/ocp-performance/variables.libsonnet @@ -22,7 +22,7 @@ local var = g.dashboard.variable; + var.query.withDatasourceFromVariable(self.datasource) + var.query.queryTypes.withLabelValues( 'node', - 'kube_node_role{role=~"work.*"}', + 'kube_node_role{role="worker"}', // Exact match: the previous "work.*" pattern also matched any other role starting with "work" (e.g. workload). 
) + var.query.withRefresh(2) + var.query.selectionOptions.withMulti() diff --git a/templates/General/ocp-performance.jsonnet b/templates/General/ocp-performance.jsonnet index 4061506..462b4dc 100644 --- a/templates/General/ocp-performance.jsonnet +++ b/templates/General/ocp-performance.jsonnet @@ -36,12 +36,14 @@ g.dashboard.new('Openshift Performance') panels.timeSeries.genericLegend('Control Plane Load1', 'short', queries.controlPlanesLoad1.query(), { x: 12, y: 9, w: 12, h: 8 }), panels.timeSeries.genericLegend('Workers CGroup CPU Rate', 'short', queries.workersCGroupCpuRate.query(), { x: 0, y: 17, w: 12, h: 8 }), panels.timeSeries.genericLegend('Control Plane CGroup CPU Rate', 'short', queries.controlPlaneCGroupCpuRate.query(), { x: 12, y: 17, w: 12, h: 8 }), - panels.timeSeries.genericLegendCounter('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 25, w: 12, h: 8 }), - panels.timeSeries.genericLegendCounter('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 25, w: 12, h: 8 }), - panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 33, w: 12, h: 8 }), - panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 33, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 41, w: 12, h: 8 }), - panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 41, w: 12, h: 8 }), + panels.timeSeries.genericLegendCounterSumRightHand('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 25, w: 12, h: 8 }), + panels.timeSeries.genericLegendCounterSumRightHand('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 25, w: 12, h: 8 }), + 
panels.timeSeries.genericLegend('Workers CGroup Memory RSS', 'bytes', queries.workersCGroupMemoryRSS.query(), { x: 0, y: 33, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Control Plane CGroup Memory RSS', 'bytes', queries.controlPlaneCGroupMemoryRSS.query(), { x: 12, y: 33, w: 12, h: 8 }), + panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 41, w: 12, h: 8 }), + panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 41, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 49, w: 12, h: 8 }), + panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 49, w: 12, h: 8 }), ]), g.panel.row.new('OVN') + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })