Skip to content

Commit

Permalink
Add missing cgroup memory panels ...
Browse files Browse the repository at this point in the history
Also adds a sum line on right axis for memory available.
  • Loading branch information
afcollins committed Dec 4, 2024
1 parent f374918 commit c315d67
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 29 deletions.
12 changes: 12 additions & 0 deletions assets/ocp-performance/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
local fieldOverride = g.panel.timeSeries.fieldOverride,
local custom = timeSeries.fieldConfig.defaults.custom,
local options = timeSeries.options,
local standardOptions = timeSeries.standardOptions,
local byRegexp = timeSeries.standardOptions.override.byRegexp,

generic(title, unit, targets, gridPos):
timeSeries.new(title)
Expand Down Expand Up @@ -45,6 +47,16 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true)
+ options.legend.withPlacement('bottom'),

// Variant of genericLegendCounter for panels whose targets include an
// aggregate series with the legend 'sum'. The panel is built by
// genericLegendCounter, the legend is set to table mode sorted by 'Max',
// and a regexp field override moves the 'sum' series onto a right-hand
// axis labelled 'sum'.
//   title, unit, targets, gridPos: forwarded unchanged to genericLegendCounter.
genericLegendCounterSumRightHand(title, unit, targets, gridPos):
  // Field override selected by the regexp 'sum'; both properties are
  // attached to that single matcher.
  local sumOnRightAxis =
    byRegexp.new('sum')
    + byRegexp.withProperty('custom.axisPlacement', 'right')
    + byRegexp.withProperty('custom.axisLabel', 'sum');

  self.genericLegendCounter(title, unit, targets, gridPos)
  + options.legend.withDisplayMode('table')
  + options.legend.withSortBy('Max')
  + standardOptions.withOverrides([sumOnRightAxis]),
},
stat: {
local stat = g.panel.stat,
Expand Down
47 changes: 38 additions & 9 deletions assets/ocp-performance/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,23 @@ local generateTimeSeriesQuery(query, legend) = [
query():
generateTimeSeriesQuery('sum by (id) (( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) * 100 * on (node) group_left kube_node_role{ role = "control-plane" } )', '{{instance}}'),
},
// container_memory_rss summed per cgroup `id` for the listed system.slice
// services and kubepods.slice, joined on `node` against
// kube_node_role{role="worker"}.
workersCGroupMemoryRSS: {
  query():
    // The outer `sum by (id)` leaves `id` as the only label on the result,
    // so the legend is keyed on it; `{{instance}}` cannot resolve after
    // this aggregation.
    generateTimeSeriesQuery('sum by (id) ( container_memory_rss{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"} * on (node) group_left kube_node_role{ role = "worker" } )', '{{ id }}'),
},
// container_memory_rss summed per cgroup `id` for the listed system.slice
// services and kubepods.slice, joined on `node` against
// kube_node_role{role="control-plane"}.
controlPlaneCGroupMemoryRSS: {
  query():
    // The outer `sum by (id)` leaves `id` as the only label on the result,
    // so the legend is keyed on it; `{{instance}}` cannot resolve after
    // this aggregation.
    generateTimeSeriesQuery('sum by (id) ( container_memory_rss{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"} * on (node) group_left kube_node_role{ role = "control-plane" } )', '{{ id }}'),
},
workersMemoryAvailable: {
query():
generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}'),
generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}') +
generateTimeSeriesQuery('sum( node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)") )', 'sum'),
},
controlPlaneMemoryAvailable: {
query():
generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}'),
generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}') +
generateTimeSeriesQuery('sum( node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)") )', 'sum'),
},
workersContainerThreads: {
query():
Expand Down Expand Up @@ -114,9 +124,23 @@ local generateTimeSeriesQuery(query, legend) = [
query(nodeName):
generateTimeSeriesQuery('topk(10, container_memory_rss{container!="POD",name!="",node=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', '{{ pod }}: {{ container }}'),
},
containerWriteBytes: {
// Per-cgroup CPU usage for the node(s) matched by `nodeName`: the rate of
// container_cpu_usage_seconds_total over $interval, summed by cgroup `id`
// and multiplied by 100.
nodeCGroupCPU: {
  query(nodeName):
    // nodeName is spliced into a regex matcher (node=~"..."), so it may be a
    // dashboard variable or a pattern.
    local expr = 'sum by (id) ( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice", node=~"' + nodeName + '"}[$interval])) * 100';
    local legend = '{{ id }}';
    generateTimeSeriesQuery(expr, legend),
},
nodeCGroupRSS: {
query(nodeName):
generateTimeSeriesQuery('sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", container!=""}[$interval])) by (device, container)', '{{ container }}: {{ device }}'),
generateTimeSeriesQuery('sum by (id) ( container_memory_rss{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice", node=~"' + nodeName + '"})', '{{ id }}'),
},
// Filesystem write and read byte rates per (device, pod) on the node(s)
// matched by `nodeName`. Devices matching ".+dm.+" and series with an empty
// `pod` label are excluded. The result is the write targets followed by the
// read targets.
containerReadWriteBytesPod: {
  query(nodeName):
    local writeTargets = generateTimeSeriesQuery(
      'sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", pod!=""}[$interval])) by (device, pod)',
      '{{ pod }}: {{ device }} - write'
    );
    local readTargets = generateTimeSeriesQuery(
      'sum(rate(container_fs_reads_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", pod!=""}[$interval])) by (device, pod)',
      '{{ pod }}: {{ device }} - read'
    );
    writeTargets + readTargets,
},
// Filesystem write and read byte rates per (device, cgroup id) on the
// node(s) matched by `nodeName`, limited to the listed system.slice services
// and kubepods.slice. Devices matching ".+dm.+" are excluded. The result is
// the write targets followed by the read targets.
// NOTE(review): the id matcher here does not list the bare "/system.slice"
// entry — presumably intentional; confirm.
containerReadWriteBytesCGroup: {
  query(nodeName):
    local writeTargets = generateTimeSeriesQuery(
      'sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", id =~"/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) by (device, id)',
      '{{ id }}: {{ device }} - write'
    );
    local readTargets = generateTimeSeriesQuery(
      'sum(rate(container_fs_reads_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", id =~"/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) by (device, id)',
      '{{ id }}: {{ device }} - read'
    );
    writeTargets + readTargets,
},
stackroxCPU: {
query():
Expand Down Expand Up @@ -186,23 +210,28 @@ local generateTimeSeriesQuery(query, legend) = [
},
kubeletCPU: {
query():
generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100)', 'kubelet - {{node}}'),
generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100 * on (node) group_left kube_node_role{ role = "worker" })', 'kubelet - {{node}}'),
},
crioCPU: {
query():
generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="crio"}[$interval])*100)', 'crio - {{node}}'),
generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="crio"}[$interval])*100 * on (node) group_left kube_node_role{ role = "worker" })', 'crio - {{node}}'),
},
kubeletMemory: {
query():
generateTimeSeriesQuery('topk(10,process_resident_memory_bytes{service="kubelet",job="kubelet"})', 'kubelet - {{node}}'),
generateTimeSeriesQuery('topk(10,process_resident_memory_bytes{service="kubelet",job="kubelet"} * on (node) group_left kube_node_role{ role = "worker" })', 'kubelet - {{node}}'),
},
crioMemory: {
query():
generateTimeSeriesQuery('topk(10,process_resident_memory_bytes{service="kubelet",job="crio"})', 'crio - {{node}}'),
generateTimeSeriesQuery('topk(10,process_resident_memory_bytes{service="kubelet",job="crio"} * on (node) group_left kube_node_role{ role = "worker" })', 'crio - {{node}}'),
},
crioINodes: {
query():
generateTimeSeriesQuery('(1 - node_filesystem_files_free{fstype!="",mountpoint="/run"} / node_filesystem_files{fstype!="",mountpoint="/run"}) * 100', '/var/run - {{instance}}'),
generateTimeSeriesQuery('(1 - node_filesystem_files_free{fstype!="",mountpoint="/run"} / node_filesystem_files{fstype!="",mountpoint="/run"}) * 100', '{{instance}}'),
},
// Number of inodes in use on the filesystem mounted at /run, computed as
// node_filesystem_files minus node_filesystem_files_free. One series per
// instance is returned, followed by a single aggregate with the legend 'sum'.
crioINodesCount: {
  query():
    local perInstance = generateTimeSeriesQuery(
      'node_filesystem_files{fstype!="",mountpoint="/run"} - node_filesystem_files_free{fstype!="",mountpoint="/run"}',
      '{{instance}}'
    );
    local total = generateTimeSeriesQuery(
      'sum(node_filesystem_files{fstype!="",mountpoint="/run"} - node_filesystem_files_free{fstype!="",mountpoint="/run"})',
      'sum'
    );
    perInstance + total,
},
currentNodeCount: {
query():
Expand Down
2 changes: 1 addition & 1 deletion assets/ocp-performance/variables.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ local var = g.dashboard.variable;
+ var.query.withDatasourceFromVariable(self.datasource)
+ var.query.queryTypes.withLabelValues(
'node',
'kube_node_role{role=~"work.*"}',
'kube_node_role{role=~"worker"}', // Do we want to include the workload node for some reason? Why is it a regexp?
)
+ var.query.withRefresh(2)
+ var.query.selectionOptions.withMulti()
Expand Down
46 changes: 27 additions & 19 deletions templates/General/ocp-performance.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@ g.dashboard.new('Openshift Performance')
panels.timeSeries.genericLegend('Control Plane CPU Usage', 'percent', queries.controlPlanesCPU.query(), { x: 12, y: 2, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Workers Load1', 'short', queries.workersLoad1.query(), { x: 0, y: 9, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Control Plane Load1', 'short', queries.controlPlanesLoad1.query(), { x: 12, y: 9, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Workers CGroup CPU Rate', 'short', queries.workersCGroupCpuRate.query(), { x: 0, y: 17, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Control Plane CGroup CPU Rate', 'short', queries.controlPlaneCGroupCpuRate.query(), { x: 12, y: 17, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 25, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 25, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 33, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 33, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 41, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 41, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounterSumRightHand('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 17, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounterSumRightHand('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 17, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Workers CGroup CPU Rate', 'short', queries.workersCGroupCpuRate.query(), { x: 0, y: 25, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Control Plane CGroup CPU Rate', 'short', queries.controlPlaneCGroupCpuRate.query(), { x: 12, y: 25, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Workers CGroup Memory RSS', 'bytes', queries.workersCGroupMemoryRSS.query(), { x: 0, y: 33, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Control Plane CGroup Memory RSS', 'bytes', queries.controlPlaneCGroupMemoryRSS.query(), { x: 12, y: 33, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 41, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 41, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 49, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 49, w: 12, h: 8 }),
]),
g.panel.row.new('OVN')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
Expand Down Expand Up @@ -81,9 +83,10 @@ g.dashboard.new('Openshift Performance')
+ g.panel.row.withPanels([
panels.timeSeries.genericLegend('Top 10 Kubelet CPU usage', 'percent', queries.kubeletCPU.query(), { x: 0, y: 3, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 crio CPU usage', 'percent', queries.crioCPU.query(), { x: 12, y: 3, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 Kubelet memory usage', 'bytes', queries.kubeletMemory.query(), { x: 0, y: 11, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 crio memory usage', 'bytes', queries.crioMemory.query(), { x: 12, y: 11, w: 12, h: 8 }),
panels.timeSeries.genericLegend('inodes usage in /var/run', 'percent', queries.crioINodes.query(), { x: 0, y: 19, w: 24, h: 8 }),
panels.timeSeries.genericLegendCounter('Top 10 Kubelet memory usage', 'bytes', queries.kubeletMemory.query(), { x: 0, y: 11, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Top 10 crio memory usage', 'bytes', queries.crioMemory.query(), { x: 12, y: 11, w: 12, h: 8 }),
panels.timeSeries.genericLegend('inodes usage in /run', 'percent', queries.crioINodes.query(), { x: 0, y: 19, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounterSumRightHand('inodes count in /run', 'none', queries.crioINodesCount.query(), { x: 12, y: 19, w: 12, h: 8 }),
]),
g.panel.row.new('Cluster Details')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
Expand Down Expand Up @@ -120,32 +123,37 @@ g.dashboard.new('Openshift Performance')
+ g.panel.row.withRepeat('_master_node')
+ g.panel.row.withPanels([
panels.timeSeries.genericLegend('CPU Basic: $_master_node', 'percent', queries.nodeCPU.query('$_master_node'), { x: 0, y: 0, w: 12, h: 8 }),
panels.timeSeries.genericLegend('System Memory: $_master_node', 'bytes', queries.nodeMemory.query('$_master_node'), { x: 12, y: 0, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('System Memory: $_master_node', 'bytes', queries.nodeMemory.query('$_master_node'), { x: 12, y: 0, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Disk throughput: $_master_node', 'Bps', queries.diskThroughput.query('$_master_node'), { x: 0, y: 8, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Disk IOPS: $_master_node', 'iops', queries.diskIOPS.query('$_master_node'), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Network Utilization: $_master_node', 'bps', queries.networkUtilization.query('$_master_node'), { x: 0, y: 16, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Network Packets: $_master_node', 'pps', queries.networkPackets.query('$_master_node'), { x: 12, y: 16, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Network packets drop: $_master_node', 'pps', queries.networkDrop.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Conntrack stats: $_master_node', '', queries.conntrackStats.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Conntrack stats: $_master_node', '', queries.conntrackStats.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 container CPU: $_master_node', 'percent', queries.top10ContainerCPU.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 container RSS: $_master_node', 'bytes', queries.top10ContainerRSS.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Container fs write rate: $_master_node', 'Bps', queries.containerWriteBytes.query('$_master_node'), { x: 0, y: 32, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Top 10 container RSS: $_master_node', 'bytes', queries.top10ContainerRSS.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('cgroup CPU: $_master_node', 'percent', queries.nodeCGroupCPU.query('$_master_node'), { x: 0, y: 32, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('cgroup RSS: $_master_node', 'bytes', queries.nodeCGroupRSS.query('$_master_node'), { x: 12, y: 32, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Pod fs write rate: $_master_node', 'Bps', queries.containerReadWriteBytesPod.query('$_master_node'), { x: 0, y: 32, w: 12, h: 8 }),
panels.timeSeries.genericLegend('cgroup fs write rate: $_master_node', 'Bps', queries.containerReadWriteBytesCGroup.query('$_master_node'), { x: 12, y: 32, w: 12, h: 8 }),
]),
g.panel.row.new('Worker: $_worker_node')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withRepeat('_worker_node')
+ g.panel.row.withPanels([
panels.timeSeries.genericLegend('CPU Basic: $_worker_node', 'percent', queries.nodeCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }),
panels.timeSeries.genericLegend('System Memory: $_worker_node', 'bytes', queries.nodeMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('System Memory: $_worker_node', 'bytes', queries.nodeMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPU.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSS.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 container CPU: $_worker_node', 'percent', queries.top10ContainerCPU.query('$_worker_node'), { x: 0, y: 32, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSS.query('$_worker_node'), { x: 12, y: 32, w: 12, h: 8 }),
panels.timeSeries.genericLegend('cgroup CPU: $_worker_node', 'percent', queries.nodeCGroupCPU.query('$_worker_node'), { x: 0, y: 40, w: 12, h: 8 }),
panels.timeSeries.genericLegendCounter('cgroup RSS: $_worker_node', 'bytes', queries.nodeCGroupRSS.query('$_worker_node'), { x: 12, y: 40, w: 12, h: 8 }),
]),
g.panel.row.new('Infra: $_infra_node')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 0, h: 8 })
Expand Down

0 comments on commit c315d67

Please sign in to comment.