From 1b03ca2ba6ac38680f8a15b59ce7416863fefd0a Mon Sep 17 00:00:00 2001
From: Andrew Collins <ancollin@redhat.com>
Date: Fri, 27 Sep 2024 14:02:00 -0500
Subject: [PATCH] Node panels (#136)

* OCP dashboard changes
* Add panels that show a cluster-wide view
* Makefile and variables changes
* Panel and legend updates
* Apply similar changes to the OVN dashboard; remove old queries

---------

Signed-off-by: Andrew Collins <ancollin@redhat.com>
---
 Makefile                                   |  6 ++-
 assets/ocp-performance/panels.libsonnet    | 13 +++++
 assets/ocp-performance/queries.libsonnet   | 59 ++++++++++++++++++++--
 assets/ocp-performance/variables.libsonnet |  3 +-
 assets/ovn-monitoring/queries.libsonnet    | 23 +++------
 templates/General/ocp-performance.jsonnet  | 39 ++++++++++----
 templates/General/ovn-dashboard.jsonnet    |  8 ++-
 7 files changed, 111 insertions(+), 40 deletions(-)

diff --git a/Makefile b/Makefile
index f02e620..cf893fe 100644
--- a/Makefile
+++ b/Makefile
@@ -30,10 +30,14 @@ format: deps
 
 build: deps $(LIBRARY_PATH) $(outputs)
 
-clean:
+clean-all:  # Remove $(ALLDIRS) and $(TEMPLATESDIR)/vendor
 	@echo "Cleaning up"
 	rm -rf $(ALLDIRS) $(TEMPLATESDIR)/vendor
 
+clean:  # Remove only $(OUTPUTDIR); clean-all removes $(ALLDIRS) and $(TEMPLATESDIR)/vendor
+	@echo "Cleaning up"
+	rm -rf $(OUTPUTDIR)
+
 $(BINDIR)/jsonnet:
 	@echo "Downloading jsonnet binary"
 	curl -s -L $(JSONNET) | tar xz -C $(BINDIR)
diff --git a/assets/ocp-performance/panels.libsonnet b/assets/ocp-performance/panels.libsonnet
index 3baee2f..0c47f93 100644
--- a/assets/ocp-performance/panels.libsonnet
+++ b/assets/ocp-performance/panels.libsonnet
@@ -26,8 +26,21 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
       + options.legend.withShowLegend(true)
       + options.legend.withCalcs([
         'mean',
+        'min',
         'max',
+      ])
+      + options.legend.withSortBy('Max')
+      + options.legend.withSortDesc(true)
+      + options.legend.withPlacement('bottom'),
+
+    genericLegendCounter(title, unit, targets, gridPos):
+      self.generic(title, unit, targets, gridPos)
+      + options.legend.withShowLegend(true)
+      + options.legend.withCalcs([
+        'first',
         'min',
+        'max',
+        'last',
       ])
       + options.legend.withSortBy('Max')
       + options.legend.withSortDesc(true)
diff --git a/assets/ocp-performance/queries.libsonnet b/assets/ocp-performance/queries.libsonnet
index ffa49e7..9c442ae 100644
--- a/assets/ocp-performance/queries.libsonnet
+++ b/assets/ocp-performance/queries.libsonnet
@@ -13,6 +13,57 @@ local generateTimeSeriesQuery(query, legend) = [
 ];
 
 {
+  workersCPU: {  // non-idle node CPU rate * 100, one series per instance, joined to kube_node_role{role="worker"}
+    query():
+      generateTimeSeriesQuery('sum( rate( (node_cpu_seconds_total{ mode != "idle" } * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)") )[$interval:] ) ) by (instance) * 100', '{{instance}}'),
+  },
+  controlPlanesCPU: {  // non-idle node CPU rate * 100, one series per instance, joined to kube_node_role{role="control-plane"}
+    query():
+      generateTimeSeriesQuery('sum( rate( (node_cpu_seconds_total{ mode != "idle" } * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)") )[$interval:] ) ) by (instance) * 100', '{{instance}}'),
+  },
+  workersLoad1: {  // node_load1 per instance, joined to kube_node_role{role="worker"} (node label copied into instance)
+    query():
+      generateTimeSeriesQuery('node_load1 * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)") ', '{{instance}}'),
+  },
+  controlPlanesLoad1: {  // node_load1 per instance, joined to kube_node_role{role="control-plane"} (node label copied into instance)
+    query():
+      generateTimeSeriesQuery('node_load1 * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)") ', '{{instance}}'),
+  },
+  workersCGroupCpuRate: {  // CPU rate * 100 of selected cgroup ids, joined to kube_node_role{role="worker"} and summed per id
+    query():
+      generateTimeSeriesQuery('sum by (id) (( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) * 100 * on (node) group_left kube_node_role{ role = "worker" } )', '{{id}}'),  // legend uses id: the only label kept by "sum by (id)"
+  },
+  controlPlaneCGroupCpuRate: {  // CPU rate * 100 of selected cgroup ids, joined to kube_node_role{role="control-plane"} and summed per id
+    query():
+      generateTimeSeriesQuery('sum by (id) (( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) * 100 * on (node) group_left kube_node_role{ role = "control-plane" } )', '{{id}}'),  // legend uses id: the only label kept by "sum by (id)"
+  },
+  workersMemoryAvailable: {  // node_memory_MemAvailable_bytes per instance, joined to kube_node_role{role="worker"}
+    query():
+      generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}'),
+  },
+  controlPlaneMemoryAvailable: {  // node_memory_MemAvailable_bytes per instance, joined to kube_node_role{role="control-plane"}
+    query():
+      generateTimeSeriesQuery('node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)")', '{{instance}}'),
+  },
+  workersContainerThreads: {  // container_threads (container != "") summed per node, joined to kube_node_role{role="worker"}
+    query():
+      generateTimeSeriesQuery('sum by (node) (container_threads{ container!=""})  * on (node) group_left kube_node_role{ role = "worker" }', '{{node}}'),  // legend uses node: the only label kept by "sum by (node)"
+  },
+  controlPlaneContainerThreads: {  // container_threads (container != "") summed per node, joined to kube_node_role{role="control-plane"}
+    query():
+      generateTimeSeriesQuery('sum by (node) (container_threads{ container!=""})  * on (node) group_left kube_node_role{ role = "control-plane" }', '{{node}}'),  // legend uses node: the only label kept by "sum by (node)"
+  },
+  workersIOPS: {  // two concatenated targets: completed disk reads and writes, as rates, joined to kube_node_role{role="worker"}
+    query():
+      generateTimeSeriesQuery('rate( (  node_disk_reads_completed_total *  on (instance) group_left label_replace( kube_node_role{ role = "worker" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])', '{{instance}} - {{ device }} - read') +
+      generateTimeSeriesQuery('rate( (  node_disk_writes_completed_total *  on (instance) group_left label_replace( kube_node_role{ role = "worker" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])', '{{instance}} - {{ device }} - write'),
+  },
+  controlPlaneIOPS: {  // two concatenated targets: completed disk reads and writes, as rates, joined to kube_node_role{role="control-plane"}
+    query():
+      generateTimeSeriesQuery('rate( (  node_disk_reads_completed_total *  on (instance) group_left label_replace( kube_node_role{ role = "control-plane" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])', '{{instance}} - {{ device }} - read') +
+      generateTimeSeriesQuery('rate( (  node_disk_writes_completed_total *  on (instance) group_left label_replace( kube_node_role{ role = "control-plane" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])', '{{instance}} - {{ device }} - write'),
+  },
+
   nodeMemory: {
     query(nodeName):
       generateTimeSeriesQuery('node_memory_Active_bytes{instance=~"' + nodeName + '"}', 'Active')
@@ -99,19 +150,19 @@ local generateTimeSeriesQuery(query, legend) = [
   },
   ovnKubeControlPlaneCPU: {
     query():
-      generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[$interval])*100', '{{container}}-{{pod}}-{{node}}'),
+      generateTimeSeriesQuery('sum( irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[$interval])*100 ) by (pod, node)', '{{pod}} - {{node}}'),  // per-container rates summed into one series per (pod, node)
   },
   ovnKubeControlPlaneMem: {
     query():
-      generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{container}}-{{pod}}-{{node}}'),
+      generateTimeSeriesQuery('sum( container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"} ) by (pod, node)', '{{pod}} - {{node}}'),  // summed per (pod, node) so each legend entry maps to exactly one series, matching the CPU query
   },
   topOvnControllerCPU: {
     query():
-      generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[$interval])*100)', '{{node}}'),
+      generateTimeSeriesQuery('topk(10, sum( irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[$interval])*100)  by (pod,node) )', '{{pod}} - {{node}}'),  // topk is applied to the per-(pod, node) sums
   },
   topOvnControllerMem: {
     query():
-      generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}'),
+      generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'),  // RSS is summed by (pod, node) before topk so the pod label is available to the legend
   },
   promReplCpuUsage: {
     query():
diff --git a/assets/ocp-performance/variables.libsonnet b/assets/ocp-performance/variables.libsonnet
index 64ae056..3bd02d4 100644
--- a/assets/ocp-performance/variables.libsonnet
+++ b/assets/ocp-performance/variables.libsonnet
@@ -3,8 +3,7 @@ local var = g.dashboard.variable;
 
 {
   datasource:
-    var.datasource.new('datasource', 'prometheus')
-    + var.datasource.withRegex('/^Cluster Prometheus$/'),
+    var.datasource.new('datasource', 'prometheus'),  // no withRegex() restriction is applied to this variable
 
   master_node:
     var.query.new('_master_node')
diff --git a/assets/ovn-monitoring/queries.libsonnet b/assets/ovn-monitoring/queries.libsonnet
index dba2e05..3459111 100644
--- a/assets/ovn-monitoring/queries.libsonnet
+++ b/assets/ovn-monitoring/queries.libsonnet
@@ -10,9 +10,9 @@ local generateTimeSeriesQuery(query, legend) = [
 ];
 
 {
-  ovnMasterLeader: {
+  ovnClusterManagerLeader: {
     query():
-      generateTimeSeriesQuery('ovnkube_master_leader', '{{pod}}'),
+      generateTimeSeriesQuery('ovnkube_clustermanager_leader > 0', '{{pod}}'),  // "> 0" keeps only series whose current value is positive
   },
 
   ovnNorthd: {
@@ -20,16 +20,6 @@ local generateTimeSeriesQuery(query, legend) = [
       generateTimeSeriesQuery('ovn_northd_status', '{{pod}}'),
   },
 
-  ovnNbdbLeader: {
-    query():
-      generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Northbound"}', '{{pod}}'),
-  },
-
-  ovnSbdbLeader: {
-    query():
-      generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Southbound"}', '{{pod}}'),
-  },
-
   numOnvController: {
     query():
       generateTimeSeriesQuery('count(ovn_controller_monitor_all) by (namespace)', ''),
@@ -37,22 +27,21 @@ local generateTimeSeriesQuery(query, legend) = [
 
   ovnKubeControlPlaneCPU: {
     query():
-      generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100','{{container}}-{{pod}}-{{node}}'),
+      generateTimeSeriesQuery('sum( irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100 ) by (pod, node)', '{{pod}} - {{node}}'),  // per-container rates summed into one series per (pod, node)
   },
 
   ovnKubeControlPlaneMem: {
     query():
-      generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}','{{container}}-{{pod}}-{{node}}'),
+      generateTimeSeriesQuery('sum( container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"} ) by (pod, node)', '{{pod}} - {{node}}'),  // summed per (pod, node) so each legend entry maps to exactly one series, matching the CPU query
   },
 
   topOvnControllerCPU: {
     query():
-      generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)', '{{node}}'),
+      generateTimeSeriesQuery('topk(10, sum( irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)  by (pod,node) )', '{{pod}} - {{node}}'),  // topk is applied to the per-(pod, node) sums
   },
-
   topOvnControllerMem: {
     query():
-      generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}'),
+      generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'),  // RSS is summed by (pod, node) before topk so the pod label is available to the legend
   },
 
   ovnAnnotationLatency: {
diff --git a/templates/General/ocp-performance.jsonnet b/templates/General/ocp-performance.jsonnet
index c3be94f..4061506 100644
--- a/templates/General/ocp-performance.jsonnet
+++ b/templates/General/ocp-performance.jsonnet
@@ -26,21 +26,38 @@ g.dashboard.new('Openshift Performance')
   variables.interval,
 ])
 + g.dashboard.withPanels([
+  g.panel.row.new('Cluster-at-a-Glance')  // collapsed row; panels are laid out as worker (x: 0) / control plane (x: 12) pairs, h: 8, at y = 1, 9, 17, 25, 33, 41
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('Workers CPU Usage', 'percent', queries.workersCPU.query(), { x: 0, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Control Plane CPU Usage', 'percent', queries.controlPlanesCPU.query(), { x: 12, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Workers Load1', 'short', queries.workersLoad1.query(), { x: 0, y: 9, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Control Plane Load1', 'short', queries.controlPlanesLoad1.query(), { x: 12, y: 9, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Workers CGroup CPU Rate', 'short', queries.workersCGroupCpuRate.query(), { x: 0, y: 17, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Control Plane CGroup CPU Rate', 'short', queries.controlPlaneCGroupCpuRate.query(), { x: 12, y: 17, w: 12, h: 8 }),
+    panels.timeSeries.genericLegendCounter('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 25, w: 12, h: 8 }),
+    panels.timeSeries.genericLegendCounter('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 25, w: 12, h: 8 }),
+    panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 33, w: 12, h: 8 }),
+    panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 33, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 41, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 41, w: 12, h: 8 }),
+  ]),
   g.panel.row.new('OVN')
   + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
   + g.panel.row.withCollapsed(true)
   + g.panel.row.withPanels([
-    panels.timeSeries.genericLegend('ovs-master CPU Usage', 'percent', queries.OVSCPU.query('$_master_node'), { x: 0, y: 21, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('ovs-master Memory Usage', 'bytes', queries.OVSMemory.query('$_master_node'), { x: 12, y: 21, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('ovs-worker CPU Usage', 'percent', queries.OVSCPU.query('$_worker_node'), { x: 0, y: 21, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('ovs-worker Memory Usage', 'bytes', queries.OVSMemory.query('$_worker_node'), { x: 12, y: 21, w: 12, h: 8 }),
-    panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 12 }),
-    panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }),
-    panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 21, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 21, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 28, w: 12, h: 8 }),
-    panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 28, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 1, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 9, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 9, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovs-master CPU Usage', 'percent', queries.OVSCPU.query('$_master_node'), { x: 0, y: 17, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovs-master Memory Usage', 'bytes', queries.OVSMemory.query('$_master_node'), { x: 12, y: 17, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovs-worker CPU Usage', 'percent', queries.OVSCPU.query('$_worker_node'), { x: 0, y: 25, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovs-worker Memory Usage', 'bytes', queries.OVSMemory.query('$_worker_node'), { x: 12, y: 25, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 33, w: 8, h: 8 }),  // the three latency panels share one row: 8-wide each at y: 33
+    panels.timeSeries.genericLegend('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 8, y: 33, w: 8, h: 8 }),
+    panels.timeSeries.genericLegend('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 16, y: 33, w: 8, h: 8 }),
   ]),
   g.panel.row.new('Monitoring stack')
   + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
diff --git a/templates/General/ovn-dashboard.jsonnet b/templates/General/ovn-dashboard.jsonnet
index daca8c5..8fe8fcd 100644
--- a/templates/General/ovn-dashboard.jsonnet
+++ b/templates/General/ovn-dashboard.jsonnet
@@ -26,11 +26,9 @@ g.dashboard.new('OVN-Monitoring-dashboard')
   + g.panel.row.withCollapsed(true)
   + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
   + g.panel.row.withPanels([
-    panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnMasterLeader.query(), { x: 0, y: 0, w: 4, h: 4 }),
-    panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 4, y: 0, w: 4, h: 4 }),
-    panels.stat.genericstatThresoldPanel('OVN NBDB leader', 'none', queries.ovnNbdbLeader.query(), { x: 8, y: 0, w: 4, h: 4 }),
-    panels.stat.genericstatThresoldPanel('OVN SBDB leader', 'none', queries.ovnSbdbLeader.query(), { x: 12, y: 0, w: 4, h: 4 }),
-    panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 4, h: 4 }),
+    panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnClusterManagerLeader.query(), { x: 0, y: 0, w: 8, h: 4 }),
+    panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 8, y: 0, w: 8, h: 4 }),
+    panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 8, h: 4 }),
     panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 4, w: 12, h: 10 }),
     panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 4, w: 12, h: 10 }),
     panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 12, w: 12, h: 10 }),