Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Node panels #136

Merged
merged 3 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,14 @@ format: deps

build: deps $(LIBRARY_PATH) $(outputs)

clean:
clean-all:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New `clean-all` make target so the downloaded binaries can still be deleted on demand, while the regular `clean` target no longer deletes them every time.

@echo "Cleaning up"
rm -rf $(ALLDIRS) $(TEMPLATESDIR)/vendor

# Remove only $(OUTPUTDIR). The separate clean-all target is the one that
# removes $(ALLDIRS) and $(TEMPLATESDIR)/vendor.
clean:
@echo "Cleaning up"
rm -rf $(OUTPUTDIR)

$(BINDIR)/jsonnet:
@echo "Downloading jsonnet binary"
curl -s -L $(JSONNET) | tar xz -C $(BINDIR)
Expand Down
13 changes: 13 additions & 0 deletions assets/ocp-performance/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,21 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn
+ options.legend.withShowLegend(true)
+ options.legend.withCalcs([
'mean',
'min',
'max',
])
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true)
+ options.legend.withPlacement('bottom'),

genericLegendCounter(title, unit, targets, gridPos):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New panel type with different legend fields, more relevant for counters and memory.

self.generic(title, unit, targets, gridPos)
+ options.legend.withShowLegend(true)
+ options.legend.withCalcs([
'first',
'min',
'max',
'last',
])
+ options.legend.withSortBy('Max')
+ options.legend.withSortDesc(true)
Expand Down
59 changes: 55 additions & 4 deletions assets/ocp-performance/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,57 @@ local generateTimeSeriesQuery(query, legend) = [
];

{
workersCPU: {
  // Non-idle CPU usage per instance, scaled by 100, restricted to nodes whose
  // kube_node_role series has role="worker".
  // label_replace copies the `node` label of kube_node_role into `instance` so
  // the role series can be joined to node_cpu_seconds_total on (instance); the
  // rate is taken over a [$interval:] subquery and summed by instance.
  query():
    local expr = 'sum( rate( (node_cpu_seconds_total{ mode != "idle" } * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)") )[$interval:] ) ) by (instance) * 100';
    local legend = '{{instance}}';
    generateTimeSeriesQuery(expr, legend),
},
controlPlanesCPU: {
  // Non-idle CPU usage per instance, scaled by 100, restricted to nodes whose
  // kube_node_role series has role="control-plane".
  // label_replace copies the `node` label of kube_node_role into `instance` so
  // the role series can be joined to node_cpu_seconds_total on (instance); the
  // rate is taken over a [$interval:] subquery and summed by instance.
  query():
    local expr = 'sum( rate( (node_cpu_seconds_total{ mode != "idle" } * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)") )[$interval:] ) ) by (instance) * 100';
    local legend = '{{instance}}';
    generateTimeSeriesQuery(expr, legend),
},
workersLoad1: {
  // node_load1 for nodes whose kube_node_role series has role="worker".
  // The role series gets its `node` label copied into `instance` via
  // label_replace so it can be matched to node_load1 on (instance).
  query():
    local expr = 'node_load1 * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)") ';
    local legend = '{{instance}}';
    generateTimeSeriesQuery(expr, legend),
},
controlPlanesLoad1: {
  // node_load1 for nodes whose kube_node_role series has role="control-plane".
  // The role series gets its `node` label copied into `instance` via
  // label_replace so it can be matched to node_load1 on (instance).
  query():
    local expr = 'node_load1 * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)") ';
    local legend = '{{instance}}';
    generateTimeSeriesQuery(expr, legend),
},
workersCGroupCpuRate: {
  // CPU usage rate (scaled by 100) of a fixed list of cgroup ids (system.slice,
  // selected system services and kubepods.slice), joined on (node) with
  // kube_node_role{role="worker"} and summed per cgroup id.
  // The outer `sum by (id)` leaves `id` as the only label on the result, so the
  // legend must be built from `id`; an `{{instance}}` legend would never
  // resolve and every series would be shown without a name.
  query():
    generateTimeSeriesQuery('sum by (id) (( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) * 100 * on (node) group_left kube_node_role{ role = "worker" } )', '{{id}}'),
},
controlPlaneCGroupCpuRate: {
  // CPU usage rate (scaled by 100) of a fixed list of cgroup ids (system.slice,
  // selected system services and kubepods.slice), joined on (node) with
  // kube_node_role{role="control-plane"} and summed per cgroup id.
  // The outer `sum by (id)` leaves `id` as the only label on the result, so the
  // legend must be built from `id`; an `{{instance}}` legend would never
  // resolve and every series would be shown without a name.
  query():
    generateTimeSeriesQuery('sum by (id) (( rate(container_cpu_usage_seconds_total{ job=~".*", id =~"/system.slice|/system.slice/kubelet.service|/system.slice/ovs-vswitchd.service|/system.slice/crio.service|/system.slice/systemd-journald.service|/system.slice/ovsdb-server.service|/system.slice/systemd-udevd.service|/kubepods.slice"}[$interval])) * 100 * on (node) group_left kube_node_role{ role = "control-plane" } )', '{{id}}'),
},
workersMemoryAvailable: {
  // node_memory_MemAvailable_bytes for nodes whose kube_node_role series has
  // role="worker". label_replace copies the role series' `node` label into
  // `instance` so the two metrics can be matched on (instance).
  query():
    local expr = 'node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "worker"} , "instance" , "$1" , "node" ,"(.*)")';
    local legend = '{{instance}}';
    generateTimeSeriesQuery(expr, legend),
},
controlPlaneMemoryAvailable: {
  // node_memory_MemAvailable_bytes for nodes whose kube_node_role series has
  // role="control-plane". label_replace copies the role series' `node` label
  // into `instance` so the two metrics can be matched on (instance).
  query():
    local expr = 'node_memory_MemAvailable_bytes * on (instance) group_left label_replace( kube_node_role{ role = "control-plane"} , "instance" , "$1" , "node" ,"(.*)")';
    local legend = '{{instance}}';
    generateTimeSeriesQuery(expr, legend),
},
workersContainerThreads: {
  // Total container_threads per node (series with an empty `container` label
  // are excluded), restricted to nodes with kube_node_role{role="worker"}.
  // `sum by (node)` leaves `node` as the only label on the left-hand side, and
  // group_left keeps the left-hand labels, so the legend must use `node`; an
  // `{{instance}}` legend would never resolve.
  query():
    generateTimeSeriesQuery('sum by (node) (container_threads{ container!=""}) * on (node) group_left kube_node_role{ role = "worker" }', '{{node}}'),
},
controlPlaneContainerThreads: {
  // Total container_threads per node (series with an empty `container` label
  // are excluded), restricted to nodes with kube_node_role{role="control-plane"}.
  // `sum by (node)` leaves `node` as the only label on the left-hand side, and
  // group_left keeps the left-hand labels, so the legend must use `node`; an
  // `{{instance}}` legend would never resolve.
  query():
    generateTimeSeriesQuery('sum by (node) (container_threads{ container!=""}) * on (node) group_left kube_node_role{ role = "control-plane" }', '{{node}}'),
},
workersIOPS: {
  // Two targets, concatenated into one list: the rate of completed disk reads
  // and the rate of completed disk writes, each restricted to nodes with
  // kube_node_role{role="worker"} and labelled per instance and device.
  query():
    local reads = generateTimeSeriesQuery(
      'rate( ( node_disk_reads_completed_total * on (instance) group_left label_replace( kube_node_role{ role = "worker" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])',
      '{{instance}} - {{ device }} - read'
    );
    local writes = generateTimeSeriesQuery(
      'rate( ( node_disk_writes_completed_total * on (instance) group_left label_replace( kube_node_role{ role = "worker" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])',
      '{{instance}} - {{ device }} - write'
    );
    reads + writes,
},
controlPlaneIOPS: {
  // Two targets, concatenated into one list: the rate of completed disk reads
  // and the rate of completed disk writes, each restricted to nodes with
  // kube_node_role{role="control-plane"} and labelled per instance and device.
  query():
    local reads = generateTimeSeriesQuery(
      'rate( ( node_disk_reads_completed_total * on (instance) group_left label_replace( kube_node_role{ role = "control-plane" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])',
      '{{instance}} - {{ device }} - read'
    );
    local writes = generateTimeSeriesQuery(
      'rate( ( node_disk_writes_completed_total * on (instance) group_left label_replace( kube_node_role{ role = "control-plane" } , "instance" , "$1" , "node" ,"(.*)") )[$interval:])',
      '{{instance}} - {{ device }} - write'
    );
    reads + writes,
},

nodeMemory: {
query(nodeName):
generateTimeSeriesQuery('node_memory_Active_bytes{instance=~"' + nodeName + '"}', 'Active')
Expand Down Expand Up @@ -99,19 +150,19 @@ local generateTimeSeriesQuery(query, legend) = [
},
ovnKubeControlPlaneCPU: {
query():
generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[$interval])*100', '{{container}}-{{pod}}-{{node}}'),
generateTimeSeriesQuery('sum( irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[$interval])*100 ) by (pod, node)', '{{pod}} - {{node}}'),
},
ovnKubeControlPlaneMem: {
query():
generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{container}}-{{pod}}-{{node}}'),
generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}} - {{node}}'),
},
topOvnControllerCPU: {
query():
generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[$interval])*100)', '{{node}}'),
generateTimeSeriesQuery('topk(10, sum( irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[$interval])*100) by (pod,node) )', '{{pod}} - {{node}}'),
},
topOvnControllerMem: {
query():
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}'),
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'),
},
promReplCpuUsage: {
query():
Expand Down
3 changes: 1 addition & 2 deletions assets/ocp-performance/variables.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ local var = g.dashboard.variable;

{
datasource:
var.datasource.new('datasource', 'prometheus')
+ var.datasource.withRegex('/^Cluster Prometheus$/'),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When using the dashboard against a Prometheus outside of OpenShift, I have to update this variable after every import. Instead, I am removing the regex filter for good.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirmed on ROSA cluster that the dashboard variable auto-populates to 'Cluster Prometheus'

var.datasource.new('datasource', 'prometheus'),

master_node:
var.query.new('_master_node')
Expand Down
23 changes: 6 additions & 17 deletions assets/ovn-monitoring/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,49 +10,38 @@ local generateTimeSeriesQuery(query, legend) = [
];

{
ovnMasterLeader: {
ovnClusterManagerLeader: {
query():
generateTimeSeriesQuery('ovnkube_master_leader', '{{pod}}'),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `ovnkube_master_leader` metric doesn't exist anymore. `ovnkube_clustermanager_leader` is the only remaining `_leader` metric that is unique to one pod, as `ovnkube_controller_leader` is 0 for all pods.

generateTimeSeriesQuery('ovnkube_clustermanager_leader > 0', '{{pod}}'),
},

ovnNorthd: {
query():
generateTimeSeriesQuery('ovn_northd_status', '{{pod}}'),
},

ovnNbdbLeader: {
query():
generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Northbound"}', '{{pod}}'),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing both as neither metric exists any longer.

},

ovnSbdbLeader: {
query():
generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Southbound"}', '{{pod}}'),
},

numOnvController: {
query():
generateTimeSeriesQuery('count(ovn_controller_monitor_all) by (namespace)', ''),
},

ovnKubeControlPlaneCPU: {
query():
generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100','{{container}}-{{pod}}-{{node}}'),
generateTimeSeriesQuery('sum( irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100 ) by (pod, node)', '{{pod}} - {{node}}'),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

label formatting to match ocp-performance dashboard

},

ovnKubeControlPlaneMem: {
query():
generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}','{{container}}-{{pod}}-{{node}}'),
generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}} - {{node}}'),
},

topOvnControllerCPU: {
query():
generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)', '{{node}}'),
generateTimeSeriesQuery('topk(10, sum( irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100) by (pod,node) )', '{{pod}} - {{node}}'),
},

topOvnControllerMem: {
query():
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}'),
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'),
},

ovnAnnotationLatency: {
Expand Down
39 changes: 28 additions & 11 deletions templates/General/ocp-performance.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,38 @@ g.dashboard.new('Openshift Performance')
variables.interval,
])
+ g.dashboard.withPanels([
// Collapsed "Cluster-at-a-Glance" row: worker panels in the left column
// (x: 0) and the matching control-plane panels in the right column (x: 12).
// Every panel is 8 units high, so each pair starts 8 units below the previous
// one (y: 1, 9, 17, ...) and no two panels overlap.
g.panel.row.new('Cluster-at-a-Glance')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
  panels.timeSeries.genericLegend('Workers CPU Usage', 'percent', queries.workersCPU.query(), { x: 0, y: 1, w: 12, h: 8 }),
  panels.timeSeries.genericLegend('Control Plane CPU Usage', 'percent', queries.controlPlanesCPU.query(), { x: 12, y: 1, w: 12, h: 8 }),
  panels.timeSeries.genericLegend('Workers Load1', 'short', queries.workersLoad1.query(), { x: 0, y: 9, w: 12, h: 8 }),
  panels.timeSeries.genericLegend('Control Plane Load1', 'short', queries.controlPlanesLoad1.query(), { x: 12, y: 9, w: 12, h: 8 }),
  // The CGroup CPU queries multiply the rate by 100, so they are shown with the
  // same 'percent' unit as the other CPU panels.
  panels.timeSeries.genericLegend('Workers CGroup CPU Rate', 'percent', queries.workersCGroupCpuRate.query(), { x: 0, y: 17, w: 12, h: 8 }),
  panels.timeSeries.genericLegend('Control Plane CGroup CPU Rate', 'percent', queries.controlPlaneCGroupCpuRate.query(), { x: 12, y: 17, w: 12, h: 8 }),
  panels.timeSeries.genericLegendCounter('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 25, w: 12, h: 8 }),
  panels.timeSeries.genericLegendCounter('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 25, w: 12, h: 8 }),
  panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 33, w: 12, h: 8 }),
  panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 33, w: 12, h: 8 }),
  panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 41, w: 12, h: 8 }),
  panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 41, w: 12, h: 8 }),
]),
g.panel.row.new('OVN')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withPanels([
panels.timeSeries.genericLegend('ovs-master CPU Usage', 'percent', queries.OVSCPU.query('$_master_node'), { x: 0, y: 21, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-master Memory Usage', 'bytes', queries.OVSMemory.query('$_master_node'), { x: 12, y: 21, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-worker CPU Usage', 'percent', queries.OVSCPU.query('$_worker_node'), { x: 0, y: 21, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-worker Memory Usage', 'bytes', queries.OVSMemory.query('$_worker_node'), { x: 12, y: 21, w: 12, h: 8 }),
panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 12 }),
panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These y values were causing these three panels to pop out of the row.

Also, the metrics seem far less frequently used than CPU and memory usage, so I also moved them to the bottom so relevant panels stay at the top.

panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 21, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 21, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 28, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 28, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 1, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 1, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 9, w: 12, h: 8 }),
panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 9, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-master CPU Usage', 'percent', queries.OVSCPU.query('$_master_node'), { x: 0, y: 17, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-master Memory Usage', 'bytes', queries.OVSMemory.query('$_master_node'), { x: 12, y: 17, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-worker CPU Usage', 'percent', queries.OVSCPU.query('$_worker_node'), { x: 0, y: 25, w: 12, h: 8 }),
panels.timeSeries.genericLegend('ovs-worker Memory Usage', 'bytes', queries.OVSMemory.query('$_worker_node'), { x: 12, y: 25, w: 12, h: 8 }),
panels.timeSeries.genericLegend('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 33, w: 8, h: 8 }),
panels.timeSeries.genericLegend('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 8, y: 41, w: 8, h: 8 }),
panels.timeSeries.genericLegend('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 16, y: 41, w: 8, h: 8 }),
]),
g.panel.row.new('Monitoring stack')
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
Expand Down
8 changes: 3 additions & 5 deletions templates/General/ovn-dashboard.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,9 @@ g.dashboard.new('OVN-Monitoring-dashboard')
+ g.panel.row.withCollapsed(true)
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+ g.panel.row.withPanels([
panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnMasterLeader.query(), { x: 0, y: 0, w: 4, h: 4 }),
panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 4, y: 0, w: 4, h: 4 }),
panels.stat.genericstatThresoldPanel('OVN NBDB leader', 'none', queries.ovnNbdbLeader.query(), { x: 8, y: 0, w: 4, h: 4 }),
panels.stat.genericstatThresoldPanel('OVN SBDB leader', 'none', queries.ovnSbdbLeader.query(), { x: 12, y: 0, w: 4, h: 4 }),
panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 4, h: 4 }),
panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnClusterManagerLeader.query(), { x: 0, y: 0, w: 8, h: 4 }),
panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 8, y: 0, w: 8, h: 4 }),
panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 8, h: 4 }),
panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 4, w: 12, h: 10 }),
panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 4, w: 12, h: 10 }),
panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 12, w: 12, h: 10 }),
Expand Down