-
Notifications
You must be signed in to change notification settings - Fork 40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Node panels #136
Node panels #136
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,8 +26,21 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonn | |
+ options.legend.withShowLegend(true) | ||
+ options.legend.withCalcs([ | ||
'mean', | ||
'min', | ||
'max', | ||
]) | ||
+ options.legend.withSortBy('Max') | ||
+ options.legend.withSortDesc(true) | ||
+ options.legend.withPlacement('bottom'), | ||
|
||
genericLegendCounter(title, unit, targets, gridPos): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. New panel type with different legend fields, more relevant for counters and memory. |
||
self.generic(title, unit, targets, gridPos) | ||
+ options.legend.withShowLegend(true) | ||
+ options.legend.withCalcs([ | ||
'first', | ||
'min', | ||
'max', | ||
'last', | ||
]) | ||
+ options.legend.withSortBy('Max') | ||
+ options.legend.withSortDesc(true) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,7 @@ local var = g.dashboard.variable; | |
|
||
{ | ||
datasource: | ||
var.datasource.new('datasource', 'prometheus') | ||
+ var.datasource.withRegex('/^Cluster Prometheus$/'), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When using this dashboard against a Prometheus instance outside of OpenShift, I have to update this variable after importing. Instead, I'm just deleting the regex for good. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Confirmed on a ROSA cluster that the dashboard variable auto-populates to 'Cluster Prometheus'. |
||
var.datasource.new('datasource', 'prometheus'), | ||
|
||
master_node: | ||
var.query.new('_master_node') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,49 +10,38 @@ local generateTimeSeriesQuery(query, legend) = [ | |
]; | ||
|
||
{ | ||
ovnMasterLeader: { | ||
ovnClusterManagerLeader: { | ||
query(): | ||
generateTimeSeriesQuery('ovnkube_master_leader', '{{pod}}'), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The metric doesn't exist. The only replacement is `ovnkube_clustermanager_leader`. |
||
generateTimeSeriesQuery('ovnkube_clustermanager_leader > 0', '{{pod}}'), | ||
}, | ||
|
||
ovnNorthd: { | ||
query(): | ||
generateTimeSeriesQuery('ovn_northd_status', '{{pod}}'), | ||
}, | ||
|
||
ovnNbdbLeader: { | ||
query(): | ||
generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Northbound"}', '{{pod}}'), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing both as neither metric exists any longer. |
||
}, | ||
|
||
ovnSbdbLeader: { | ||
query(): | ||
generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Southbound"}', '{{pod}}'), | ||
}, | ||
|
||
numOnvController: { | ||
query(): | ||
generateTimeSeriesQuery('count(ovn_controller_monitor_all) by (namespace)', ''), | ||
}, | ||
|
||
ovnKubeControlPlaneCPU: { | ||
query(): | ||
generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100','{{container}}-{{pod}}-{{node}}'), | ||
generateTimeSeriesQuery('sum( irate(container_cpu_usage_seconds_total{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100 ) by (pod, node)', '{{pod}} - {{node}}'), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Label formatting changed to match the ocp-performance dashboard. |
||
}, | ||
|
||
ovnKubeControlPlaneMem: { | ||
query(): | ||
generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}','{{container}}-{{pod}}-{{node}}'), | ||
generateTimeSeriesQuery('container_memory_rss{pod=~"(ovnkube-master|ovnkube-control-plane).+",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}} - {{node}}'), | ||
}, | ||
|
||
topOvnControllerCPU: { | ||
query(): | ||
generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)', '{{node}}'), | ||
generateTimeSeriesQuery('topk(10, sum( irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100) by (pod,node) )', '{{pod}} - {{node}}'), | ||
}, | ||
|
||
topOvnControllerMem: { | ||
query(): | ||
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}'), | ||
generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (pod,node))', '{{pod}} - {{node}}'), | ||
}, | ||
|
||
ovnAnnotationLatency: { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,21 +26,38 @@ g.dashboard.new('Openshift Performance') | |
variables.interval, | ||
]) | ||
+ g.dashboard.withPanels([ | ||
g.panel.row.new('Cluster-at-a-Glance') | ||
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) | ||
+ g.panel.row.withCollapsed(true) | ||
+ g.panel.row.withPanels([ | ||
panels.timeSeries.genericLegend('Workers CPU Usage', 'percent', queries.workersCPU.query(), { x: 0, y: 2, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Control Plane CPU Usage', 'percent', queries.controlPlanesCPU.query(), { x: 12, y: 2, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Workers Load1', 'short', queries.workersLoad1.query(), { x: 0, y: 9, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Control Plane Load1', 'short', queries.controlPlanesLoad1.query(), { x: 12, y: 9, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Workers CGroup CPU Rate', 'short', queries.workersCGroupCpuRate.query(), { x: 0, y: 17, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Control Plane CGroup CPU Rate', 'short', queries.controlPlaneCGroupCpuRate.query(), { x: 12, y: 17, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegendCounter('Workers Memory Available', 'bytes', queries.workersMemoryAvailable.query(), { x: 0, y: 25, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegendCounter('Control Plane Memory Available', 'bytes', queries.controlPlaneMemoryAvailable.query(), { x: 12, y: 25, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegendCounter('Workers Container Threads', 'short', queries.workersContainerThreads.query(), { x: 0, y: 33, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegendCounter('Control Plane Container Threads', 'short', queries.controlPlaneContainerThreads.query(), { x: 12, y: 33, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Workers Disk IOPS', 'short', queries.workersIOPS.query(), { x: 0, y: 41, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Control Plane Disk IOPS', 'short', queries.controlPlaneIOPS.query(), { x: 12, y: 41, w: 12, h: 8 }), | ||
]), | ||
g.panel.row.new('OVN') | ||
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) | ||
+ g.panel.row.withCollapsed(true) | ||
+ g.panel.row.withPanels([ | ||
panels.timeSeries.genericLegend('ovs-master CPU Usage', 'percent', queries.OVSCPU.query('$_master_node'), { x: 0, y: 21, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-master Memory Usage', 'bytes', queries.OVSMemory.query('$_master_node'), { x: 12, y: 21, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-worker CPU Usage', 'percent', queries.OVSCPU.query('$_worker_node'), { x: 0, y: 21, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-worker Memory Usage', 'bytes', queries.OVSMemory.query('$_worker_node'), { x: 12, y: 21, w: 12, h: 8 }), | ||
panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 12 }), | ||
panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, these metrics seem far less frequently used than CPU and memory usage, so I moved them to the bottom so that the more relevant panels stay at the top. |
||
panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 21, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 21, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 28, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 28, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovnkube-control-plane CPU Usage', 'percent', queries.ovnKubeControlPlaneCPU.query(), { x: 0, y: 1, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovnkube-control-plane Memory Usage', 'bytes', queries.ovnKubeControlPlaneMem.query(), { x: 12, y: 1, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 9, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 9, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-master CPU Usage', 'percent', queries.OVSCPU.query('$_master_node'), { x: 0, y: 17, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-master Memory Usage', 'bytes', queries.OVSMemory.query('$_master_node'), { x: 12, y: 17, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-worker CPU Usage', 'percent', queries.OVSCPU.query('$_worker_node'), { x: 0, y: 25, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('ovs-worker Memory Usage', 'bytes', queries.OVSMemory.query('$_worker_node'), { x: 12, y: 25, w: 12, h: 8 }), | ||
panels.timeSeries.genericLegend('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 33, w: 8, h: 8 }), | ||
panels.timeSeries.genericLegend('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 8, y: 41, w: 8, h: 8 }), | ||
panels.timeSeries.genericLegend('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 16, y: 41, w: 8, h: 8 }), | ||
]), | ||
g.panel.row.new('Monitoring stack') | ||
+ g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 }) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
New make task so binaries can still be deleted, but are not deleted every time.