Skip to content

Commit

Permalink
Adding OVN Monitoring Dashboard to grafonnet
Browse files Browse the repository at this point in the history
  • Loading branch information
smanda99 committed Jan 4, 2024
1 parent 3fb5de1 commit b286d85
Show file tree
Hide file tree
Showing 4 changed files with 316 additions and 0 deletions.
100 changes: 100 additions & 0 deletions assets/ovn-monitoring/panels.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
local g = import "github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet";
{
// Stat panel builders. Every builder takes the same four arguments:
//   title   - panel title handed to stat.new
//   unit    - value forwarded to standardOptions.withUnit
//   targets - query targets forwarded to queryOptions.withTargets
//   gridPos - object providing the x, y, w and h fields read below
stat: {
  local stat = g.panel.stat,
  local options = stat.options,

  // Common foundation shared by every stat panel in this file.
  base(title, unit, targets, gridPos):
    stat.new(title)
    + stat.datasource.withType('prometheus')
    // NOTE(review): '$Datasource' presumably resolves to the dashboard's
    // "Datasource" template variable - confirm against variables.libsonnet.
    + stat.datasource.withUid('$Datasource')
    + stat.standardOptions.withUnit(unit)
    + stat.queryOptions.withTargets(targets)
    + stat.gridPos.withX(gridPos.x)
    + stat.gridPos.withY(gridPos.y)
    + stat.gridPos.withH(gridPos.h)
    + stat.gridPos.withW(gridPos.w)
    + options.withJustifyMode('auto')
    + options.withGraphMode('area')
    + options.text.withTitleSize(12)
    + stat.standardOptions.color.withMode('thresholds')
    // A single colour mode is set here. The previous version chained
    // withColorMode('none') immediately before this call, which had no
    // effect because the later mixin always overrode it.
    + options.withColorMode('value'),

  // base() plus a reducer that keeps only the 'last' calculation.
  genericstatLegendPanel(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + stat.options.reduceOptions.withCalcs(['last']),

  // Legend panel with three threshold steps (green base, orange from 0,
  // green from 1) and text mode 'name'.
  genericstatThresoldPanel(title, unit, targets, gridPos):
    self.genericstatLegendPanel(title, unit, targets, gridPos)
    + stat.standardOptions.thresholds.withSteps([
      { color: 'green', value: null },
      { color: 'orange', value: 0 },
      { color: 'green', value: 1 },
    ])
    + options.withTextMode('name'),

  // Legend panel with a single green base threshold and text mode 'auto'.
  genericstatThresoldOVNControllerPanel(title, unit, targets, gridPos):
    self.genericstatLegendPanel(title, unit, targets, gridPos)
    + stat.standardOptions.thresholds.withSteps([
      { color: 'green', value: null },
    ])
    + options.withTextMode('auto'),
},

// Time-series panel builders. Arguments mirror the stat builders:
//   title   - panel title handed to timeSeries.new
//   unit    - value forwarded to standardOptions.withUnit
//   targets - query targets forwarded to queryOptions.withTargets
//   gridPos - object providing the x, y, w and h fields read below
timeSeries: {
  local timeSeries = g.panel.timeSeries,
  local custom = timeSeries.fieldConfig.defaults.custom,
  local options = timeSeries.options,

  // Common foundation: thin line graph, 10% fill, no stacking, no points,
  // multi-series tooltip sorted descending, legend shown at the bottom.
  base(title, unit, targets, gridPos):
    timeSeries.new(title)
    + timeSeries.queryOptions.withTargets(targets)
    + timeSeries.datasource.withType('prometheus')
    // NOTE(review): '$Datasource' presumably resolves to the dashboard's
    // "Datasource" template variable - confirm against variables.libsonnet.
    + timeSeries.datasource.withUid('$Datasource')
    + timeSeries.standardOptions.withUnit(unit)
    + timeSeries.gridPos.withX(gridPos.x)
    + timeSeries.gridPos.withY(gridPos.y)
    + timeSeries.gridPos.withH(gridPos.h)
    + timeSeries.gridPos.withW(gridPos.w)
    + custom.withDrawStyle('line')
    + custom.withLineInterpolation('linear')
    + custom.withBarAlignment(0)
    + custom.withLineWidth(1)
    + custom.withFillOpacity(10)
    + custom.withGradientMode('none')
    // Applied once; the previous version repeated this identical call
    // after withPointSize.
    + custom.withSpanNulls(false)
    + custom.withPointSize(5)
    + custom.stacking.withMode('none')
    + custom.withShowPoints('never')
    + options.tooltip.withMode('multi')
    + options.tooltip.withSort('desc')
    + options.legend.withShowLegend(true)
    + options.legend.withPlacement('bottom'),

  // base() with a table-style legend listing the mean and max calculations.
  genericTimeSeriesLegendPanel(title, unit, targets, gridPos):
    self.base(title, unit, targets, gridPos)
    + options.legend.withCalcs(['mean', 'max'])
    + options.legend.withDisplayMode('table'),
}
}
111 changes: 111 additions & 0 deletions assets/ovn-monitoring/queries.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local variables = import './variables.libsonnet';

// Builds a one-element array holding a Prometheus target for `query`.
//   query  - expression string passed to prometheus.new
//   legend - string passed to withLegendFormat
// The target's datasource argument is '$' followed by the name of the
// Datasource variable imported from variables.libsonnet.
local generateTimeSeriesQuery(query, legend) =
  local prom = g.query.prometheus;
  local datasourceRef = '$' + variables.Datasource.name;
  local target =
    prom.new(datasourceRef, query)
    + prom.withFormat('time_series')
    + prom.withIntervalFactor(2)
    + prom.withLegendFormat(legend);
  [target];

// Query catalogue for the OVN monitoring dashboard. Each field is an object
// exposing query(), which returns an array of targets built by
// generateTimeSeriesQuery.
{
  // Wraps one expression/legend pair in an object exposing query().
  local single(expr, legend) = {
    query(): generateTimeSeriesQuery(expr, legend),
  },

  // Like single(), but concatenates one target per { expr, legend } entry,
  // preserving the order of `specs`.
  local multi(specs) = {
    query(): std.flattenArrays([
      generateTimeSeriesQuery(spec.expr, spec.legend)
      for spec in specs
    ]),
  },

  // --- Leader / status stats ---
  ovnMasterLeader: single('ovnkube_master_leader', '{{pod}}'),

  ovnNorthd: single('ovn_northd_status', '{{pod}}'),

  ovnNbdbLeader: single(
    'ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Northbound"}',
    '{{pod}}'
  ),

  ovnSbdbLeader: single(
    'ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Southbound"}',
    '{{pod}}'
  ),

  numOnvController: single('count(ovn_controller_monitor_all) by (namespace)', ''),

  // --- CPU / memory usage ---
  ovnKubeMasterCPU: single(
    'irate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100',
    '{{container}}-{{pod}}-{{node}}'
  ),

  ovnKubeMasterMem: single(
    'container_memory_rss{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}',
    '{{container}}-{{pod}}-{{node}}'
  ),

  topOvnControllerCPU: single(
    'topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)',
    '{{node}}'
  ),

  topOvnControllerMem: single(
    'topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))',
    '{{node}}'
  ),

  // --- Latency ---
  ovnAnnotationLatency: single(
    'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_creation_latency_seconds_bucket[2m])) by (pod,le)) > 0',
    '{{pod}} - Pod Annotation latency'
  ),

  ovnCNIAdd: single(
    'histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[2m])) by (pod,le)) > 0',
    '{{pod}}'
  ),

  podLatency: multi([
    {
      expr: 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_lsp_created_port_binding_duration_seconds_bucket[2m])) by (pod,le))',
      legend: '{{pod}} - LSP created',
    },
    {
      expr: 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_port_binding_chassis_duration_seconds_bucket[2m])) by (pod,le))',
      legend: '{{pod}} - Port Binding',
    },
    {
      expr: 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_chassis_port_binding_up_duration_seconds_bucket[2m])) by (pod,le))',
      legend: '{{pod}} - Port Binding Up',
    },
    {
      expr: 'histogram_quantile(0.99, sum(rate(ovnkube_master_pod_first_seen_lsp_created_duration_seconds_bucket[2m])) by (pod,le))',
      legend: '{{pod}} - Pod First seen',
    },
  ]),

  synclatency: single(
    'rate(ovnkube_master_sync_service_latency_seconds_sum[2m])',
    '{{pod}} - Sync service latency'
  ),

  ovnLatencyCalculate: multi([
    {
      expr: 'histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (pod, le))',
      legend: '{{pod}} - Kind Pod',
    },
    {
      expr: 'histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (service, le))',
      legend: '{{service}} - Kind Service',
    },
  ]),

  ovnkubeNodeReadyLatency: single(
    'ovnkube_node_ready_duration_seconds{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}',
    '{{pod}}'
  ),

  // --- Work queue ---
  workQueue: single(
    'rate(ovnkube_master_workqueue_adds_total[2m])',
    '{{pod}} - Rate of handled adds'
  ),

  workQueueDepth: single(
    'ovnkube_master_workqueue_depth',
    '{{pod}} - Depth of workqueue'
  ),

  workQueueLatency: single(
    'ovnkube_master_workqueue_longest_running_processor_seconds',
    '{{pod}} - Longest processor duration'
  ),

  workQueueUnfinishedLatency: single(
    'ovnkube_master_workqueue_unfinished_work_seconds',
    '{{pod}} - Unfinished work duration'
  ),
}
46 changes: 46 additions & 0 deletions assets/ovn-monitoring/variables.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
local g = import "github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet";
local var = g.dashboard.variable;

// Dashboard template variables for the OVN monitoring dashboard.
//
// `Datasource` is a datasource-type variable restricted to 'prometheus'.
// The remaining entries are query variables; each one is bound to the
// datasource chosen through `Datasource` so its label_values() lookup runs
// against the selected Prometheus instance rather than an unspecified one.
{
  Datasource:
    var.datasource.new('Datasource', 'prometheus')
    + var.datasource.withRegex('')
    + var.query.generalOptions.withLabel('Datasource')
    + var.query.selectionOptions.withMulti(false)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.withRefresh(1),

  // Nodes reported by kube_node_role with role="master".
  _master_node:
    var.query.new('_master_node', 'label_values(kube_node_role{role="master"}, node)')
    + var.query.withDatasourceFromVariable(self.Datasource)
    // Query variables use the query-variable regex helper (previously the
    // datasource-variable helper was used here).
    + var.query.withRegex('')
    + var.query.generalOptions.withLabel('Master')
    + var.query.selectionOptions.withMulti(true)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.withRefresh(2),

  // Nodes reported by kube_node_role whose role matches "work.*".
  _worker_node:
    var.query.new('_worker_node', 'label_values(kube_node_role{role=~"work.*"}, node)')
    + var.query.withDatasourceFromVariable(self.Datasource)
    + var.query.withRegex('')
    + var.query.generalOptions.withLabel('Worker')
    + var.query.selectionOptions.withMulti(true)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.withRefresh(2),

  // Pods named ovnkube-master* in the openshift-ovn-kubernetes namespace.
  master_pod:
    var.query.new('master_pod', 'label_values({pod=~"ovnkube-master.*", namespace=~"openshift-ovn-kubernetes"}, pod)')
    + var.query.withDatasourceFromVariable(self.Datasource)
    + var.query.withRegex('')
    + var.query.generalOptions.withLabel('OVNKube-Master')
    + var.query.selectionOptions.withMulti(true)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.withRefresh(1),

  // Pods named ovnkube-node* in the openshift-ovn-kubernetes namespace.
  kubenode_pod:
    var.query.new('kubenode_pod', 'label_values({pod=~"ovnkube-node.*", namespace=~"openshift-ovn-kubernetes"}, pod)')
    + var.query.withDatasourceFromVariable(self.Datasource)
    + var.query.withRegex('')
    + var.query.generalOptions.withLabel('OVNKube-Node')
    + var.query.selectionOptions.withMulti(true)
    + var.query.selectionOptions.withIncludeAll(false)
    + var.query.withRefresh(1),
}
59 changes: 59 additions & 0 deletions templates/General/ovn-monitoring-v2.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
local panels = import '../../assets/ovn-monitoring/panels.libsonnet';
local queries = import '../../assets/ovn-monitoring/queries.libsonnet';
local variables = import '../../assets/ovn-monitoring/variables.libsonnet';
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';

// OVN monitoring dashboard: three collapsed rows (resource usage, latency,
// work queue), each holding panels built from panels.libsonnet and fed by
// queries.libsonnet.
//
// Layout notes:
//   * Each row gets its own y position (0, 1, 2); previously all three rows
//     were declared at y: 0.
//   * Inside a row, a panel's y equals the sum of the heights above it, so
//     the declared rectangles no longer overlap (the old offsets advanced by
//     8 while the time-series panels are 10 units tall).
g.dashboard.new('OVN-Monitoring-dashboard')
+ g.dashboard.time.withFrom('now-1h')
+ g.dashboard.time.withTo('now')
+ g.dashboard.withTimezone('utc')
+ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'])
+ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'])
+ g.dashboard.withRefresh('')
+ g.dashboard.withEditable(false)
+ g.dashboard.graphTooltip.withSharedCrosshair()
+ g.dashboard.withVariables([
  variables.Datasource,
  variables._master_node,
  variables._worker_node,
  variables.master_pod,
  variables.kubenode_pod,
])
+ g.dashboard.withPanels([
  g.panel.row.new('OVN Resource Monitoring')
  + g.panel.row.withCollapsed(true)
  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
  + g.panel.row.withPanels([
    // Stat strip: five 4x4 tiles side by side.
    panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnMasterLeader.query(), { x: 0, y: 0, w: 4, h: 4 }),
    panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 4, y: 0, w: 4, h: 4 }),
    panels.stat.genericstatThresoldPanel('OVN NBDB leader', 'none', queries.ovnNbdbLeader.query(), { x: 8, y: 0, w: 4, h: 4 }),
    panels.stat.genericstatThresoldPanel('OVN SBDB leader', 'none', queries.ovnSbdbLeader.query(), { x: 12, y: 0, w: 4, h: 4 }),
    panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 4, h: 4 }),
    // Two half-width graphs per line, each 10 units tall.
    panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 4, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master Memory Usage', 'bytes', queries.ovnKubeMasterMem.query(), { x: 12, y: 4, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 14, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 14, w: 12, h: 10 }),
  ]),
  g.panel.row.new('Latency Monitoring')
  + g.panel.row.withCollapsed(true)
  + g.panel.row.withGridPos({ x: 0, y: 1, w: 24, h: 1 })
  + g.panel.row.withPanels([
    panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 0, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 12, y: 0, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Pod creation Latency', 's', queries.podLatency.query(), { x: 0, y: 10, w: 24, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 20, w: 24, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('Duration for OVN to apply network configuration', 's', queries.ovnLatencyCalculate.query(), { x: 0, y: 30, w: 24, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 0, y: 40, w: 24, h: 10 }),
  ]),
  g.panel.row.new('WorkQueue Monitoring')
  + g.panel.row.withCollapsed(true)
  + g.panel.row.withGridPos({ x: 0, y: 2, w: 24, h: 1 })
  + g.panel.row.withPanels([
    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue', 'short', queries.workQueue.query(), { x: 0, y: 0, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue Depth', 'short', queries.workQueueDepth.query(), { x: 12, y: 0, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue duration', 's', queries.workQueueLatency.query(), { x: 0, y: 10, w: 12, h: 10 }),
    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue - Unfinished', 's', queries.workQueueUnfinishedLatency.query(), { x: 12, y: 10, w: 12, h: 10 }),
  ]),
])

0 comments on commit b286d85

Please sign in to comment.