diff --git a/assets/ovn-monitoring/panels.libsonnet b/assets/ovn-monitoring/panels.libsonnet
new file mode 100644
index 0000000..975a5ad
--- /dev/null
+++ b/assets/ovn-monitoring/panels.libsonnet
@@ -0,0 +1,97 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+// Panel builders for the OVN monitoring dashboard. Every builder takes
+// (title, unit, targets, gridPos), where gridPos is an object with x, y, w and
+// h fields, and returns a grafonnet panel.
+{
+  stat: {
+    local stat = g.panel.stat,
+    local options = stat.options,
+
+    // Settings shared by all stat panels. The datasource uid is the
+    // $Datasource dashboard variable.
+    base(title, unit, targets, gridPos):
+      stat.new(title)
+      + stat.datasource.withType('prometheus')
+      + stat.datasource.withUid('$Datasource')
+      + stat.standardOptions.withUnit(unit)
+      + stat.queryOptions.withTargets(targets)
+      + stat.gridPos.withX(gridPos.x)
+      + stat.gridPos.withY(gridPos.y)
+      + stat.gridPos.withH(gridPos.h)
+      + stat.gridPos.withW(gridPos.w)
+      + options.withJustifyMode('auto')
+      + options.withGraphMode('area')
+      + options.text.withTitleSize(12)
+      + stat.standardOptions.color.withMode('thresholds')
+      + options.withColorMode('value'),
+
+    // base() reduced with the 'last' calculation.
+    genericstatLegendPanel(title, unit, targets, gridPos):
+      self.base(title, unit, targets, gridPos)
+      + stat.options.reduceOptions.withCalcs([
+        'last',
+      ]),
+
+    // Legend panel in 'name' text mode: green base step, orange from 0,
+    // green from 1.
+    genericstatThresoldPanel(title, unit, targets, gridPos):
+      self.genericstatLegendPanel(title, unit, targets, gridPos)
+      + stat.standardOptions.thresholds.withSteps([
+        { color: 'green', value: null },
+        { color: 'orange', value: 0 },
+        { color: 'green', value: 1 },
+      ])
+      + options.withTextMode('name'),
+
+    // Legend panel in 'auto' text mode with a single green base step.
+    genericstatThresoldOVNControllerPanel(title, unit, targets, gridPos):
+      self.genericstatLegendPanel(title, unit, targets, gridPos)
+      + stat.standardOptions.thresholds.withSteps([
+        { color: 'green', value: null },
+      ])
+      + options.withTextMode('auto'),
+  },
+
+  timeSeries: {
+    local timeSeries = g.panel.timeSeries,
+    local custom = timeSeries.fieldConfig.defaults.custom,
+    local options = timeSeries.options,
+
+    // Settings shared by all time series panels. The datasource uid is the
+    // $Datasource dashboard variable.
+    base(title, unit, targets, gridPos):
+      timeSeries.new(title)
+      + timeSeries.queryOptions.withTargets(targets)
+      + timeSeries.datasource.withType('prometheus')
+      + timeSeries.datasource.withUid('$Datasource')
+      + timeSeries.standardOptions.withUnit(unit)
+      + timeSeries.gridPos.withX(gridPos.x)
+      + timeSeries.gridPos.withY(gridPos.y)
+      + timeSeries.gridPos.withH(gridPos.h)
+      + timeSeries.gridPos.withW(gridPos.w)
+      + custom.withDrawStyle('line')
+      + custom.withLineInterpolation('linear')
+      + custom.withBarAlignment(0)
+      + custom.withLineWidth(1)
+      + custom.withFillOpacity(10)
+      + custom.withGradientMode('none')
+      + custom.withSpanNulls(false)
+      + custom.withPointSize(5)
+      + custom.stacking.withMode('none')
+      + custom.withShowPoints('never')
+      + options.tooltip.withMode('multi')
+      + options.tooltip.withSort('desc')
+      + options.legend.withShowLegend(true)
+      + options.legend.withPlacement('bottom'),
+
+    // base() with a table legend that lists mean and max.
+    genericTimeSeriesLegendPanel(title, unit, targets, gridPos):
+      self.base(title, unit, targets, gridPos)
+      + options.legend.withCalcs([
+        'mean',
+        'max',
+      ])
+      + options.legend.withDisplayMode('table'),
+  },
+}
diff --git a/assets/ovn-monitoring/queries.libsonnet b/assets/ovn-monitoring/queries.libsonnet
new file mode 100644
index 0000000..c37cae9
--- /dev/null
+++ b/assets/ovn-monitoring/queries.libsonnet
@@ -0,0 +1,114 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local variables = import './variables.libsonnet';
+
+// Wraps one PromQL expression in a single-element array of Prometheus
+// targets, so several results can be concatenated with '+'.
+local generateTimeSeriesQuery(query, legend) = [
+  local prometheusQuery = g.query.prometheus;
+  prometheusQuery.new('$' + variables.Datasource.name, query)
+  + prometheusQuery.withFormat('time_series')
+  + prometheusQuery.withIntervalFactor(2)
+  + prometheusQuery.withLegendFormat(legend),
+];
+
+// Each field exposes query(), returning the target array for one panel.
+{
+  ovnMasterLeader: {
+    query():
+      generateTimeSeriesQuery('ovnkube_master_leader', '{{pod}}'),
+  },
+
+  ovnNorthd: {
+    query():
+      generateTimeSeriesQuery('ovn_northd_status', '{{pod}}'),
+  },
+
+  ovnNbdbLeader: {
+    query():
+      generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Northbound"}', '{{pod}}'),
+  },
+
+  ovnSbdbLeader: {
+    query():
+      generateTimeSeriesQuery('ovn_db_cluster_server_role{server_role="leader",db_name="OVN_Southbound"}', '{{pod}}'),
+  },
+
+  numOnvController: {
+    query():
+      generateTimeSeriesQuery('count(ovn_controller_monitor_all) by (namespace)', ''),
+  },
+
+  ovnKubeMasterCPU: {
+    query():
+      generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}[2m])*100', '{{container}}-{{pod}}-{{node}}'),
+  },
+
+  ovnKubeMasterMem: {
+    query():
+      generateTimeSeriesQuery('container_memory_rss{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{container}}-{{pod}}-{{node}}'),
+  },
+
+  topOvnControllerCPU: {
+    query():
+      generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)', '{{node}}'),
+  },
+
+  topOvnControllerMem: {
+    query():
+      generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}'),
+  },
+
+  ovnAnnotationLatency: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_creation_latency_seconds_bucket[2m])) by (pod,le)) > 0', '{{pod}} - Pod Annotation latency'),
+  },
+
+  ovnCNIAdd: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[2m])) by (pod,le)) > 0', '{{pod}}'),
+  },
+
+  podLatency: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_lsp_created_port_binding_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - LSP created')
+      + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_port_binding_chassis_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - Port Binding')
+      + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_port_binding_chassis_port_binding_up_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - Port Binding Up')
+      + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_first_seen_lsp_created_duration_seconds_bucket[2m])) by (pod,le))', '{{pod}} - Pod First seen'),
+  },
+
+  synclatency: {
+    query():
+      generateTimeSeriesQuery('rate(ovnkube_master_sync_service_latency_seconds_sum[2m])', '{{pod}} - Sync service latency'),
+  },
+
+  ovnLatencyCalculate: {
+    query():
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (pod, le))', '{{pod}} - Kind Pod')
+      + generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket[2m])) by (service, le))', '{{service}} - Kind Service'),
+  },
+
+  ovnkubeNodeReadyLatency: {
+    query():
+      generateTimeSeriesQuery('ovnkube_node_ready_duration_seconds{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{pod}}'),
+  },
+
+  workQueue: {
+    query():
+      generateTimeSeriesQuery('rate(ovnkube_master_workqueue_adds_total[2m])', '{{pod}} - Rate of handled adds'),
+  },
+
+  workQueueDepth: {
+    query():
+      generateTimeSeriesQuery('ovnkube_master_workqueue_depth', '{{pod}} - Depth of workqueue'),
+  },
+
+  workQueueLatency: {
+    query():
+      generateTimeSeriesQuery('ovnkube_master_workqueue_longest_running_processor_seconds', '{{pod}} - Longest processor duration'),
+  },
+
+  workQueueUnfinishedLatency: {
+    query():
+      generateTimeSeriesQuery('ovnkube_master_workqueue_unfinished_work_seconds', '{{pod}} - Unfinished work duration'),
+  },
+}
diff --git a/assets/ovn-monitoring/variables.libsonnet b/assets/ovn-monitoring/variables.libsonnet
new file mode 100644
index 0000000..249836b
--- /dev/null
+++ b/assets/ovn-monitoring/variables.libsonnet
@@ -0,0 +1,52 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local var = g.dashboard.variable;
+
+// Template variables of the OVN monitoring dashboard.
+{
+  // Prometheus datasource picker, referenced elsewhere as $Datasource.
+  Datasource:
+    var.datasource.new('Datasource', 'prometheus')
+    + var.datasource.withRegex('')
+    + var.query.generalOptions.withLabel('Datasource')
+    + var.query.selectionOptions.withMulti(false)
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.withRefresh(1),
+
+  // The query variables below are bound to the selected datasource so that
+  // their label_values() lookups follow the Datasource picker.
+  _master_node:
+    var.query.new('_master_node', 'label_values(kube_node_role{role="master"}, node)')
+    + var.query.withDatasourceFromVariable(self.Datasource)
+    + var.query.withRegex('')
+    + var.query.generalOptions.withLabel('Master')
+    + var.query.selectionOptions.withMulti(true)
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.withRefresh(2),
+
+  _worker_node:
+    var.query.new('_worker_node', 'label_values(kube_node_role{role=~"work.*"}, node)')
+    + var.query.withDatasourceFromVariable(self.Datasource)
+    + var.query.withRegex('')
+    + var.query.generalOptions.withLabel('Worker')
+    + var.query.selectionOptions.withMulti(true)
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.withRefresh(2),
+
+  master_pod:
+    var.query.new('master_pod', 'label_values({pod=~"ovnkube-master.*", namespace=~"openshift-ovn-kubernetes"}, pod)')
+    + var.query.withDatasourceFromVariable(self.Datasource)
+    + var.query.withRegex('')
+    + var.query.generalOptions.withLabel('OVNKube-Master')
+    + var.query.selectionOptions.withMulti(true)
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.withRefresh(1),
+
+  kubenode_pod:
+    var.query.new('kubenode_pod', 'label_values({pod=~"ovnkube-node.*", namespace=~"openshift-ovn-kubernetes"}, pod)')
+    + var.query.withDatasourceFromVariable(self.Datasource)
+    + var.query.withRegex('')
+    + var.query.generalOptions.withLabel('OVNKube-Node')
+    + var.query.selectionOptions.withMulti(true)
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.withRefresh(1),
+}
diff --git a/templates/General/ovn-monitoring-v2.jsonnet b/templates/General/ovn-monitoring-v2.jsonnet
new file mode 100644
index 0000000..081e169
--- /dev/null
+++ b/templates/General/ovn-monitoring-v2.jsonnet
@@ -0,0 +1,58 @@
+local panels = import '../../assets/ovn-monitoring/panels.libsonnet';
+local queries = import '../../assets/ovn-monitoring/queries.libsonnet';
+local variables = import '../../assets/ovn-monitoring/variables.libsonnet';
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+// OVN monitoring dashboard: three collapsed rows of stat and time series panels.
+g.dashboard.new('OVN-Monitoring-dashboard')
++ g.dashboard.time.withFrom('now-1h')
++ g.dashboard.time.withTo('now')
++ g.dashboard.withTimezone('utc')
++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'])
++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'])
++ g.dashboard.withRefresh('')
++ g.dashboard.withEditable(false)
++ g.dashboard.graphTooltip.withSharedCrosshair()
++ g.dashboard.withVariables([
+  variables.Datasource,
+  variables._master_node,
+  variables._worker_node,
+  variables.master_pod,
+  variables.kubenode_pod,
+])
++ g.dashboard.withPanels([
+  g.panel.row.new('OVN Resource Monitoring')
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withPanels([
+    panels.stat.genericstatThresoldPanel('OVNKube Master', 'none', queries.ovnMasterLeader.query(), { x: 0, y: 0, w: 4, h: 4 }),
+    panels.stat.genericstatThresoldPanel('OVN Northd Status', 'none', queries.ovnNorthd.query(), { x: 4, y: 0, w: 4, h: 4 }),
+    panels.stat.genericstatThresoldPanel('OVN NBDB leader', 'none', queries.ovnNbdbLeader.query(), { x: 8, y: 0, w: 4, h: 4 }),
+    panels.stat.genericstatThresoldPanel('OVN SBDB leader', 'none', queries.ovnSbdbLeader.query(), { x: 12, y: 0, w: 4, h: 4 }),
+    panels.stat.genericstatThresoldOVNControllerPanel('OVN controller', 'none', queries.numOnvController.query(), { x: 16, y: 0, w: 4, h: 4 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 4, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('ovnkube-master Memory Usage', 'bytes', queries.ovnKubeMasterMem.query(), { x: 12, y: 4, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 14, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 14, w: 12, h: 10 }),
+  ]),
+  g.panel.row.new('Latency Monitoring')
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericTimeSeriesLegendPanel('Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 0, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 12, y: 0, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('Pod creation Latency', 's', queries.podLatency.query(), { x: 0, y: 10, w: 24, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('Sync Service Latency', 's', queries.synclatency.query(), { x: 0, y: 20, w: 24, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('Duration for OVN to apply network configuration', 's', queries.ovnLatencyCalculate.query(), { x: 0, y: 30, w: 24, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Node Ready Latency', 's', queries.ovnkubeNodeReadyLatency.query(), { x: 0, y: 40, w: 24, h: 10 }),
+  ]),
+  g.panel.row.new('WorkQueue Monitoring')
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue', 'short', queries.workQueue.query(), { x: 0, y: 0, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue Depth', 'short', queries.workQueueDepth.query(), { x: 12, y: 0, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue duration', 's', queries.workQueueLatency.query(), { x: 0, y: 10, w: 12, h: 10 }),
+    panels.timeSeries.genericTimeSeriesLegendPanel('OVNKube Master workqueue - Unfinished', 's', queries.workQueueUnfinishedLatency.query(), { x: 12, y: 10, w: 12, h: 10 }),
+  ]),
+])