diff --git a/.gitignore b/.gitignore
index 82eb612..03d0198 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 templates/grafonnet-lib
+templates/vendor
 rendered
 tmp
 bin
diff --git a/Makefile b/Makefile
index 71dcc65..5a52572 100644
--- a/Makefile
+++ b/Makefile
@@ -63,4 +63,4 @@ build-syncer-image: build
 	podman build --platform=${PLATFORM} -f Dockerfile --manifest=${SYNCER_IMG_TAG} .
 
 push-syncer-image:
-	podman manifest push ${SYNCER_IMG_TAG} ${SYNCER_IMG_TAG}
+	podman manifest push ${SYNCER_IMG_TAG} ${SYNCER_IMG_TAG}
\ No newline at end of file
diff --git a/assets/ocp-performance/panels.libsonnet b/assets/ocp-performance/panels.libsonnet
new file mode 100644
index 0000000..8d192c9
--- /dev/null
+++ b/assets/ocp-performance/panels.libsonnet
@@ -0,0 +1,52 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+{
+  timeSeries: {  // builders for time-series panels
+    local timeSeries = g.panel.timeSeries,
+    local fieldOverride = g.panel.timeSeries.fieldOverride,  // NOTE(review): not referenced anywhere in this object
+    local custom = timeSeries.fieldConfig.defaults.custom,
+    local options = timeSeries.options,
+
+    generic(title, unit, targets, gridPos):  // gridPos is an object with x, y, w, h fields
+      timeSeries.new(title)
+      + timeSeries.queryOptions.withTargets(targets)
+      + timeSeries.datasource.withUid('$datasource')  // datasource resolved from the dashboard variable
+      + timeSeries.standardOptions.withUnit(unit)
+      + timeSeries.gridPos.withX(gridPos.x)
+      + timeSeries.gridPos.withY(gridPos.y)
+      + timeSeries.gridPos.withH(gridPos.h)
+      + timeSeries.gridPos.withW(gridPos.w)
+      + custom.withSpanNulls(false)  // boolean, not the string 'false': do not connect across gaps
+      + options.tooltip.withMode('multi')
+      + options.tooltip.withSort('desc')
+      + options.legend.withDisplayMode('table'),
+
+    genericLegend(title, unit, targets, gridPos):  // generic() plus a bottom legend with mean/max/min
+      self.generic(title, unit, targets, gridPos)
+      + options.legend.withShowLegend(true)
+      + options.legend.withCalcs([
+        'mean',
+        'max',
+        'min',
+      ])
+      + options.legend.withSortBy('max')
+      + options.legend.withSortDesc(true)
+      + options.legend.withPlacement('bottom'),
+  },
+  stat: {  // builders for stat panels
+    local stat = g.panel.stat,
+    local options = stat.options,
+    base(title, targets, gridPos):  // stat panel that reduces each series to its last value
+      stat.new(title)
+      + stat.datasource.withUid('$datasource')
+      + stat.queryOptions.withTargets(targets)
+      + stat.gridPos.withX(gridPos.x)
+      + stat.gridPos.withY(gridPos.y)
+      + stat.gridPos.withH(gridPos.h)
+      + stat.gridPos.withW(gridPos.w)
+      + options.reduceOptions.withCalcs([
+        'last',
+      ]),
+  },
+}
\ No newline at end of file
diff --git a/assets/ocp-performance/queries.libsonnet b/assets/ocp-performance/queries.libsonnet
new file mode 100644
index 0000000..fee6ca6
--- /dev/null
+++ b/assets/ocp-performance/queries.libsonnet
@@ -0,0 +1,205 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local variables = import './variables.libsonnet';
+
+local generateTimeSeriesQuery(query, legend) = [  // returns a one-element array so callers can concatenate targets with +
+  local prometheusQuery = g.query.prometheus;
+  prometheusQuery.new(
+    '$' + variables.datasource.name,  // reference the datasource dashboard variable by name
+    query
+  )
+  + prometheusQuery.withFormat('time_series')
+  + prometheusQuery.withIntervalFactor(2)
+  + prometheusQuery.withLegendFormat(legend),
+];
+
+{
+  nodeMemory: {
+    query(nodeName):  // nodeName is spliced into an instance=~ regex matcher
+      generateTimeSeriesQuery('node_memory_Active_bytes{instance=~"' + nodeName + '"}', 'Active')
+      + generateTimeSeriesQuery('node_memory_MemTotal_bytes{instance=~"' + nodeName + '"}', 'Total')
+      + generateTimeSeriesQuery('node_memory_Cached_bytes{instance=~"' + nodeName + '"} + node_memory_Buffers_bytes{instance=~"' + nodeName + '"}', 'Cached + Buffers')
+      + generateTimeSeriesQuery('node_memory_MemAvailable_bytes{instance=~"' + nodeName + '"}', 'Available')
+      + generateTimeSeriesQuery('(node_memory_MemTotal_bytes{instance=~"' + nodeName + '"} - (node_memory_MemFree_bytes{instance=~"' + nodeName + '"} + node_memory_Buffers_bytes{instance=~"' + nodeName + '"} + node_memory_Cached_bytes{instance=~"' + nodeName + '"}))', 'Used')
+  },
+  nodeCPU: {
+    query(nodeName):  // per-mode CPU rate, scaled by 100
+      generateTimeSeriesQuery('sum by (instance, mode)(irate(node_cpu_seconds_total{instance=~"' + nodeName + '",job=~".*"}[$interval])) * 100', 'Busy {{mode}}')
+  },
+  diskThroughput: {
+    query(nodeName):  // read and write byte rates, filtered by $block_device
+      generateTimeSeriesQuery('rate(node_disk_read_bytes_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', '{{ device }} - read')
+      + generateTimeSeriesQuery('rate(node_disk_written_bytes_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', '{{ device }} - write')
+  },
+  diskIOPS: {
+    query(nodeName):  // completed read and write operation rates
+      generateTimeSeriesQuery('rate(node_disk_reads_completed_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', '{{ device }} - read')
+      + generateTimeSeriesQuery('rate(node_disk_writes_completed_total{device=~"$block_device",instance=~"' + nodeName + '"}[$interval])', '{{ device }} - write')
+  },
+  networkUtilization: {
+    query(nodeName):  // byte rates multiplied by 8
+      generateTimeSeriesQuery('rate(node_network_receive_bytes_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', '{{instance}} - {{device}} - RX')
+      + generateTimeSeriesQuery('rate(node_network_transmit_bytes_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval]) * 8', '{{instance}} - {{device}} - TX')
+  },
+  networkPackets: {
+    query(nodeName):  // packet rates, filtered by $net_device
+      generateTimeSeriesQuery('rate(node_network_receive_packets_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval])', '{{instance}} - {{device}} - RX')
+      + generateTimeSeriesQuery('rate(node_network_transmit_packets_total{instance=~"' + nodeName + '",device=~"$net_device"}[$interval])', '{{instance}} - {{device}} - TX')
+  },
+  networkDrop: {
+    query(nodeName):  // top 10 drop rates; no $net_device filter here
+      generateTimeSeriesQuery('topk(10, rate(node_network_receive_drop_total{instance=~"' + nodeName + '"}[$interval]))', 'rx-drop-{{ device }}')
+      + generateTimeSeriesQuery('topk(10,rate(node_network_transmit_drop_total{instance=~"' + nodeName + '"}[$interval]))', 'tx-drop-{{ device }}')
+  },
+  conntrackStats: {
+    query(nodeName):  // current entries; a second target is concatenated after this one
+      generateTimeSeriesQuery('node_nf_conntrack_entries{instance=~"' + nodeName + '"}', 'conntrack_entries')
+      +
generateTimeSeriesQuery('node_nf_conntrack_entries_limit{instance=~"' + nodeName + '"}', 'conntrack_limit')
+  },
+  top10ContainerCPU: {
+    query(nodeName):  // top 10 by CPU on the node, filtered by $namespace
+      generateTimeSeriesQuery('topk(10, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",node=~"' + nodeName + '",namespace!="",namespace=~"$namespace"}[$interval])) by (pod,container,namespace,name,service) * 100)', '{{ pod }}: {{ container }}')
+  },
+  top10ContainerRSS: {
+    query(nodeName):  // top 10 by RSS on the node, filtered by $namespace
+      generateTimeSeriesQuery('topk(10, container_memory_rss{container!="POD",name!="",node=~"' + nodeName + '",namespace!="",namespace=~"$namespace"})', '{{ pod }}: {{ container }}')
+  },
+  containerWriteBytes: {
+    query(nodeName):  // fs write rate per device and container; devices matching .+dm.+ are excluded
+      generateTimeSeriesQuery('sum(rate(container_fs_writes_bytes_total{device!~".+dm.+", node=~"' + nodeName + '", container!=""}[$interval])) by (device, container)', '{{ container }}: {{ device }}')
+  },
+  stackroxCPU: {
+    query():  // top 25, namespace fixed to stackrox
+      generateTimeSeriesQuery('topk(25, sum(irate(container_cpu_usage_seconds_total{container!="POD",name!="",namespace!="",namespace=~"stackrox"}[$interval])) by (pod,container,namespace,name,service) * 100)', '{{ pod }}: {{ container }}')
+  },
+  stackroxMem: {
+    query():  // top 25, namespace fixed to stackrox
+      generateTimeSeriesQuery('topk(25, container_memory_rss{container!="POD",name!="",namespace!="",namespace=~"stackrox"})', '{{ pod }}: {{ container }}')
+  },
+  ovnAnnotationLatency: {
+    query():  // 0.99 quantile per pod; zero-valued series are filtered out by "> 0"
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_master_pod_creation_latency_seconds_bucket[$interval])) by (pod,le)) > 0', '{{ pod }}')
+  },
+  ovnCNIAdd: {
+    query():  // 0.99 quantile of CNI requests with command="ADD"
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="ADD"}[$interval])) by (pod,le)) > 0', '{{ pod }}')
+  },
+  ovnCNIDel: {
+    query():  // 0.99 quantile of CNI requests with command="DEL"
+      generateTimeSeriesQuery('histogram_quantile(0.99, sum(rate(ovnkube_node_cni_request_duration_seconds_bucket{command="DEL"}[$interval])) by (pod,le)) > 0', '{{ pod }}')
+  },
+  ovnKubeMasterCPU: {
+    query():
+
generateTimeSeriesQuery('irate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}[$interval])*100', '{{container}}-{{pod}}-{{node}}')
+  },
+  ovnKubeMasterMem: {
+    query():  // NOTE(review): pod regex here is "ovnkube-master-.*" while the CPU query uses "ovnkube-master.*"
+      generateTimeSeriesQuery('container_memory_rss{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container!~"POD|"}', '{{container}}-{{pod}}-{{node}}')
+  },
+  topOvnControllerCPU: {
+    query():  // top 10 ovn-controller containers by CPU
+      generateTimeSeriesQuery('topk(10, irate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[$interval])*100)', '{{node}}')
+  },
+  topOvnControllerMem: {
+    query():  // top 10 nodes by summed ovn-controller RSS
+      generateTimeSeriesQuery('topk(10, sum(container_memory_rss{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))', '{{node}}')
+  },
+  promReplMemUsage: {
+    query():  // one target per hard-coded pod name: prometheus-k8s-1 and prometheus-k8s-0
+      generateTimeSeriesQuery('sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)', '{{pod}}')
+      + generateTimeSeriesQuery('sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)', '{{pod}}')
+  },
+  kubeletCPU: {
+    query():  // job="kubelet"
+      generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[$interval])*100)', 'kubelet - {{node}}')
+  },
+  crioCPU: {
+    query():  // same service label as kubeletCPU, job="crio"
+      generateTimeSeriesQuery('topk(10,irate(process_cpu_seconds_total{service="kubelet",job="crio"}[$interval])*100)', 'crio - {{node}}')
+  },
+  kubeletMemory: {
+    query():  // job="kubelet"
+      generateTimeSeriesQuery('topk(10,process_resident_memory_bytes{service="kubelet",job="kubelet"})', 'kubelet - {{node}}')
+  },
+  crioMemory: {
+    query():  // job="crio"
+      generateTimeSeriesQuery('topk(10,process_resident_memory_bytes{service="kubelet",job="crio"})', 'crio - {{node}}')
+  },
+  crioINodes: {
+    query():
+      generateTimeSeriesQuery('(1 - node_filesystem_files_free{fstype!="",mountpoint="/run"} / node_filesystem_files{fstype!="",mountpoint="/run"}) *
100', '/var/run - {{instance}}')
+  },
+  currentNodeCount: {
+    query():  // total node count plus one series per true node condition
+      generateTimeSeriesQuery('sum(kube_node_info{})', 'Number of nodes')
+      + generateTimeSeriesQuery('sum(kube_node_status_condition{status="true"}) by (condition) > 0', 'Node: {{ condition }}')
+  },
+  currentNamespaceCount: {
+    query():  // namespaces grouped by phase
+      generateTimeSeriesQuery('sum(kube_namespace_status_phase) by (phase)', '{{ phase }}')
+  },
+  currentPodCount: {
+    query():  // pods grouped by phase, empty phases dropped by "> 0"
+      generateTimeSeriesQuery('sum(kube_pod_status_phase{}) by (phase) > 0', '{{ phase}} Pods')
+  },
+  nsCount: {
+    query():  // same expression as currentNamespaceCount with a "> 0" filter
+      generateTimeSeriesQuery('sum(kube_namespace_status_phase) by (phase) > 0', '{{ phase }} namespaces')
+  },
+  podCount: {
+    query():  // same expression as currentPodCount without the "> 0" filter
+      generateTimeSeriesQuery('sum(kube_pod_status_phase{}) by (phase)', '{{phase}} pods')
+  },
+  secretCmCount: {
+    query():  // two targets: secrets and configmaps
+      generateTimeSeriesQuery('count(kube_secret_info{})', 'secrets')
+      + generateTimeSeriesQuery('count(kube_configmap_info{})', 'Configmaps')
+  },
+  deployCount: {
+    query():
+      generateTimeSeriesQuery('count(kube_deployment_labels{})', 'Deployments')
+  },
+  servicesCount: {
+    query():
+      generateTimeSeriesQuery('count(kube_service_info{})', 'Services')
+  },
+  routesCount: {
+    query():
+      generateTimeSeriesQuery('count(openshift_route_info{})', 'Routes')
+  },
+  alerts: {
+    query():  // top 10 alerts, severity "none" excluded
+      generateTimeSeriesQuery('topk(10,sum(ALERTS{severity!="none"}) by (alertname, severity))', '{{severity}}: {{alertname}}')
+  },
+  podDistribution: {
+    query():  // pod count per node
+      generateTimeSeriesQuery('count(kube_pod_info{}) by (node)', '{{ node }}')
+  },
+  top10ContMem: {
+    query():  // cluster-wide top 10 by RSS
+      generateTimeSeriesQuery('topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})', '{{ namespace }} - {{ name }}')
+  },
+  contMemRSSSystemSlice: {
+    query():  // RSS of id="/system.slice", summed per node
+      generateTimeSeriesQuery('sum by (node)(container_memory_rss{id="/system.slice"})', 'system.slice - {{ node }}')
+  },
+  top10ContCPU: {
+    query():
+
generateTimeSeriesQuery('topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[$interval])*100)', '{{ namespace }} - {{ name }}')
+  },
+  goroutinesCount: {
+    query():  // top 10 job/instance pairs by goroutine count
+      generateTimeSeriesQuery('topk(10, sum(go_goroutines{}) by (job,instance))', '{{ job }} - {{ instance }}')
+  },
+  clusterOperatorsOverview: {
+    query():  // operator conditions summed per condition
+      generateTimeSeriesQuery('sum by (condition)(cluster_operator_conditions{condition!=""})', '{{ condition }}')
+  },
+  clusterOperatorsInformation: {
+    query():  // every condition series that has a name and a reason
+      generateTimeSeriesQuery('cluster_operator_conditions{name!="",reason!=""}', '{{name}} - {{reason}}')
+  },
+  clusterOperatorsDegraded: {
+    query():  // as above, restricted to condition="Degraded"
+      generateTimeSeriesQuery('cluster_operator_conditions{condition="Degraded",name!="",reason!=""}', '{{name}} - {{reason}}')
+  },
+}
\ No newline at end of file
diff --git a/assets/ocp-performance/variables.libsonnet b/assets/ocp-performance/variables.libsonnet
new file mode 100644
index 0000000..e067d67
--- /dev/null
+++ b/assets/ocp-performance/variables.libsonnet
@@ -0,0 +1,87 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local var = g.dashboard.variable;
+
+{
+  datasource:  // prometheus datasource picker, filtered by the regex below
+    var.datasource.new('datasource', 'prometheus')
+    + var.datasource.withRegex('/^Cluster Prometheus$/'),
+
+  master_node:  // variable name is '_master_node'
+    var.query.new('_master_node')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'node',
+      'kube_node_role{role="master"}',
+    )
+    + var.query.withRefresh(2)
+    + var.query.selectionOptions.withMulti()
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.generalOptions.withLabel('Master'),
+
+  worker_node:  // variable name is '_worker_node'; role matched by regex
+    var.query.new('_worker_node')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'node',
+      'kube_node_role{role=~"work.*"}',
+    )
+    + var.query.withRefresh(2)
+    + var.query.selectionOptions.withMulti()
+    + var.query.selectionOptions.withIncludeAll(false)
+    +
var.query.generalOptions.withLabel('Worker'),
+
+  infra_node:  // variable name is '_infra_node'
+    var.query.new('_infra_node')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'node',
+      'kube_node_role{role="infra"}',
+    )
+    + var.query.withRefresh(2)
+    + var.query.selectionOptions.withMulti()
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.generalOptions.withLabel('Infra'),
+
+  namespace:  // single-select with an "All" option
+    var.query.new('namespace')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'namespace',
+      'kube_pod_info{namespace!~"(cluster-density.*|node-density-.*)"}',  // !~ is the negative regex matcher; != would compare against the pattern text literally
+    )
+    + var.query.withRefresh(2)
+    + var.query.withRegex('')
+    + var.query.selectionOptions.withMulti(false)
+    + var.query.selectionOptions.withIncludeAll(true)
+    + var.query.generalOptions.withLabel('Namespace'),
+
+  block_device:  // multi-select; values are filtered by the regex below
+    var.query.new('block_device')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'device',
+      'node_disk_written_bytes_total',
+    )
+    + var.query.withRefresh(2)
+    + var.query.withRegex('/^(?:(?!dm|rb).)*$/')
+    + var.query.selectionOptions.withMulti(true)
+    + var.query.selectionOptions.withIncludeAll(true)
+    + var.query.generalOptions.withLabel('Block device'),
+
+  net_device:  // multi-select; only values starting with br, en or et pass the regex
+    var.query.new('net_device')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'device',
+      'node_network_receive_bytes_total',
+    )
+    + var.query.withRefresh(2)
+    + var.query.withRegex('/^((br|en|et).*)$/')
+    + var.query.selectionOptions.withMulti(true)
+    + var.query.selectionOptions.withIncludeAll(true)
+    + var.query.generalOptions.withLabel('Network device'),
+
+  interval:  // referenced as [$interval] in the PromQL range selectors
+    var.interval.new('interval', ['2m','3m','4m','5m'],)
+    + var.interval.generalOptions.withLabel('interval'),
+}
\ No newline at end of file
diff --git a/templates/General/ocp-performance-v2.jsonnet b/templates/General/ocp-performance-v2.jsonnet
new file mode
100644
index 0000000..dbe421c
--- /dev/null
+++ b/templates/General/ocp-performance-v2.jsonnet
@@ -0,0 +1,142 @@
+local panels = import '../../assets/ocp-performance/panels.libsonnet';
+local queries = import '../../assets/ocp-performance/queries.libsonnet';
+local variables = import '../../assets/ocp-performance/variables.libsonnet';
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+g.dashboard.new('Openshift Performance')
++ g.dashboard.withDescription(|||
+  Performance dashboard for Red Hat Openshift
+|||)
++ g.dashboard.time.withFrom('now-1h')  // default window: last hour
++ g.dashboard.time.withTo('now')
++ g.dashboard.withTimezone('utc')
++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'])
++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'])
++ g.dashboard.withRefresh('30s')
++ g.dashboard.withEditable(true)
++ g.dashboard.graphTooltip.withSharedCrosshair()
++ g.dashboard.withVariables([  // all eight variables defined in variables.libsonnet
+  variables.datasource,
+  variables.master_node,
+  variables.worker_node,
+  variables.infra_node,
+  variables.namespace,
+  variables.block_device,
+  variables.net_device,
+  variables.interval,
+])
++ g.dashboard.withPanels([  // every row is created collapsed and carries its own panels
+  g.panel.row.new('OVN')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.generic('99% Pod Annotation Latency', 's', queries.ovnAnnotationLatency.query(), { x: 0, y: 1, w: 24, h: 12 }),
+    panels.timeSeries.generic('99% CNI Request ADD Latency', 's', queries.ovnCNIAdd.query(), { x: 0, y: 13, w: 12, h: 8 }),
+    panels.timeSeries.generic('99% CNI Request DEL Latency', 's', queries.ovnCNIDel.query(), { x: 12, y: 13, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovnkube-master CPU Usage', 'percent', queries.ovnKubeMasterCPU.query(), { x: 0, y: 21, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('ovnkube-master Memory Usage', 'bytes',
queries.ovnKubeMasterMem.query(), { x: 12, y: 21, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 ovn-controller CPU Usage', 'percent', queries.topOvnControllerCPU.query(), { x: 0, y: 29, w: 12, h: 8 }),  // y = 21 + 8, so it no longer overlaps the panels placed at y: 21
+    panels.timeSeries.genericLegend('Top 10 ovn-controller Memory Usage', 'bytes', queries.topOvnControllerMem.query(), { x: 12, y: 29, w: 12, h: 8 }),
+  ]),
+  g.panel.row.new('Monitoring stack')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('Prometheus Replica Memory usage', 'bytes', queries.promReplMemUsage.query(), { x: 0, y: 2, w: 24, h: 12 }),
+  ]),
+  g.panel.row.new('Stackrox')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('Top 25 stackrox container RSS bytes', 'bytes', queries.stackroxMem.query(), { x: 0, y: 2, w: 24, h: 12 }),
+    panels.timeSeries.genericLegend('Top 25 stackrox container CPU percent', 'percent', queries.stackroxCPU.query(), { x: 0, y: 14, w: 24, h: 12 }),  // y = 2 + 12; both panels previously shared the same gridPos
+  ]),
+  g.panel.row.new('Cluster Kubelet')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('Top 10 Kubelet CPU usage', 'percent', queries.kubeletCPU.query(), { x: 0, y: 3, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 crio CPU usage', 'percent', queries.crioCPU.query(), { x: 12, y: 3, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 Kubelet memory usage', 'bytes', queries.kubeletMemory.query(), { x: 0, y: 11, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 crio memory usage', 'bytes', queries.crioMemory.query(), { x: 12, y: 11, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('inodes usage in /var/run', 'percent', queries.crioINodes.query(), { x: 0, y: 19, w: 24, h: 8 }),
+  ]),
+  g.panel.row.new('Cluster Details')
+  +
g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.stat.base('Current Node Count', queries.currentNodeCount.query(), { x: 0, y: 4, w: 8, h: 3 }),
+    panels.stat.base('Current Namespace Count', queries.currentNamespaceCount.query(), { x: 8, y: 4, w: 8, h: 3 }),
+    panels.stat.base('Current Pod Count', queries.currentPodCount.query(), { x: 16, y: 4, w: 8, h: 3 }),
+    panels.timeSeries.generic('Number of nodes', 'none', queries.currentNodeCount.query(), { x: 0, y: 12, w: 8, h: 8 }),
+    panels.timeSeries.generic('Namespace count', 'none', queries.nsCount.query(), { x: 8, y: 12, w: 8, h: 8 }),
+    panels.timeSeries.generic('Pod count', 'none', queries.podCount.query(), { x: 16, y: 12, w: 8, h: 8 }),
+    panels.timeSeries.generic('Secret & configmap count', 'none', queries.secretCmCount.query(), { x: 0, y: 20, w: 8, h: 8 }),
+    panels.timeSeries.generic('Deployment count', 'none', queries.deployCount.query(), { x: 8, y: 20, w: 8, h: 8 }),
+    panels.timeSeries.generic('Services count', 'none', queries.servicesCount.query(), { x: 16, y: 20, w: 8, h: 8 }),
+    panels.timeSeries.generic('Routes count', 'none', queries.routesCount.query(), { x: 0, y: 28, w: 8, h: 8 }),  // y = 20 + 8: this trio previously reused the y: 20 slots of the three panels above
+    panels.timeSeries.generic('Alerts', 'none', queries.alerts.query(), { x: 8, y: 28, w: 8, h: 8 }),
+    panels.timeSeries.genericLegend('Pod Distribution', 'none', queries.podDistribution.query(), { x: 16, y: 28, w: 8, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 container RSS', 'bytes', queries.top10ContMem.query(), { x: 0, y: 36, w: 12, h: 8 }),  // half width so it sits beside the system.slice panel instead of covering it
+    panels.timeSeries.genericLegend('container RSS system.slice', 'bytes', queries.contMemRSSSystemSlice.query(), { x: 12, y: 36, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 container CPU', 'percent', queries.top10ContCPU.query(), { x: 0, y: 44, w: 12, h: 8 }),
+    panels.timeSeries.generic('Goroutines count', 'none', queries.goroutinesCount.query(), { x: 12, y: 44, w: 12, h: 8 }),
+  ]),
+
g.panel.row.new('Cluster Operators Details')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withPanels([
+    panels.stat.base('Cluster operators overview', queries.clusterOperatorsOverview.query(), { x: 0, y: 4, w: 24, h: 3 }),
+    panels.timeSeries.genericLegend('Cluster operators information', 'none', queries.clusterOperatorsInformation.query(), { x: 0, y: 7, w: 8, h: 8 }),  // y = 4 + 3, directly below the full-width stat panel
+    panels.timeSeries.genericLegend('Cluster operators degraded', 'none', queries.clusterOperatorsDegraded.query(), { x: 8, y: 7, w: 8, h: 8 }),
+  ]),
+  g.panel.row.new('Master: $_master_node')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })  // same row geometry as the other rows in this dashboard (was w: 0, h: 8)
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withRepeat('_master_node')  // row is repeated over the _master_node variable
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('CPU Basic: $_master_node', 'percent', queries.nodeCPU.query('$_master_node'), { x: 0, y: 0, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('System Memory: $_master_node', 'bytes', queries.nodeMemory.query('$_master_node'), { x: 12, y: 0, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Disk throughput: $_master_node', 'Bps', queries.diskThroughput.query('$_master_node'), { x: 0, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Disk IOPS: $_master_node', 'iops', queries.diskIOPS.query('$_master_node'), { x: 12, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network Utilization: $_master_node', 'bps', queries.networkUtilization.query('$_master_node'), { x: 0, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network Packets: $_master_node', 'pps', queries.networkPackets.query('$_master_node'), { x: 12, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network packets drop: $_master_node', 'pps', queries.networkDrop.query('$_master_node'), { x: 0, y: 24, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Conntrack stats: $_master_node', '', queries.conntrackStats.query('$_master_node'), { x: 12, y: 24, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 container CPU: $_master_node', 'percent', queries.top10ContainerCPU.query('$_master_node'), { x: 0, y: 32, w: 12, h: 8 }),  // y: 24 is already used by the network-drop and conntrack panels
+    panels.timeSeries.genericLegend('Top 10 container RSS: $_master_node', 'bytes', queries.top10ContainerRSS.query('$_master_node'), { x: 12, y: 32, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Container fs write rate: $_master_node', 'Bps', queries.containerWriteBytes.query('$_master_node'), { x: 0, y: 40, w: 12, h: 8 }),  // moved down by 8 to follow the Top 10 panels
+  ]),
+  g.panel.row.new('Worker: $_worker_node')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })  // same row geometry as the other rows in this dashboard (was w: 0, h: 8)
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withRepeat('_worker_node')  // row is repeated over the _worker_node variable
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('CPU Basic: $_worker_node', 'percent', queries.nodeCPU.query('$_worker_node'), { x: 0, y: 0, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('System Memory: $_worker_node', 'bytes', queries.nodeMemory.query('$_worker_node'), { x: 12, y: 0, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Disk throughput: $_worker_node', 'Bps', queries.diskThroughput.query('$_worker_node'), { x: 0, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Disk IOPS: $_worker_node', 'iops', queries.diskIOPS.query('$_worker_node'), { x: 12, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network Utilization: $_worker_node', 'bps', queries.networkUtilization.query('$_worker_node'), { x: 0, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network Packets: $_worker_node', 'pps', queries.networkPackets.query('$_worker_node'), { x: 12, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network packets drop: $_worker_node', 'pps', queries.networkDrop.query('$_worker_node'), { x: 0, y: 24, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Conntrack stats: $_worker_node', '', queries.conntrackStats.query('$_worker_node'), { x: 12, y: 24, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 container CPU: $_worker_node', 'percent',
queries.top10ContainerCPU.query('$_worker_node'), { x: 0, y: 32, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 container RSS: $_worker_node', 'bytes', queries.top10ContainerRSS.query('$_worker_node'), { x: 12, y: 32, w: 12, h: 8 }),  // y: 32 for both Top 10 panels; y: 24 is taken by the network-drop and conntrack panels
+  ]),
+  g.panel.row.new('Infra: $_infra_node')
+  + g.panel.row.withGridPos({ x: 0, y: 0, w: 24, h: 1 })  // same row geometry as the other rows in this dashboard (was w: 0, h: 8)
+  + g.panel.row.withCollapsed(true)
+  + g.panel.row.withRepeat('_infra_node')  // row is repeated over the _infra_node variable
+  + g.panel.row.withPanels([
+    panels.timeSeries.genericLegend('CPU Basic: $_infra_node', 'percent', queries.nodeCPU.query('$_infra_node'), { x: 0, y: 0, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('System Memory: $_infra_node', 'bytes', queries.nodeMemory.query('$_infra_node'), { x: 12, y: 0, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Disk throughput: $_infra_node', 'Bps', queries.diskThroughput.query('$_infra_node'), { x: 0, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Disk IOPS: $_infra_node', 'iops', queries.diskIOPS.query('$_infra_node'), { x: 12, y: 8, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network Utilization: $_infra_node', 'bps', queries.networkUtilization.query('$_infra_node'), { x: 0, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network Packets: $_infra_node', 'pps', queries.networkPackets.query('$_infra_node'), { x: 12, y: 16, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Network packets drop: $_infra_node', 'pps', queries.networkDrop.query('$_infra_node'), { x: 0, y: 24, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Conntrack stats: $_infra_node', '', queries.conntrackStats.query('$_infra_node'), { x: 12, y: 24, w: 12, h: 8 }),
+    panels.timeSeries.genericLegend('Top 10 container CPU: $_infra_node', 'percent', queries.top10ContainerCPU.query('$_infra_node'), { x: 0, y: 32, w: 12, h: 8 }),  // y = 24 + 8, below the network-drop and conntrack panels
+    panels.timeSeries.genericLegend('Top 10 container RSS: $_infra_node', 'bytes', queries.top10ContainerRSS.query('$_infra_node'), { x: 12, y: 32, w: 12, h: 8 }),
+  ]),
+])
diff --git
a/templates/jsonnetfile.lock.json b/templates/jsonnetfile.lock.json
index d4b9d5c..e7be9c9 100644
--- a/templates/jsonnetfile.lock.json
+++ b/templates/jsonnetfile.lock.json
@@ -43,4 +43,4 @@
     }
   ],
   "legacyImports": false
-}
+}
\ No newline at end of file