From 597efb07d10754bcbb4f3125e8413dd6bcbdfc3e Mon Sep 17 00:00:00 2001 From: Manda-supraja26 Date: Sat, 23 Mar 2024 10:35:54 +0530 Subject: [PATCH 1/4] Adding Hypershift Performance Dashboard to Grafonnet --- .../panels.libsonnet | 229 ++++ .../queries.libsonnet | 1061 +++++++++++++++++ .../variables.libsonnet | 40 + .../General/hypershift-performance-v2.jsonnet | 154 +++ 4 files changed, 1484 insertions(+) create mode 100644 assets/hypershift-perf-dashboard/panels.libsonnet create mode 100644 assets/hypershift-perf-dashboard/queries.libsonnet create mode 100644 assets/hypershift-perf-dashboard/variables.libsonnet create mode 100644 templates/General/hypershift-performance-v2.jsonnet diff --git a/assets/hypershift-perf-dashboard/panels.libsonnet b/assets/hypershift-perf-dashboard/panels.libsonnet new file mode 100644 index 0000000..4750f57 --- /dev/null +++ b/assets/hypershift-perf-dashboard/panels.libsonnet @@ -0,0 +1,229 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +{ + stat: { + local stat = g.panel.stat, + local options = stat.options, + + base(title, unit, targets, datasource, gridPos): + stat.new(title) + + stat.datasource.withType('prometheus') + + stat.datasource.withUid(datasource) + + stat.standardOptions.withUnit(unit) + + stat.queryOptions.withTargets(targets) + + stat.gridPos.withX(gridPos.x) + + stat.gridPos.withY(gridPos.y) + + stat.gridPos.withH(gridPos.h) + + stat.gridPos.withW(gridPos.w) + + options.withJustifyMode("auto") + + options.withGraphMode("none") + + options.text.withTitleSize(12), + + m_infrastructure(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('name') + + options.withColorMode('value') + + stat.standardOptions.thresholds.withSteps([{ + "color": "green", + "value": null + }]) + + options.withJustifyMode('auto'), + + m_region(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, 
datasource, gridPos) + + options.withTextMode('name') + + options.withColorMode('value') + + stat.standardOptions.thresholds.withSteps([{ + "color": "green", + "value": null + }]) + + options.withJustifyMode('auto'), + + m_ocp_version(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('name') + + options.withColorMode('value') + + stat.standardOptions.thresholds.withSteps([{ + "color": "green", + "value": null + }]) + + options.withJustifyMode('auto'), + + num_hosted_cluster(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('auto') + + options.withColorMode('value') + + options.withJustifyMode('auto') + + stat.standardOptions.thresholds.withSteps([{ + "color": "green", + "value": null + }]) + + options.reduceOptions.withCalcs([ + 'max', + ]), + + current_namespace_count(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('auto') + + options.withColorMode('value') + + options.withJustifyMode('auto') + + stat.standardOptions.thresholds.withSteps([]) + + stat.standardOptions.thresholds.withMode('absolute') + + options.reduceOptions.withCalcs([ + 'last', + ]), + + current_node_count(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('auto') + + options.withColorMode('value') + + options.withJustifyMode('auto') + + options.withGraphMode('area') + + stat.standardOptions.thresholds.withSteps([]) + + stat.standardOptions.thresholds.withMode('absolute') + + options.reduceOptions.withCalcs([ + 'last', + ]), + + current_pod_count(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('auto') + + options.withColorMode('value') + + options.withJustifyMode('auto') + + options.withGraphMode('area') + + 
stat.standardOptions.thresholds.withSteps([]) + + stat.standardOptions.thresholds.withMode('absolute') + + options.reduceOptions.withCalcs([ + 'last', + ]), + + etcd_has_leader(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withOrientation('horizontal') + + options.withColorMode('none') + + stat.standardOptions.withMappings([ + { + "type": "value", + "options": { + "0": { + "text": "NO" + }, + "1": { + "text": "YES" + } + } + } + ]), + + mgmt_num_failed_proposals(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withOrientation('horizontal') + + options.withColorMode('none') + + options.withTextMode('auto') + + options.withGraphMode('none') + + options.withJustifyMode('auto'), + + hostedControlPlaneStats(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.withTextMode('name') + + options.withColorMode('value') + + options.withJustifyMode('auto') + + options.withGraphMode('none') + + stat.standardOptions.thresholds.withSteps([{ + "color": "green", + "value": null + }]), + }, + + timeSeries: { + local timeSeries = g.panel.timeSeries, + local custom = timeSeries.fieldConfig.defaults.custom, + local options = timeSeries.options, + + base(title, unit, targets, datasource, gridPos): + timeSeries.new(title) + + timeSeries.queryOptions.withTargets(targets) + + timeSeries.datasource.withType('prometheus') + + timeSeries.datasource.withUid(datasource) + + timeSeries.standardOptions.withUnit(unit) + + timeSeries.gridPos.withX(gridPos.x) + + timeSeries.gridPos.withY(gridPos.y) + + timeSeries.gridPos.withH(gridPos.h) + + timeSeries.gridPos.withW(gridPos.w) + + custom.withSpanNulls(false) + + custom.withFillOpacity(25) + + options.legend.withShowLegend(true), + + managementClustersStatsTimeseriesSettings(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, 
gridPos) + + options.legend.withPlacement('bottom') + + options.legend.withDisplayMode('table') + + options.tooltip.withMode('multi') + + options.tooltip.withSort('desc') + + custom.withDrawStyle('line') + + custom.withLineInterpolation('linear') + + options.legend.withDisplayMode('table') + + options.legend.withCalcs([ + 'mean', + 'max' + ]), + + mgmt(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.legend.withDisplayMode('table') + + options.legend.withShowLegend(true) + + options.legend.withPlacement('bottom') + + options.legend.withCalcs([ + 'mean', + 'max' + ]) + + options.tooltip.withMode('multi') + + options.tooltip.withSort('desc') + + options.legend.withSortBy('max'), + + DBPanelsSettings(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.legend.withDisplayMode('list') + + options.legend.withShowLegend(true) + + options.legend.withPlacement('bottom') + + options.tooltip.withMode('multi') + + options.tooltip.withSort('none'), + + genericGraphLegendPanel(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.legend.withDisplayMode('table') + + options.legend.withShowLegend(true) + + options.legend.withPlacement('bottom') + + options.legend.withCalcs([ + 'mean', + 'max' + ]) + + options.tooltip.withMode('multi') + + options.tooltip.withSort('desc') + + options.legend.withSortBy('max'), + + genericGraphLegendPanelRightSide(title, unit, targets, datasource, gridPos): + self.base(title, unit, targets, datasource, gridPos) + + options.legend.withDisplayMode('table') + + options.legend.withShowLegend(true) + + options.legend.withPlacement('right') + + options.legend.withCalcs([ + 'lastNotNull', + ]) + + options.tooltip.withMode('multi') + + options.tooltip.withSort('desc') + + options.legend.withSortBy('max'), + + genericGraphLegendPanelList(title, unit, targets, datasource, gridPos): + 
self.base(title, unit, targets, datasource, gridPos) + + options.legend.withDisplayMode('list') + + options.legend.withShowLegend(true) + + options.legend.withPlacement('bottom') + + options.legend.withCalcs([ + 'lastNotNull', + ]) + + options.tooltip.withMode('multi') + + options.tooltip.withSort('desc') + + options.legend.withSortBy('max'), + }, +} \ No newline at end of file diff --git a/assets/hypershift-perf-dashboard/queries.libsonnet b/assets/hypershift-perf-dashboard/queries.libsonnet new file mode 100644 index 0000000..bb1fe65 --- /dev/null +++ b/assets/hypershift-perf-dashboard/queries.libsonnet @@ -0,0 +1,1061 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local variables = import './variables.libsonnet'; +local prometheus = g.query.prometheus; + +{ + m_infrastructure: { + query(): + prometheus.withExpr('cluster_infrastructure_provider{namespace="openshift-kube-apiserver-operator"}') + + prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{type}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + + }, + + m_region: { + query(): + prometheus.withExpr('cluster_infrastructure_provider{namespace="openshift-kube-apiserver-operator"}') + + prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{region}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + m_ocp_version: { + query(): + prometheus.withExpr('cluster_version{type="completed",version!="",namespace="openshift-cluster-version"}') + + prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{version}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + num_hosted_cluster: { + query(): + prometheus.withExpr('count(kube_namespace_labels{namespace=~"^ocm-.*"})') + + 
prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + current_namespace_count: { + query(): + prometheus.withExpr('sum(kube_namespace_status_phase) by (phase)') + + prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ phase }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + current_node_count: { + query(): + [prometheus.withExpr('sum(kube_node_info{})') + + prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Number of nodes') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('sum(kube_node_status_condition{status="true"}) by (condition) > 0') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Node: {{ condition }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('sum(kube_node_role{}) by (role)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Role: {{ role }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + ] + }, + + current_pod_count: { + query(): + prometheus.withExpr('sum(kube_pod_status_phase{}) by (phase) > 0') + + prometheus.withInstant(true) + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ phase}} Pods') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContCPUHosted: { + query(): + prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace=~"^ocm-.*",container!="POD",name!=""}[2m])*100)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ namespace }} - {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') 
+ }, + + top10ContMemHosted: { + query(): + prometheus.withExpr('topk(10, container_memory_rss{namespace=~"^ocm-.*",container!="POD",name!=""})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ namespace }} - {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContCPUManagement: { + query(): + prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace!="",container!="POD",name!=""}[2m])*100)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ namespace }} - {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContMemManagement: { + query(): + prometheus.withExpr('topk(10, container_memory_rss{namespace!="",container!="POD",name!=""})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ namespace }} - {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContCPUOBOManagement: { + query(): + prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace="openshift-observability-operator",container!="POD",name!=""}[2m])*100)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ pod }}/{{ container }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContMemOBOManagement: { + query(): + prometheus.withExpr('topk(10, container_memory_rss{namespace="openshift-observability-operator",container!="POD",name!=""})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ pod }}/{{ container }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContCPUHypershiftManagement: { + query(): + prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace="hypershift",container!="POD",name!=""}[2m])*100)') + + 
prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ pod }}/{{ container }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + top10ContMemHypershiftManagement: { + query(): + prometheus.withExpr('topk(10, container_memory_rss{namespace="hypershift",container!="POD",name!=""})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ pod }}/{{ container }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + dynaactivegateMem: { + query(): + prometheus.withExpr('sum(container_memory_rss{namespace=~"dynatrace",pod=~".*-activegate-.*",container!=""}) by (node, namespace, pod)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ node }}: {{ namespace }} : {{ pod }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + dynaactivegateCPU: { + query(): + prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{namespace=~"dynatrace", pod=~".*-activegate-.*", container!~"POD|"}[2m])*100) by (node, namespace, pod)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ node }}: {{ namespace }} : {{ pod }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + opentelemetryCPU: { + query(): + prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{namespace=~"dynatrace", pod=~"opentelemetry-.*", container!~"POD|"}[2m])*100) by (node, namespace, pod)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ node }}: {{ namespace }} : {{ pod }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + opentelemetryMem: { + query(): + prometheus.withExpr('sum(container_memory_rss{namespace=~"dynatrace",pod=~"opentelemetry-.*",container!=""}) by (node, namespace, pod)') + + prometheus.withFormat('time_series') + + 
prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ node }}: {{ namespace }} : {{ pod }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + }, + + nodeCount: { + query(): + [ + prometheus.withExpr('sum(kube_node_info{})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Number of nodes') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + prometheus.withExpr('sum(kube_node_status_condition{status="true"}) by (node,condition) > 0') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{node}}: {{ condition }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A') + ] + }, + + current_machine_set_replica_count: { + query(): + [ + prometheus.withExpr('mapi_machine_set_status_replicas{name=~".*worker.*"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Replicas: {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('mapi_machine_set_status_replicas_available{name=~".*worker.*"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Available: {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('mapi_machine_set_status_replicas_ready{name=~".*worker.*"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('Ready: {{ name }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + nsCount: { + query(): + prometheus.withExpr('sum(kube_namespace_status_phase) by (phase) > 0') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{ phase }} namespaces') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + podCount: { + query(): + prometheus.withExpr('sum(kube_pod_status_phase{}) by (phase)') + + 
prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{phase}} pods') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + clusterOperatorsInformation: { + query(): + prometheus.withExpr('cluster_operator_conditions{name!="",reason!=""}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{name}} - {{reason}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + clusterOperatorsDegraded: { + query(): + prometheus.withExpr('cluster_operator_conditions{condition="Degraded",name!="",reason!=""}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{name}} - {{reason}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + FailedPods: { /* two targets: the per-pod Failed-phase series, and their total. The total is sum() by (phase): it adds the gauge values instead of counting series, and keeps the phase label that the '{{phase}} pods' legend needs. NOTE(review): assumes kube_pod_status_phase is a 0/1 gauge per pod and phase - confirm against the deployed kube-state-metrics. */ + query(): + [ + prometheus.withExpr('kube_pod_status_phase{phase="Failed"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}}/{{ pod }}:{{ phase }}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + prometheus.withExpr('sum(kube_pod_status_phase{phase="Failed"}) by (phase)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{phase}} pods') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + alerts: { + query(): + prometheus.withExpr('topk(10,sum(ALERTS{severity!="none"}) by (alertname, severity))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{severity}}: {{alertname}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_disk_wal_sync_duration: { + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace=~"openshift-etcd"}[2m])) by (namespace, pod, le))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + 
prometheus.withLegendFormat('{{namespace}} - {{pod}} WAL fsync') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_disk_backend_sync_duration: { + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace=~"openshift-etcd"}[2m])) by (namespace, pod, le))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - {{pod}} DB fsync') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + mgmt_percent_db_used: { + query(): + prometheus.withExpr('(etcd_mvcc_db_total_size_in_bytes{namespace=~"openshift-etcd"} / etcd_server_quota_backend_bytes{namespace=~"openshift-etcd"})*100') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - {{pod}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_db_capacity_left: { + query(): + prometheus.withExpr('etcd_server_quota_backend_bytes{namespace=~"openshift-etcd"} - etcd_mvcc_db_total_size_in_bytes{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - {{pod}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_db_size_limit: { + query(): + prometheus.withExpr('etcd_server_quota_backend_bytes{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} Quota Bytes') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_db_size: { + query(): + [ + prometheus.withExpr('etcd_mvcc_db_total_size_in_bytes{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - {{pod}} DB physical size') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + 
prometheus.withExpr('etcd_mvcc_db_total_size_in_use_in_bytes{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - {{pod}} DB logical size') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_grpc_traffic: { + query(): + [ + prometheus.withExpr('rate(etcd_network_client_grpc_received_bytes_total{namespace=~"openshift-etcd"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('rx {{namespace}} - {{pod}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('rate(etcd_network_client_grpc_sent_bytes_total{namespace=~"openshift-etcd"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('tx {{namespace}} - {{pod}}') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_active_streams: { + query(): + [ + prometheus.withExpr('sum(grpc_server_started_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - Watch Streams') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('sum(grpc_server_started_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - Lease Streams') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_snapshot_duration: { + query(): + 
prometheus.withExpr('sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace=~"openshift-etcd"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('the total latency distributions of save called by snapshot') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_raft_proposals: { + query(): + [ + prometheus.withExpr('sum(rate(etcd_server_proposals_failed_total{namespace=~"openshift-etcd"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - Proposal Failure Rate') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('sum(etcd_server_proposals_pending{namespace=~"openshift-etcd"})') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - Proposal Pending Total') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('sum(rate(etcd_server_proposals_committed_total{namespace=~"openshift-etcd"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - Proposal Commit Rate') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('sum(rate(etcd_server_proposals_applied_total{namespace=~"openshift-etcd"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withLegendFormat('{{namespace}} - Proposal Apply Rate') + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_num_leader_changes: { + query(): + prometheus.withExpr('sum(rate(etcd_server_leader_changes_seen_total{namespace=~"openshift-etcd"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_etcd_has_leader: { + query(): + 
prometheus.withExpr('max(etcd_server_has_leader{namespace=~"openshift-etcd"})') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_num_failed_proposals: { /* instant query: max of the failed-proposals counter over all series in namespace openshift-etcd. Uses etcd_server_proposals_failed_total, the same counter mgmt_raft_proposals rates, rather than the committed counter. */ + query(): + prometheus.withExpr('max(etcd_server_proposals_failed_total{namespace=~"openshift-etcd"})') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_leader_elections_per_day: { + query(): + prometheus.withExpr('changes(etcd_server_leader_changes_seen_total{namespace=~"openshift-etcd"}[1d])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{instance}} Total Leader Elections Per Day') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_keys: { + query(): + prometheus.withExpr('etcd_debugging_mvcc_keys_total{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} Num keys') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + mgmt_slow_operations: { + query(): + [ + prometheus.withExpr('delta(etcd_server_slow_apply_total{namespace=~"openshift-etcd"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} slow applies') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('delta(etcd_server_slow_read_indexes_total{namespace=~"openshift-etcd"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} slow read indexes') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_key_operations: { + query(): + [ + 
prometheus.withExpr('rate(etcd_mvcc_put_total{namespace=~"openshift-etcd"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} puts/s') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('rate(etcd_mvcc_delete_total{namespace=~"openshift-etcd"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} deletes/s') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_heartbeat_failures: { + query(): + [ + prometheus.withExpr('etcd_server_heartbeat_send_failures_total{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} heartbeat failures') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + + prometheus.withExpr('etcd_server_health_failures{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} health failures') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + mgmt_compacted_keys: { + query(): + [ + prometheus.withExpr('etcd_debugging_mvcc_db_compaction_keys_total{namespace=~"openshift-etcd"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} keys compacted') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + ] + }, + + nodeCPU: { + query(): + prometheus.withExpr('sum by (instance, mode)(irate(node_cpu_seconds_total{job=~".*"}[2m])) * 100 and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{instance}} - {{mode}}') + + prometheus.withIntervalFactor(2) + + 
prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + nodeMemory: { + query(): + prometheus.withExpr('node_memory_Active_bytes and on (instance) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "instance", "$1", "node", "(.+)")') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{instance}} - Active') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + suricataCPU: { + query(): + prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{namespace=~"openshift-suricata",container!="POD",name!=""}[2m])*100) by (node) and on (node) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "node", "$1", "node", "(.+)")') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{node}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + suricataMemory: { + query(): + prometheus.withExpr('sum(container_memory_rss{namespace=~"openshift-suricata",container!="POD",name!=""}) by (node) and on (node) label_replace(cluster:nodes_roles{label_hypershift_openshift_io_cluster=~"$namespace"}, "node", "$1", "node", "(.+)")') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{node}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + dynaoneagentCPU: { + query(): + prometheus.withExpr('sum(irate(container_cpu_usage_seconds_total{namespace=~"dynatrace", pod=~".*-oneagent-.*", container!~"POD|"}[2m])*100) by (node, namespace, pod)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{ node }}: {{ namespace }} : {{ pod }}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + dynaoneagentMem: { + query(): + prometheus.withExpr('sum(container_memory_rss{namespace=~"dynatrace",pod=~".*-oneagent-.*",container!=""}) by (node, namespace, 
pod)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{ node }}: {{ namespace }} : {{ pod }}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + infrastructure: { + query(): + prometheus.withExpr('cluster_infrastructure_provider{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withLegendFormat('{{type}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + region: { + query(): + prometheus.withExpr('cluster_infrastructure_provider{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withLegendFormat('{{region}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + ocp_version: { + query(): + prometheus.withExpr('cluster_version{type="completed",version!="",namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withLegendFormat('{{version}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + hostedControlPlaneCPU: { /* top 10 container CPU series (irate over 2m, x100) in namespaces matching $namespace; legend is pod/container, the same as hostedControlPlaneMemory. The previous expression was a copy of the ocp_version query and did not plot CPU. */ + query(): + prometheus.withExpr('topk(10,irate(container_cpu_usage_seconds_total{namespace=~"$namespace",container!="POD",name!=""}[2m])*100)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{pod}}/{{container}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + hostedControlPlaneMemory: { + query(): + prometheus.withExpr('topk(10, container_memory_rss{namespace=~"$namespace",container!="POD",name!=""})') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{pod}}/{{container}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('PF55DCC5EC58ABF5A'), + }, + + request_duration_99th_quantile: { + query(): + prometheus.withExpr('histogram_quantile(0.99, 
sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[2m])) by(verb,le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{verb}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + request_rate_by_instance: { /* API request rate over 2m per instance, filtered by the resource, code and verb dashboard variables. */ + query(): + prometheus.withExpr('sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",code=~"$code",verb=~"$verb"}[2m])) by(instance)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{instance}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + request_duration_99th_quantile_by_resource: { /* 0.99 quantile of request latency by resource, namespace and verb; WATCH, WATCHLIST and PROXY verbs and the log subresource are excluded. */ + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",subresource!="log",verb!~"WATCH|WATCHLIST|PROXY"}[2m])) by(resource, namespace, verb, le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{verb}}:{{resource}}/{{namespace}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + request_rate_by_resource: { /* API request rate over 2m per resource. */ + query(): + prometheus.withExpr('sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",code=~"$code",verb=~"$verb"}[2m])) by(resource)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{resource}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + request_duration_read_write: { /* Returns a list of two targets: 0.99 latency quantile for read verbs (LIST, GET) and for write verbs (POST, PUT, PATCH, UPDATE, DELETE). */ + query(): + [ + prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",verb=~"LIST|GET"}[2m])) by(le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('read') + + prometheus.withIntervalFactor(2) + +
prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{namespace=~"$namespace",resource=~"$resource",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[2m])) by(le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('write') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + request_rate_read_write: { /* Returns a list of two targets: total request rate for read verbs and for write verbs. */ + query(): + [ + prometheus.withExpr('sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",verb=~"LIST|GET"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('read') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",verb=~"POST|PUT|PATCH|UPDATE|DELETE"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('write') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + requests_dropped_rate: { /* Dropped-request rate over 2m by requestKind; no legend format is set for this target. */ + query(): + prometheus.withExpr('sum(rate(apiserver_dropped_requests_total{namespace=~"$namespace"}[2m])) by (requestKind)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + requests_terminated_rate: { /* Terminated-request rate over 2m by component; no legend format is set for this target. */ + query(): + prometheus.withExpr('sum(rate(apiserver_request_terminations_total{namespace=~"$namespace",resource=~"$resource",code=~"$code"}[2m])) by(component)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + requests_status_rate: { /* Request rate over 2m by response code. */ + query(): + prometheus.withExpr('sum(rate(apiserver_request_total{namespace=~"$namespace",resource=~"$resource",verb=~"$verb",code=~"$code"}[2m])) by(code)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{code}}') + +
prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + long_running_requests: { /* Sum of the long-running request gauge per instance. */ + query(): + prometheus.withExpr('sum(apiserver_longrunning_gauge{namespace=~"$namespace",resource=~"$resource",verb=~"$verb"}) by(instance)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{instance}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + request_in_flight: { /* In-flight requests by instance and requestKind. */ + query(): + prometheus.withExpr('sum(apiserver_current_inflight_requests{namespace=~"$namespace"}) by (instance,requestKind)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{requestKind}}-{{instance}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_requests_rejected: { /* Flow-control rejected-request rate over 2m by reason; no legend format is set for this target. */ + query(): + prometheus.withExpr('sum(rate(apiserver_flowcontrol_rejected_requests_total{namespace=~"$namespace"}[2m])) by (reason)') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + response_size_99th_quartile: { /* Despite "quartile" in the key, the expression computes the 0.99 quantile; the key is referenced by this name from the dashboard template. */ + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_response_sizes_bucket{namespace=~"$namespace",resource=~"$resource",verb=~"$verb"}[2m])) by(instance,le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{instance}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_request_queue_length: { /* 0.99 quantile of the flow-control queue length after enqueue, by flowSchema and priorityLevel. */ + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_queue_length_after_enqueue_bucket{namespace=~"$namespace"}[2m])) by(flowSchema, priorityLevel, le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{flowSchema}}:{{priorityLevel}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_request_wait_duration_99th_quartile: { + query(): +
prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket{namespace=~"$namespace"}[2m])) by(flowSchema, priorityLevel, le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{flowSchema}}:{{priorityLevel}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_request_execution_duration: { /* 0.99 quantile of flow-control request execution time, by flowSchema and priorityLevel. */ + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket{namespace=~"$namespace"}[2m])) by(flowSchema, priorityLevel, le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{flowSchema}}:{{priorityLevel}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_request_dispatch_rate: { /* Dispatched-request rate over 2m, by flowSchema and priorityLevel. */ + query(): + prometheus.withExpr('sum(rate(apiserver_flowcontrol_dispatched_requests_total{namespace=~"$namespace"}[2m])) by(flowSchema,priorityLevel)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{flowSchema}}:{{priorityLevel}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_concurrency_limit: { /* Sum of the request concurrency limit gauge per priorityLevel. */ + query(): + prometheus.withExpr('sum(apiserver_flowcontrol_request_concurrency_limit{namespace=~"$namespace"}) by (priorityLevel)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{priorityLevel}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + pf_pending_in_queue: { /* Requests currently queued, by flowSchema and priorityLevel. */ + query(): + prometheus.withExpr('sum(apiserver_flowcontrol_current_inqueue_requests{namespace=~"$namespace"}) by (flowSchema,priorityLevel)') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{flowSchema}}:{{priorityLevel}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + disk_wal_sync_duration: { + query(): +
prometheus.withExpr('histogram_quantile(0.99, sum(irate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace=~"$namespace"}[2m])) by (namespace, pod, le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{pod}} WAL fsync') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + disk_backend_sync_duration: { /* 0.99 quantile of the etcd backend commit duration histogram, by namespace and pod. */ + query(): + prometheus.withExpr('histogram_quantile(0.99, sum(irate(etcd_disk_backend_commit_duration_seconds_bucket{namespace=~"$namespace"}[2m])) by (namespace, pod, le))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{pod}} DB fsync') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + percent_db_used: { /* DB total size divided by the backend quota, times 100. */ + query(): + prometheus.withExpr('(etcd_mvcc_db_total_size_in_bytes{namespace=~"$namespace"} / etcd_server_quota_backend_bytes{namespace=~"$namespace"})*100') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{pod}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + db_capacity_left: { /* Backend quota minus DB total size. */ + query(): + prometheus.withExpr('etcd_server_quota_backend_bytes{namespace=~"$namespace"} - etcd_mvcc_db_total_size_in_bytes{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{pod}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + + db_size_limit: { /* The raw backend quota series. */ + query(): + prometheus.withExpr('etcd_server_quota_backend_bytes{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} Quota Bytes') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + db_size: { /* Returns a list of two targets, labelled "DB physical size" (total size) and "DB logical size" (size in use). */ + query(): + [ + prometheus.withExpr('etcd_mvcc_db_total_size_in_bytes{namespace=~"$namespace"}') + + prometheus.withFormat('time_series')
+ + prometheus.withLegendFormat('{{namespace}} - {{pod}} DB physical size') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('etcd_mvcc_db_total_size_in_use_in_bytes{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{pod}} DB logical size') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + grpc_traffic: { /* Returns a list of two targets: client gRPC bytes received (rx) and sent (tx), each as a rate over 2m. */ + query(): + [ + prometheus.withExpr('rate(etcd_network_client_grpc_received_bytes_total{namespace=~"$namespace"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('rx {{namespace}} - {{pod}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('rate(etcd_network_client_grpc_sent_bytes_total{namespace=~"$namespace"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('tx {{namespace}} - {{pod}}') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + active_streams: { /* Returns a list of two targets: started minus handled bidi streams for the Watch service and for the Lease service. NOTE(review): both sides are aggregated with sum() and no by-clause, so the namespace label used in the legend is presumably empty - verify. */ + query(): + [ + prometheus.withExpr('sum(grpc_server_started_total{namespace=~"$namespace",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"$namespace",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - Watch Streams') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('sum(grpc_server_started_total{namespace=~"$namespace",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace=~"$namespace",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - Lease Streams') + + prometheus.withIntervalFactor(2) + +
prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + snapshot_duration: { /* Rate over 2m of the snapshot save duration sum, aggregated across the selected namespace. */ + query(): + prometheus.withExpr('sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace=~"$namespace"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('the total latency distributions of save called by snapshot') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3') + }, + + raft_proposals: { /* Returns a list of four targets: failed, pending, committed and applied proposals, all scoped to the selected $namespace so the repeated hosted-cluster row shows its own etcd. */ + query(): + [ prometheus.withExpr('sum(rate(etcd_server_proposals_failed_total{namespace=~"$namespace"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - Proposal Failure Rate') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('sum(etcd_server_proposals_pending{namespace=~"$namespace"})') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - Proposal Pending Total') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('sum(rate(etcd_server_proposals_committed_total{namespace=~"$namespace"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - Proposal Commit Rate') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('sum(rate(etcd_server_proposals_applied_total{namespace=~"$namespace"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - Proposal Apply Rate') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + ] + }, + + num_leader_changes: { /* Rate over 2m of leader changes seen; no legend format is set for this target. */ + query(): + prometheus.withExpr('sum(rate(etcd_server_leader_changes_seen_total{namespace=~"$namespace"}[2m]))') + + prometheus.withFormat('time_series') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + etcd_has_leader: { +
query(): + prometheus.withExpr('max(etcd_server_has_leader{namespace=~"$namespace"})') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + num_failed_proposals: { /* Instant max of the failed-proposals counter; feeds the "Total number of failed proposals seen" stat panel. */ + query(): + prometheus.withExpr('max(etcd_server_proposals_failed_total{namespace=~"$namespace"})') + + prometheus.withFormat('time_series') + + prometheus.withInstant(true) + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + leader_elections_per_day: { /* Number of changes of the leader-changes counter over a 1d window. */ + query(): + prometheus.withExpr('changes(etcd_server_leader_changes_seen_total{namespace=~"$namespace"}[1d])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{instance}} Total Leader Elections Per Day') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + keys: { /* Raw etcd_debugging_mvcc_keys_total series for the selected namespace. */ + query(): + prometheus.withExpr('etcd_debugging_mvcc_keys_total{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} Num keys') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + }, + + slow_operations: { /* Returns a list of two targets: 2m delta of slow applies and of slow read indexes. */ + query(): + [ + prometheus.withExpr('delta(etcd_server_slow_apply_total{namespace=~"$namespace"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} slow applies') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('delta(etcd_server_slow_read_indexes_total{namespace=~"$namespace"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} slow read indexes') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + key_operations: { + query(): + [ +
prometheus.withExpr('rate(etcd_mvcc_put_total{namespace=~"$namespace"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} puts/s') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('rate(etcd_mvcc_delete_total{namespace=~"$namespace"}[2m])') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} deletes/s') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + heartbeat_failures: { /* Returns a list of two targets: the raw heartbeat send failure counter and the raw health failure counter; both selectors must be closed with "} to be valid PromQL. */ + query(): + [ + prometheus.withExpr('etcd_server_heartbeat_send_failures_total{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} heartbeat failures') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + + prometheus.withExpr('etcd_server_health_failures{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} health failures') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + ] + }, + + compacted_keys: { /* Raw etcd_debugging_mvcc_db_compaction_keys_total series for the selected namespace. */ + query(): + prometheus.withExpr('etcd_debugging_mvcc_db_compaction_keys_total{namespace=~"$namespace"}') + + prometheus.withFormat('time_series') + + prometheus.withLegendFormat('{{namespace}} - {{ pod }} keys compacted') + + prometheus.withIntervalFactor(2) + + prometheus.withDatasource('P1BA917A37525EDF3'), + } + +} \ No newline at end of file diff --git a/assets/hypershift-perf-dashboard/variables.libsonnet b/assets/hypershift-perf-dashboard/variables.libsonnet new file mode 100644 index 0000000..6fdab32 --- /dev/null +++ b/assets/hypershift-perf-dashboard/variables.libsonnet @@ -0,0 +1,40 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local var = g.dashboard.variable; + +{ + Namespace: +
var.query.new('namespace','label_values(kube_pod_info, namespace)') /* Query variable over kube_pod_info namespaces; the regex below is /^ocm/. */ + + var.query.withDatasource('prometheus','PF55DCC5EC58ABF5A') + + var.query.withRegex("/^ocm/") + + var.query.selectionOptions.withMulti(true) + + var.query.selectionOptions.withIncludeAll(true) + + var.query.generalOptions.withLabel('Namespace') + + var.query.withRefresh(2), + + Resource: + var.query.new('resource','label_values(apiserver_request_duration_seconds_bucket, resource)') /* Multi-select variable with an empty regex. */ + + var.query.withDatasource('prometheus','PF55DCC5EC58ABF5A') + + var.query.withRegex("") + + var.query.selectionOptions.withMulti(true) + + var.query.selectionOptions.withIncludeAll(true) + + var.query.generalOptions.withLabel('resource') + + var.query.withRefresh(2), + + Code: + var.query.new('code','label_values(code)') /* Values of the code label across all metrics in the datasource. */ + + var.query.withDatasource('prometheus','PF55DCC5EC58ABF5A') + + var.query.withRegex("") + + var.query.selectionOptions.withMulti(true) + + var.query.selectionOptions.withIncludeAll(true) + + var.query.generalOptions.withLabel('code') + + var.query.withRefresh(2), + + Verb: + var.query.new('verb','label_values(verb)') /* Values of the verb label across all metrics in the datasource. */ + + var.query.withDatasource('prometheus','PF55DCC5EC58ABF5A') + + var.query.withRegex("") + + var.query.selectionOptions.withMulti(true) + + var.query.selectionOptions.withIncludeAll(true) + + var.query.generalOptions.withLabel('verb') + + var.query.withRefresh(2), +} diff --git a/templates/General/hypershift-performance-v2.jsonnet b/templates/General/hypershift-performance-v2.jsonnet new file mode 100644 index 0000000..505d1bb --- /dev/null +++ b/templates/General/hypershift-performance-v2.jsonnet @@ -0,0 +1,154 @@ +local panels = import '../../assets/hypershift-perf-dashboard/panels.libsonnet'; +local queries = import '../../assets/hypershift-perf-dashboard/queries.libsonnet'; +local variables = import '../../assets/hypershift-perf-dashboard/variables.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +local
cluster_prometheus = 'PF55DCC5EC58ABF5A'; +local OBO = 'P1BA917A37525EDF3'; /* Datasource UIDs. cluster_prometheus is not referenced by name in the panels of this template, which pass the same UID as a string literal; OBO is passed by name to most hosted-cluster panels further down. */ + +g.dashboard.new('Hypershift Performance Dashboard') ++ g.dashboard.withDescription(||| + Dashboard for Api-performance-overview +|||) ++ g.dashboard.withTags('') /* NOTE(review): an empty string is passed rather than a list of tags - confirm this is intended. */ ++ g.dashboard.time.withFrom('now-6h') ++ g.dashboard.time.withTo('now') ++ g.dashboard.withTimezone('utc') ++ g.dashboard.timepicker.withRefreshIntervals(['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d']) ++ g.dashboard.timepicker.withTimeOptions(['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d']) ++ g.dashboard.withRefresh('') ++ g.dashboard.withEditable(true) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withVariables([ + variables.Namespace, + variables.Resource, + variables.Code, + variables.Verb, +]) ++ g.dashboard.withPanels([ /* Each entry is a collapsed row that nests its own panels. */ + g.panel.row.new('Management cluster stats') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + panels.stat.m_infrastructure('Management Cloud Infrastructure', '', queries.m_infrastructure.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 0, w: 6, h: 4 }), + panels.stat.m_region('Management Cloud Region', '', queries.m_region.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 0, w: 6, h: 4 }), + panels.stat.m_ocp_version('Management OCP Version', '', queries.m_ocp_version.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 0, w: 6, h: 4 }), + panels.stat.num_hosted_cluster('Number of HostedCluster', '', queries.num_hosted_cluster.query(), 'PF55DCC5EC58ABF5A', { x: 18, y: 0, w: 6, h: 4 }), + panels.stat.current_namespace_count('Current namespace Count', '', queries.current_namespace_count.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 5, w: 8, h: 4 }), + panels.stat.current_node_count('Current Node Count', '', queries.current_node_count.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 5, w: 8, h: 4 }), + panels.stat.current_pod_count('Current Pod Count', '', queries.current_pod_count.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 5,
w: 8, h: 4 }), /* The time-series panels of this row all use the managementClustersStatsTimeseriesSettings preset with the management-cluster datasource UID. */ + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Hosted Clusters container CPU', 'percent', queries.top10ContCPUHosted.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 8, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Hosted Clusters container RSS', 'bytes', queries.top10ContMemHosted.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 8, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster container CPU', 'percent', queries.top10ContCPUManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 20, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster container RSS', 'bytes', queries.top10ContMemManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 20, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster OBO NS Pods CPU', 'percent', queries.top10ContCPUOBOManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster OBO NS Pods RSS', 'bytes', queries.top10ContMemOBOManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 28, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster Hypershift NS Pods CPU', 'percent', queries.top10ContCPUHypershiftManagement.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 36, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Top 10 Management Cluster Hypershift NS Pods RSS', 'bytes', queries.top10ContMemHypershiftManagement.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 36, w: 12, h: 8 }), /* NOTE(review): the Active Gate panels that follow use y: 18, which is smaller than the y: 20 to y: 36 positions of the panels listed before them - confirm the intended order. */ + panels.timeSeries.managementClustersStatsTimeseriesSettings('Active Gate Memory Usage', 'bytes', queries.dynaactivegateMem.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Active Gate CPU Usage', 'percent',
queries.dynaactivegateCPU.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), /* NOTE(review): the two Opentelemetry panels below reuse the exact grid positions (x: 0 and x: 12 at y: 18) of the two Active Gate panels - verify the layout. */ + panels.timeSeries.managementClustersStatsTimeseriesSettings('Opentelemetry CPU Usage', 'percent', queries.opentelemetryCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Opentelemetry Memory Usage', 'bytes', queries.opentelemetryMem.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Number of nodes', 'none', queries.nodeCount.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Machine Set Replicas', 'none', queries.current_machine_set_replica_count.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Namespace count', 'none', queries.nsCount.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Pod count', 'none', queries.podCount.query(), 'PF55DCC5EC58ABF5A', { x: 18, y: 44, w: 6, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Cluster operators information', 'none', queries.clusterOperatorsInformation.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 52, w: 8, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Cluster operators degraded', 'none', queries.clusterOperatorsDegraded.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 52, w: 8, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Failed pods', 'none', queries.FailedPods.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 52, w: 8, h: 8 }), + panels.timeSeries.managementClustersStatsTimeseriesSettings('Alerts', 'none', queries.alerts.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 60, w: 24, h: 8 }), + ]), /* End of the 'Management cluster stats' row. */ + g.panel.row.new('Management cluster Etcd stats') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + +
g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ /* Management-cluster etcd panels; every query key in this row carries the mgmt_ prefix and every panel passes the management-cluster datasource UID. */ + panels.timeSeries.mgmt('Disk WAL Sync Duration', 's', queries.mgmt_disk_wal_sync_duration.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.mgmt('Disk Backend Sync Duration', 's', queries.mgmt_disk_backend_sync_duration.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 2, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('% DB Space Used', 'percent', queries.mgmt_percent_db_used.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 10, w: 8, h: 8 }), + panels.timeSeries.DBPanelsSettings('DB Left capacity (with fragmented space)', 'bytes', queries.mgmt_db_capacity_left.query(), 'PF55DCC5EC58ABF5A', { x: 8, y: 10, w: 8, h: 8 }), + panels.timeSeries.DBPanelsSettings('DB Size Limit (Backend-bytes)', 'bytes', queries.mgmt_db_size_limit.query(), 'PF55DCC5EC58ABF5A', { x: 16, y: 10, w: 8, h: 8 }), + panels.timeSeries.mgmt('DB Size', 'bytes', queries.mgmt_db_size.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.mgmt('gRPC network traffic', 'Bps', queries.mgmt_grpc_traffic.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Active Streams', '', queries.mgmt_active_streams.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 26, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Snapshot duration', 's', queries.mgmt_snapshot_duration.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 26, w: 12, h: 8 }), /* NOTE(review): the panels from here on restart at y: 1 although they are listed after panels placed at y: 26 - confirm the layout. */ + panels.timeSeries.DBPanelsSettings('Raft Proposals', '', queries.mgmt_raft_proposals.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 1, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Number of leader changes seen', '', queries.mgmt_num_leader_changes.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 1, w: 12, h: 8 }), + panels.stat.etcd_has_leader('Etcd has a leader?', '', queries.mgmt_etcd_has_leader.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 8, w: 6, h: 2 }), + panels.stat.mgmt_num_failed_proposals('Total number of failed proposals seen', '',
queries.mgmt_num_failed_proposals.query(), 'PF55DCC5EC58ABF5A', { x: 6, y: 8, w: 6, h: 2 }), + panels.timeSeries.DBPanelsSettings('Leader Elections Per Day', '', queries.mgmt_leader_elections_per_day.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 12, w: 12, h: 6 }), + panels.timeSeries.DBPanelsSettings('Keys', '', queries.mgmt_keys.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 12, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Slow Operations', 'ops', queries.mgmt_slow_operations.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 20, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Key Operations', 'ops', queries.mgmt_key_operations.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 20, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Heartbeat Failures', '', queries.mgmt_heartbeat_failures.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 28, w: 12, h: 8 }), + panels.timeSeries.DBPanelsSettings('Compacted Keys', '', queries.mgmt_compacted_keys.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 28, w: 12, h: 8 }), + ]), /* End of the 'Management cluster Etcd stats' row. */ + + g.panel.row.new('Hosted Clusters Serving Node stats - $namespace') /* This row is repeated once per selected value of the namespace variable (see withRepeat below). */ + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('namespace') + + g.panel.row.withPanels([ + panels.timeSeries.genericGraphLegendPanel('Serving Node CPU Basic', 'percent', queries.nodeCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Serving Node Memory', 'bytes', queries.nodeMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Suricata CPU(Running on Serving node)', 'percent', queries.suricataCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Suricata Memory(Running on Serving node)', 'bytes', queries.suricataMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), /* NOTE(review): the OneAgent panels below reuse the y: 18 positions of the Suricata panels - verify the layout. */ + panels.timeSeries.genericGraphLegendPanel('OneAgent CPU Usage', 'percent',
queries.dynaoneagentCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 18, w: 12, h: 8 }), /* The OneAgent CPU panel is fed by the oneagent query, matching the OneAgent Memory panel that follows. */ + panels.timeSeries.genericGraphLegendPanel('OneAgent Memory Usage', 'bytes', queries.dynaoneagentMem.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 18, w: 12, h: 8 }), + ]), + + g.panel.row.new('HostedControlPlane stats - $namespace') /* Control-plane row, repeated per selected namespace; its title is distinct from the serving-node row so the two repeated rows can be told apart. */ + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('namespace') + + g.panel.row.withPanels([ + panels.stat.hostedControlPlaneStats('Hosted Cluster Cloud Infrastructure', '', queries.infrastructure.query(), 'P1BA917A37525EDF3', { x: 0, y: 0, w: 8, h: 4 }), + panels.stat.hostedControlPlaneStats('Hosted Cluster Cloud Region', '', queries.region.query(), 'P1BA917A37525EDF3', { x: 8, y: 0, w: 8, h: 4 }), + panels.stat.hostedControlPlaneStats('Hosted Cluster OCP Version', '', queries.ocp_version.query(), 'P1BA917A37525EDF3', { x: 16, y: 0, w: 8, h: 4 }), + panels.timeSeries.genericGraphLegendPanel('Hosted Control Plane CPU', 'percent', queries.hostedControlPlaneCPU.query(), 'PF55DCC5EC58ABF5A', { x: 0, y: 12, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Hosted Control Plane Memory', 'bytes', queries.hostedControlPlaneMemory.query(), 'PF55DCC5EC58ABF5A', { x: 12, y: 12, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile', '', queries.request_duration_99th_quantile.query(), OBO, { x: 0, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request rate - by instance', '', queries.request_rate_by_instance.query(), OBO, { x: 8, y: 20, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('request duration - 99th quantile - by resource', '', queries.request_duration_99th_quantile_by_resource.query(), OBO, { x: 16, y: 20, w: 8, h: 8 }), /* The next panel plots request_rate_by_resource, so its title names a rate rather than a latency quantile. */ + panels.timeSeries.genericGraphLegendPanelRightSide('request rate - by resource', '', queries.request_rate_by_resource.query(), OBO, { x: 0, y:
30, w: 8, h: 8 }), /* API server panels: three per grid line (w: 8 each), with y advancing by 10 per line while each panel is h: 8. */ + panels.timeSeries.genericGraphLegendPanelList('request duration - read vs write', '', queries.request_duration_read_write.query(), OBO, { x: 8, y: 30, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('request rate - read vs write', '', queries.request_rate_read_write.query(), OBO, { x: 16, y: 30, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('requests dropped rate', '', queries.requests_dropped_rate.query(), OBO, { x: 0, y: 40, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('requests terminated rate', '', queries.requests_terminated_rate.query(), OBO, { x: 8, y: 40, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('requests status rate', '', queries.requests_status_rate.query(), OBO, { x: 16, y: 40, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('long running requests', '', queries.long_running_requests.query(), OBO, { x: 0, y: 50, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('requests in flight', '', queries.request_in_flight.query(), OBO, { x: 8, y: 50, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('p&f - requests rejected', '', queries.pf_requests_rejected.query(), OBO, { x: 16, y: 50, w: 8, h: 8 }), /* The query keys of the next panels are spelled "quartile" while the panel titles say "quantile"; the keys must match queries.libsonnet. */ + panels.timeSeries.genericGraphLegendPanelRightSide('response size - 99th quantile', '', queries.response_size_99th_quartile.query(), OBO, { x: 0, y: 60, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request queue length', '', queries.pf_request_queue_length.query(), OBO, { x: 8, y: 60, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request wait duration - 99th quantile', '', queries.pf_request_wait_duration_99th_quartile.query(), OBO, { x: 16, y: 60, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request execution duration', '', queries.pf_request_execution_duration.query(), OBO, { x: 0, y: 70, w: 8, h: 8 }), +
panels.timeSeries.genericGraphLegendPanelRightSide('p&f - request dispatch rate', '', queries.pf_request_dispatch_rate.query(), OBO, { x: 8, y: 70, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('p&f - concurrency limit by priority level', '', queries.pf_concurrency_limit.query(), OBO, { x: 16, y: 70, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelRightSide('p&f - pending in queue', '', queries.pf_pending_in_queue.query(), OBO, { x: 0, y: 80, w: 8, h: 8 }), + ]), + g.panel.row.new('Hosted Clusters ETCD General Resource Usage - $namespace') + + g.panel.row.withGridPos({ x: 0, y: 14, w: 24, h: 1 }) + + g.panel.row.withCollapsed(true) + + g.panel.row.withRepeat('namespace') + + g.panel.row.withPanels([ + panels.timeSeries.genericGraphLegendPanel('Disk WAL Sync Duration', 's', queries.disk_wal_sync_duration.query(), OBO, { x: 0, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('Disk Backend Sync Duration', 's', queries.disk_backend_sync_duration.query(), OBO, { x: 12, y: 2, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('% DB Space Used', 'percent', queries.percent_db_used.query(), OBO, { x: 0, y: 10, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('DB Left capacity (with fragmented space)', 'bytes', queries.db_capacity_left.query(), OBO, { x: 8, y: 10, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('DB Size Limit (Backend-bytes)', 'bytes', queries.db_size_limit.query(), OBO, { x: 16, y: 10, w: 8, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('DB Size', 'bytes', queries.db_size.query(), OBO, { x: 0, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanel('gRPC network traffic', 'Bps', queries.grpc_traffic.query(), OBO, { x: 12, y: 18, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Active Streams', '', queries.active_streams.query(), OBO, { x: 0, y: 26, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Snapshot duration', 's', 
queries.snapshot_duration.query(), OBO, { x: 12, y: 26, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Raft Proposals', '', queries.raft_proposals.query(), OBO, { x: 0, y: 34, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Number of leader changes seen', '', queries.num_leader_changes.query(), OBO, { x: 12, y: 34, w: 12, h: 8 }), + panels.stat.etcd_has_leader('Etcd has a leader?', '', queries.etcd_has_leader.query(), OBO, { x: 0, y: 42, w: 6, h: 2 }), + panels.stat.mgmt_num_failed_proposals('Total number of failed proposals seen', '', queries.num_failed_proposals.query(), OBO, { x: 6, y: 42, w: 6, h: 2 }), + panels.timeSeries.genericGraphLegendPanelList('Leader Elections Per Day', '', queries.leader_elections_per_day.query(), OBO, { x: 0, y: 44, w: 12, h: 6 }), + panels.timeSeries.genericGraphLegendPanelList('Keys', '', queries.keys.query(), OBO, { x: 12, y: 44, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Slow Operations', 'ops', queries.slow_operations.query(), OBO, { x: 0, y: 52, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Key Operations', 'ops', queries.key_operations.query(), OBO, { x: 12, y: 52, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Heartbeat Failures', '', queries.heartbeat_failures.query(), OBO, { x: 0, y: 60, w: 12, h: 8 }), + panels.timeSeries.genericGraphLegendPanelList('Compacted Keys', '', queries.compacted_keys.query(), OBO, { x: 12, y: 60, w: 12, h: 8 }), + ]), +]) From f39bfea4e39837666e5bd2b37ddf5ce0b18a63fe Mon Sep 17 00:00:00 2001 From: Manda-supraja26 Date: Tue, 26 Mar 2024 16:28:31 +0530 Subject: [PATCH 2/4] Updating README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a7c43c2..ee47eb6 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ The following templates are available: - [x] API Performance Dashboard. - [x] Cilium K8s Performance Dashboard. - [x] Etcd Dashboard. 
- - [ ] Hypershift Performance Dashboard. + - [x] Hypershift Performance Dashboard. - [x] K8s Performance Dashboard. - [ ] Kube Burner Dashboard. - [x] OpenShift Performance Dashboard. From c63bb0bd98412b994217f894c06228762c55ba9d Mon Sep 17 00:00:00 2001 From: Manda-supraja26 Date: Tue, 26 Mar 2024 16:51:46 +0530 Subject: [PATCH 3/4] Updating Readme file --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ee47eb6..2ea2abe 100644 --- a/README.md +++ b/README.md @@ -69,8 +69,7 @@ rm -rf bin rendered tmp templates/grafonnet-lib ## Templates available -The following templates are available: - +Dashboards Available after Migration to Grafonnet: - CPT - [x] Ingress Perf Dashboard. - [x] K8s Netperf Dashboard. From 6c615b56f0888ef8058801291d37fe99f2048438 Mon Sep 17 00:00:00 2001 From: Manda-supraja26 Date: Tue, 26 Mar 2024 17:26:37 +0530 Subject: [PATCH 4/4] Adding changes to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ea2abe..6770c23 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ rm -rf bin rendered tmp templates/grafonnet-lib ## Templates available -Dashboards Available after Migration to Grafonnet: +Dashboards Available after Migration to Grafonnet v10.1.0 (latest): - CPT - [x] Ingress Perf Dashboard. - [x] K8s Netperf Dashboard.