diff --git a/README.md b/README.md index 8089e5a0..36b18f3f 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,12 @@ Since the LNET metrics are currently exported based on the [DebugFS](https://doc To export that metrics running the exporter as Systemd service as `root` user, the default user option `prometheus` must be changed in the [service file](systemd/prometheus-lustre-exporter.service). +## Grafana Dashboards + +Example dashboards for [Grafana](https://grafana.com/) are provided under the [dashboards](dashboards) directroy. + +> It is highly recommended to also install the [Prometheus Node Exporter](https://github.com/prometheus/node_exporter) to get a complete view within the provided dashboards. + ## Troubleshooting In the event that you encounter issues with specific metrics (especially on versions of Lustre older than 2.12), please try disabling those specific troublesome metrics using the documented collector flags in the 'disabled' or 'core' state. Users have encountered bugs within Lustre where specific sysfs and procfs files miscommunicate their sizes, causing read calls to fail. diff --git a/dashboards/lustre_exporter_mds_overview.json b/dashboards/lustre_exporter_mds_overview.json new file mode 100644 index 00000000..80be0471 --- /dev/null +++ b/dashboards/lustre_exporter_mds_overview.json @@ -0,0 +1,1204 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "VAR_LUSTRE_PORT", + "type": "constant", + "label": "lustre_port", + "value": "9169", + "description": "" + }, + { + "name": "VAR_NODE_PORT", + "type": "constant", + "label": "node_port", + "value": "9100", + "description": "" + }, + { + "name": "VAR_NODE_JOB", + "type": "constant", + "label": "node_job", + "value": "node", + "description": "" + }, + { + "name": "VAR_LUSTRE_JOB", + "type": "constant", + "label": "lustre_job", + "value": "lustre", + "description": "" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1714121860436, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "panels": [], + "title": "CPU Load", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "CPU load average 1 minute with resolution for Lustre Hebe Meta Data Server", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "avg(node_load1{instance=~\"$host:$node_port\",job=~\"$node_job\"}) by (instance) / count(node_cpu_scaling_frequency_hertz{instance=~\"$host:$node_port\",job=~\"$node_job\"}) by (instance) * 100", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "avg(node_load1{instance=~\"$host:$node_port\",job=~\"$node_job\"}) by (instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU Load Average (1m resolution)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:142", + "format": "percent", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:143", + "format": "percent", + "label": "", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 13, + "panels": [], + "title": "Metadata Operations", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum(rate(lustre_stats_total{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"mdt\"}[120s])) by (target)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{target}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Metadata Operations per MDT", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:66", + "format": "ops", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:67", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 17, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "100 - lustre_inodes_free{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"mdt\"} / lustre_inodes_maximum{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"mdt\"} * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-[{{target}}]", + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "I-Nodes Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:502", + "format": "percent", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:503", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "100 - lustre_available_kilobytes{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"mdt\"} / lustre_capacity_kilobytes{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"mdt\"} * 100", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}-[{{target}}]", + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:644", + "format": "percent", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:645", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Disk Usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 15, + "panels": [ + { + "aliasColors": { + "Recv_bytes_eth2": "#7EB26D", + "Recv_bytes_lo": "#0A50A1", + "Recv_drop_eth2": "#6ED0E0", + "Recv_drop_lo": "#E0F9D7", + "Recv_errs_eth2": "#BF1B00", + "Recv_errs_lo": "#CCA300", + "Trans_bytes_eth2": "#7EB26D", + "Trans_bytes_lo": "#0A50A1", + "Trans_drop_eth2": "#6ED0E0", + "Trans_drop_lo": "#E0F9D7", + "Trans_errs_eth2": "#BF1B00", + "Trans_errs_lo": "#CCA300", + "recv_bytes_lo": "#0A50A1", + "recv_drop_eth0": "#99440A", + "recv_drop_lo": "#967302", + "recv_errs_eth0": "#BF1B00", + "recv_errs_lo": "#890F02", + "trans_bytes_eth0": "#7EB26D", + "trans_bytes_lo": "#0A50A1", + "trans_drop_eth0": "#99440A", + "trans_drop_lo": "#967302", + "trans_errs_eth0": "#BF1B00", + "trans_errs_lo": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Received InfiniBand Network Traffic aggregated over all available IB interfaces (e.g. ib0/ib1)", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 6, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "total", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(irate(node_infiniband_port_data_received_bytes_total{instance=~\"$host:$node_port\",job=~\"$node_job\"}[1m])) by (instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Received InfiniBand Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:724", + "format": "decbytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:725", + "format": "pps", + "label": "", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "Recv_bytes_eth2": "#7EB26D", + "Recv_bytes_lo": "#0A50A1", + "Recv_drop_eth2": "#6ED0E0", + "Recv_drop_lo": "#E0F9D7", + "Recv_errs_eth2": "#BF1B00", + "Recv_errs_lo": "#CCA300", + "Trans_bytes_eth2": "#7EB26D", + "Trans_bytes_lo": "#0A50A1", + "Trans_drop_eth2": "#6ED0E0", + "Trans_drop_lo": "#E0F9D7", + "Trans_errs_eth2": "#BF1B00", + "Trans_errs_lo": "#CCA300", + "recv_bytes_lo": "#0A50A1", + "recv_drop_eth0": "#99440A", + "recv_drop_lo": "#967302", + "recv_errs_eth0": "#BF1B00", + "recv_errs_lo": "#890F02", + "trans_bytes_eth0": "#7EB26D", + "trans_bytes_lo": "#0A50A1", + "trans_drop_eth0": "#99440A", + "trans_drop_lo": "#967302", + "trans_errs_eth0": "#BF1B00", + "trans_errs_lo": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Transmitted InfiniBand Network Traffic aggregated over all available IB interfaces (e.g. ib0/ib1)", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 31 + }, + "hiddenSeries": false, + "id": 7, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "total", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$host:$node_port\",job=~\"$node_job\"}[1m])) by (instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Transmitted InfiniBand Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:796", + "format": "decbytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:797", + "format": "pps", + "label": "", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 20, + "panels": [], + "title": "Status", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "min", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "sortBy": "Min", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "lustre_health_check{instance=~\"$host:$lustre_port\",job=\"$lustre_job\"}", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Healthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Count of LNET errors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(lustre_errors_total{instance=~\"$host:$lustre_port\"}[1m])", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "LNET Errors", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "lustre_changelog_current_index", + "description": "", + "hide": 0, + "includeAll": false, + "label": "Filesystem", + "multi": false, + "name": "fs_name", + "options": [], + "query": { + "query": "lustre_changelog_current_index", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/target=\"([\\w]+)\\-MDT/", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(lustre_changelog_current_index{target=~\"$fs_name-MDT.*\"}, instance)", + "description": "Target Host", + "hide": 0, + "includeAll": true, + "label": "Host", + "multi": true, + "name": "host", + "options": [], + "query": { + "query": "label_values(lustre_changelog_current_index{target=~\"$fs_name-MDT.*\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/([^:]+):/", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "description": "Port Number of Lustre Exporter", + "hide": 2, + "name": "lustre_port", + "query": "${VAR_LUSTRE_PORT}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_LUSTRE_PORT}", + "text": "${VAR_LUSTRE_PORT}", + "selected": false + }, + "options": [ + { + "value": "${VAR_LUSTRE_PORT}", + "text": "${VAR_LUSTRE_PORT}", + "selected": false + } + ] + }, + { + "description": "Port Number of Node Exporter", + "hide": 2, + "name": "node_port", + "query": "${VAR_NODE_PORT}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_PORT}", + "text": "${VAR_NODE_PORT}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_PORT}", + "text": "${VAR_NODE_PORT}", + "selected": false + } + ] + }, + { + "description": "Job Name of Node Exporter", + "hide": 2, + "name": "node_job", + "query": "${VAR_NODE_JOB}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_JOB}", + "text": "${VAR_NODE_JOB}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_JOB}", + "text": "${VAR_NODE_JOB}", + "selected": false + } + ] + }, + { + "description": "Job Name of Lustre Exporter", + "hide": 2, + "name": "lustre_job", + "query": "${VAR_LUSTRE_JOB}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_LUSTRE_JOB}", + "text": "${VAR_LUSTRE_JOB}", + "selected": false + }, + "options": [ + { + "value": "${VAR_LUSTRE_JOB}", + "text": "${VAR_LUSTRE_JOB}", + "selected": false + } + ] + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Lustre Exporter MDS Overview", + "uid": "aClpf4vWkGENERALfgdfgdsrer", + "version": 9, + "weekStart": "" +} \ No newline at end of file diff --git a/dashboards/lustre_exporter_oss_detailed.json b/dashboards/lustre_exporter_oss_detailed.json new file mode 100644 index 00000000..19b96af7 --- /dev/null +++ b/dashboards/lustre_exporter_oss_detailed.json @@ -0,0 +1,1682 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "VAR_LUSTRE_PORT", + "type": "constant", + "label": "Lustre-Port", + "value": "9169", + "description": "" + }, + { + "name": "VAR_NODE_PORT", + "type": "constant", + "label": "Node-Port", + "value": "9100", + "description": "" + }, + { + "name": "VAR_NODE_JOB", + "type": "constant", + "label": "Node-Job", + "value": "node", + "description": "" + }, + { + "name": "VAR_LUSTRE_JOB", + "type": "constant", + "label": "Lustre-Job", + "value": "lustre", + "description": "" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1714122074789, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 17, + "panels": [], + "title": "CPU Utilization", + "type": "row" + }, + { + "aliasColors": { + "Busy": "#EAB839", + "Busy Iowait": "#890F02", + "Busy other": "#1F78C1", + "Idle": "#052B51", + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "decimals": 2, + "description": "Basic CPU info", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 250, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "8.5.5", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Busy Iowait", + "color": "#890F02" + }, + { + "alias": "Idle", + "color": "#7EB26D" + }, + { + "alias": "Busy System", + "color": "#EAB839" + }, + { + "alias": "Busy User", + "color": "#0A437C" + }, + { + "alias": "Busy Other", + "color": "#6D1F62" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=\"system\",instance=~\"$host:$node_port\",job=~\"$node_job\"}[5m])) * 100", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Busy System", + "refId": "B", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='user',instance=~\"$host:$node_port\",job=~\"$node_job\"}[5m])) * 100", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Busy User", + "refId": "D", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='iowait',instance=~\"$host:$node_port\",job=~\"$node_job\"}[5m])) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Busy Iowait", + "refId": "E", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=~\".*irq\",instance=~\"$host:$node_port\",job=~\"$node_job\"}[5m])) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Busy IRQs", + "refId": "F", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "sum (irate(node_cpu_seconds_total{mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq',instance=~\"$host:$node_port\",job=~\"$node_job\"}[5m])) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Busy Other", + "refId": "A", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=~\"$host:$node_port\",job=~\"$node_job\"}[5m])) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Idle", + "refId": "C", + "step": 240, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1402", + "format": "short", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1403", + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 19, + "panels": [], + "title": "OST Performance", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 9, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "total", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "rate(lustre_read_bytes_total{instance=\"$host:$lustre_port\",job=\"$lustre_job\"}[1m])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{target}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Read Throughput per OST", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1474", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1475", + "format": "decbytes", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 10, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "total", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(lustre_write_bytes_total{instance=\"$host:$lustre_port\",job=\"$lustre_job\"}[1m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{target}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Write Throughput per OST", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1832", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1833", + "format": "decbytes", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 29, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 20 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "100 - ((lustre_available_kilobytes{instance=\"$host:$lustre_port\",job=\"$lustre_job\",component=\"ost\"} / lustre_capacity_kilobytes{instance=\"$host:$lustre_port\",job=\"$lustre_job\",component=\"ost\"}) * 100)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{target}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1906", + "format": "percent", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1907", + "format": "none", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "100 - lustre_inodes_free{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"ost\"} / lustre_inodes_maximum{instance=~\"$host:$lustre_port\",job=\"$lustre_job\", component=\"ost\"} * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{target}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "I-Node Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1906", + "format": "percent", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1907", + "format": "none", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Disk Usage", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 23, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Top read jobs aggregated over all OSTs with throughput > 10MB/s", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 13, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 0.5, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(1, sum(irate(lustre_job_read_bytes_total{component=\"ost\",instance=\"$host:$lustre_port\",job=\"$lustre_job\"}[1m])>10000000) by (jobid))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{jobid}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Top Read Jobs (Throughput > 10MB/s)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1984", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1985", + "format": "decbytes", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Top write jobs aggregated over all OSTs with throughput > 10MB/s", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 15, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 0.5, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(1, sum(irate(lustre_job_write_bytes_total{component=\"ost\",instance=\"$host:$lustre_port\",job=\"$lustre_job\"}[1m])>10000000) by (jobid))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{jobid}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Top Write Jobs (Throughput > 10MB/s)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2056", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2057", + "format": "decbytes", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Jobstats Overview", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 27, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Count of processes involved in IO operations on the file server", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 25, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 0.5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(irate(lustre_job_read_bytes_total{component=\"ost\",instance=\"$host:$lustre_port\",job=\"$lustre_job\"}[1m]) > 0)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "count(irate(lustre_job_write_bytes_total{component=\"ost\",instance=\"$host:$lustre_port\",job=\"$lustre_job\"}[1m]) > 0)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Process Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2134", + "format": "none", + "label": "", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2135", + "format": "decbytes", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Process Count", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 21, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "RDMA InfiniBand Bandwith over all NIC", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 7, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(irate(node_infiniband_port_data_received_bytes_total{instance=~\"$host:$node_port\",job=\"$node_job\"}[1m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Received", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + }, + { + "exemplar": true, + "expr": "sum(irate(node_infiniband_port_data_transmitted_bytes_total{instance=~\"$host:$node_port\",job=\"$node_job\"}[1m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Transmitted", + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "InfiniBand RDMA Bandwith", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2210", + "format": "decbytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2211", + "format": "kbytes", + "label": "gxfdgdf", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 34, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Binary indicator as to whether or not the pool is degraded - 0 for not degraded, 1 for degraded", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "lustre_degraded{instance=\"$host:$lustre_port\", job=\"$lustre_job\"}", + "legendFormat": "{{target}}", + "range": true, + "refId": "A" + } + ], + "title": "Degraded", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "min", + "lastNotNull" + ], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "lustre_health_check{instance=~\"$host:$lustre_port\",job=\"$lustre_job\"}", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Healthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Count of LNET errors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(lustre_errors_total{instance=\"$host:$lustre_port\", job=\"$lustre_job\"}[1m])", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "LNET Errors", + "type": "timeseries" + } + ], + "title": "Status", + "type": "row" + } + ], + "refresh": "", + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "lustre_exports_total{component=\"ost\"}", + "hide": 0, + "includeAll": false, + "label": "Filesystem", + "multi": false, + "name": "fs_name", + "options": [], + "query": { + "query": "lustre_exports_total{component=\"ost\"}", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/target=\"([\\w]+)\\-OST/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(lustre_exports_total{target=~\"$fs_name-OST.*\"}, instance)", + "hide": 0, + "includeAll": false, + "label": "Host", + "multi": false, + "name": "host", + "options": [], + "query": { + "query": "label_values(lustre_exports_total{target=~\"$fs_name-OST.*\"}, instance)", + "refId": "Prometheus-host-Variable-Query" + }, + "refresh": 1, + "regex": "/([^:]+):/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "hide": 2, + "label": "Lustre-Port", + "name": "lustre_port", + "query": "${VAR_LUSTRE_PORT}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_LUSTRE_PORT}", + "text": "${VAR_LUSTRE_PORT}", + "selected": false + }, + "options": [ + { + "value": "${VAR_LUSTRE_PORT}", + "text": "${VAR_LUSTRE_PORT}", + "selected": false + } + ] + }, + { + "hide": 2, + "label": "Node-Port", + "name": "node_port", + "query": "${VAR_NODE_PORT}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_PORT}", + "text": "${VAR_NODE_PORT}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_PORT}", + "text": "${VAR_NODE_PORT}", + "selected": false + } + ] + }, + { + "hide": 2, + "label": "Node-Job", + "name": "node_job", + "query": "${VAR_NODE_JOB}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_JOB}", + "text": "${VAR_NODE_JOB}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_JOB}", + "text": "${VAR_NODE_JOB}", + "selected": false + } + ] + }, + { + "hide": 2, + "label": "Lustre-Job", + "name": "lustre_job", + "query": "${VAR_LUSTRE_JOB}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_LUSTRE_JOB}", + "text": "${VAR_LUSTRE_JOB}", + "selected": false + }, + "options": [ + { + "value": "${VAR_LUSTRE_JOB}", + "text": "${VAR_LUSTRE_JOB}", + "selected": false + } + ] + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Lustre Exporter OSS Detailed", + "uid": "yD1c1QpZz", + "version": 15, + "weekStart": "" +} \ No newline at end of file diff --git a/dashboards/lustre_exporter_oss_overview.json b/dashboards/lustre_exporter_oss_overview.json new file mode 100644 index 00000000..4675d4c4 --- /dev/null +++ b/dashboards/lustre_exporter_oss_overview.json @@ -0,0 +1,1262 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "VAR_NODE_PORT", + "type": "constant", + "label": "node_port", + "value": "9100", + "description": "" + }, + { + "name": "VAR_LUSTRE_PORT", + "type": "constant", + "label": "lustre_port", + "value": "9169", + "description": "" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1714122073362, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "General", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "CPU load average 1 minute with resolution for Lustre Hebe Object Store Server", + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "node_load1{group=\"$group\", instance=~\"$hosts:$node_port\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU Load Average (1m resolution)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1226", + "format": "short", + "label": "Load", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1227", + "format": "percent", + "label": "", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Selected File Server InfiniBand Bandwith with RDMA over all lanes on the NIC.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 7, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(irate(node_infiniband_port_data_transmitted_bytes_total{group=\"$group\", instance=~\"$hosts:$node_port\"}[1m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(irate(node_infiniband_port_data_received_bytes_total{group=\"$group\", instance=~\"$hosts:$node_port\"}[1m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network Bandwith", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1300", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:1301", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 11, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Transmitted File Server InfiniBand Bandwith with RDMA over all lanes on the NIC.", + "fill": 5, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 6, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(irate(node_infiniband_port_data_transmitted_bytes_total{group=\"$group\", instance=~\"$hosts:$node_port\"}[1m])) by (instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Read Bandwith", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Received File Server InfiniBand Bandwith with RDMA over all lanes on the NIC.", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 5, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(irate(node_infiniband_port_data_received_bytes_total{group=\"$group\", instance=~\"$hosts:$node_port\"}[1m])) by (instance)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Write Bandwith", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Distributed Network Bandwith", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 17, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(100 - ((lustre_available_kilobytes{group=\"$group\",instance=~\"$hosts:$lustre_port\",component=\"ost\"} / lustre_capacity_kilobytes{group=\"$group\",instance=~\"$hosts:$lustre_port\",component=\"ost\"}) * 100)) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Disk Space Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(100 - ((lustre_inodes_free{group=\"$group\",instance=~\"$hosts:$lustre_port\", component=\"ost\"} / lustre_inodes_maximum{group=\"$group\",instance=~\"$hosts:$lustre_port\",component=\"ost\"}) * 100)) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "I-Node Usage", + "type": "timeseries" + } + ], + "title": "Disk Usage", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 13, + "panels": [], + "title": "Status", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "max(lustre_degraded{group=\"$group\", instance=~\"$hosts:$lustre_port\"}) by (instance)", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Degraded", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "min", + "last" + ], + "displayMode": "table", + "placement": "right", + "sortBy": "Min", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "lustre_health_check{group=\"$group\", instance=~\"$hosts:$lustre_port\"}", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Healthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Count of LNET errors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(lustre_errors_total{group=\"$group\", instance=~\"$hosts:$lustre_port\"}[1m])", + "legendFormat": "{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "LNET Errors", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "hebe_oss", + "value": "hebe_oss" + }, + "hide": 0, + "label": "OSS-Group", + "name": "group", + "options": [ + { + "selected": true, + "text": "hebe_oss", + "value": "hebe_oss" + }, + { + "selected": false, + "text": "hebetest_oss", + "value": "hebetest_oss" + } + ], + "query": "hebe_oss, hebetest_oss", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "query_result(min(max_over_time(node_load1{group=\"$group\"}[$time]) >= $load) by (instance))", + "hide": 0, + "includeAll": true, + "label": "Hosts", + "multi": true, + "name": "hosts", + "options": [], + "query": { + "query": "query_result(min(max_over_time(node_load1{group=\"$group\"}[$time]) >= $load) by (instance))", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/^{instance=\"(.+):$node_port\"}/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "0", + "value": "0" + }, + "hide": 0, + "label": "Max Load Over Time >=", + "name": "load", + "options": [ + { + "selected": true, + "text": "0", + "value": "0" + } + ], + "query": "0", + "skipUrlSync": false, + "type": "textbox" + }, + { + "auto": true, + "auto_count": 1, + "auto_min": "5m", + "current": { + "selected": false, + "text": "auto", + "value": "$__auto_interval_time" + }, + "hide": 2, + "label": "Time", + "name": "time", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_time" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "description": "Port of the Node Exporter", + "hide": 2, + "name": "node_port", + "query": "${VAR_NODE_PORT}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_PORT}", + "text": "${VAR_NODE_PORT}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_PORT}", + "text": "${VAR_NODE_PORT}", + "selected": false + } + ] + }, + { + "description": "Port of the Lustre Exporter", + "hide": 2, + "name": "lustre_port", + "query": "${VAR_LUSTRE_PORT}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_LUSTRE_PORT}", + "text": "${VAR_LUSTRE_PORT}", + "selected": false + }, + "options": [ + { + "value": "${VAR_LUSTRE_PORT}", + "text": "${VAR_LUSTRE_PORT}", + "selected": false + } + ] + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Lustre Exporter OSS Overview", + "uid": "7t2-GNdWkGENERAL5gdffdg", + "version": 7, + "weekStart": "" +} \ No newline at end of file diff --git a/dashboards/lustre_jobstats_detailed.json b/dashboards/lustre_jobstats_detailed.json new file mode 100644 index 00000000..22e19470 --- /dev/null +++ b/dashboards/lustre_jobstats_detailed.json @@ -0,0 +1,736 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.5.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1714122077560, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "sortBy": "Total", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(irate(lustre_job_stats_total{component=\"mdt\", jobid=~\"$jobid\"}[1m])) by (target, operation) > 0", + "legendFormat": "{{target}}-{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "sortBy": "Total", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(lustre_job_stats_total{component=\"mdt\", jobid=~\"$jobid\"}) by (target, operation)", + "legendFormat": "{{target}}-{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Count", + "type": "timeseries" + } + ], + "title": "Metadata Operations", + "type": "row" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 7, + "panels": [], + "title": "Throughput", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 2, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(lustre_job_read_bytes_total{component=~\"ost\",jobid=~\"$jobid\"}[1m])) by (instance, target) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}} - {{target}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Read Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:270", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:271", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 3, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(lustre_job_write_bytes_total{component=~\"ost\",jobid=~\"$jobid\"}[1m])) by (instance, target) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}} - {{target}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Write Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:81", + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:82", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": "prometheus", + "uid": "yKpIni7nk" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 5, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 9, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "lustre_job_read_bytes_total{component=~\"ost\",jobid=~\"$jobid\"} > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}} - {{target}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Read Total Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 8, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.5", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": true, + "targets": [ + { + "expr": "lustre_job_write_bytes_total{component=~\"ost\",jobid=~\"$jobid\"} > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}} - {{target}}", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + } + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Write Total Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "title": "Total Bytes", + "type": "row" + } + ], + "refresh": false, + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "label": "JobID", + "name": "jobid", + "options": [ + { + "selected": true, + "text": "cmake.7463", + "value": "cmake.7463" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Lustre Jobstats Detailed", + "uid": "xYbBVmbWk", + "version": 4, + "weekStart": "" +} \ No newline at end of file