-
Notifications
You must be signed in to change notification settings - Fork 13
feat: add observability tooling for Flashbox images #93
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1b86700
b920e85
b0557e3
0c9121c
2468d13
8d00e76
5431bd6
0187cad
5ee980b
0681f8d
5961c8e
4d23a65
0246c7b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| process_names: | ||
| # Monitor the searcher container (conmon + all children via --children flag) | ||
| - name: "searcher-container" | ||
| cmdline: | ||
| - 'conmon.*searcher-container' | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Recording rules that derive the aggregated flashbox:* series
rule_files:
  - /etc/prometheus/recording_rules.yml

# Scrape configurations
scrape_configs:
  # Node exporter on localhost
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']
    metric_relabel_configs:
      # Applied at scrape time: only the node_cpu/memory/disk/filesystem/network
      # metric families are stored locally, every other node_* series is dropped.
      - source_labels: [__name__]
        regex: 'node_(cpu|memory|disk|filesystem|network)_.*'
        action: keep

  # Process exporter for container monitoring
  - job_name: 'process'
    static_configs:
      - targets: ['localhost:9256']

{{- $config := (datasource "config") }}
{{- if $config.remote_write_flashbots_url }}

# Remote write configuration (rendered only when a URL is present in the config datasource)
remote_write:
  # Flashbots endpoint.
  # Templated values are passed through data.ToJSON so they are emitted as
  # double-quoted scalars. A JSON string is a valid YAML string, so a URL or
  # password containing '#', ': ', quotes or a leading '*' can no longer break
  # or silently change the rendered file.
  - url: {{ $config.remote_write_flashbots_url | data.ToJSON }}
    write_relabel_configs:
      # Only send aggregated metrics
      - source_labels: [__name__]
        regex: 'flashbox:.*'
        action: keep
    {{- if $config.remote_write_flashbots_auth }}
    basic_auth:
      username: {{ $config.remote_write_flashbots_username | data.ToJSON }}
      password: {{ $config.remote_write_flashbots_password | data.ToJSON }}
    {{- end }}
{{- end }}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,49 @@ | ||
groups:
  - name: flashbox_aggregated_metrics
    interval: 30s  # How often to evaluate rules
    rules:
      # CPU aggregated metrics
      - record: flashbox:cpu_usage_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      - record: flashbox:cpu_usage_percent_by_mode
        expr: avg(rate(node_cpu_seconds_total[5m])) by (mode) * 100

      # Memory aggregated metrics
      - record: flashbox:memory_usage_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      - record: flashbox:memory_available_gb
        expr: node_memory_MemAvailable_bytes / 1024 / 1024 / 1024

      # Disk aggregated metrics - both root and persistent
      # Root filesystem (always available)
      - record: flashbox:disk_usage_percent_root
        expr: 100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)

      # Persistent storage (available after mount) - returns no data if not mounted
      - record: flashbox:disk_usage_percent_persistent
        expr: 100 - (node_filesystem_avail_bytes{mountpoint="/persistent"} / node_filesystem_size_bytes{mountpoint="/persistent"} * 100)

      # Disk throughput, one series per block device (not summed)
      - record: flashbox:disk_io_read_mb_per_sec
        expr: rate(node_disk_read_bytes_total[5m]) / 1024 / 1024

      - record: flashbox:disk_io_write_mb_per_sec
        expr: rate(node_disk_written_bytes_total[5m]) / 1024 / 1024

      # Container health metrics (using process exporter)
      # Process count of the matching name group multiplied by the exporter's
      # own `up` value. The per-group series is on the "many" side of the join,
      # so the rule keeps working (and keeps the groupname label) if more than
      # one group ever matches the selector. No extra labels are copied from
      # `up`, because it has none that the process-exporter series lacks.
      - record: flashbox:container_alive
        expr: namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"} * on(instance) group_left up{job="process"}

      - record: flashbox:container_cpu_percent
        expr: rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m]) * 100

      - record: flashbox:container_memory_mb
        expr: namedprocess_namegroup_memory_bytes{groupname=~".*searcher-container.*"} / 1024 / 1024

      # Network metrics (only counters summed over all interfaces, no detailed info)
      - record: flashbox:network_receive_mb_total
        expr: sum(node_network_receive_bytes_total) / 1024 / 1024

      - record: flashbox:network_transmit_mb_total
        expr: sum(node_network_transmit_bytes_total) / 1024 / 1024
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| [Unit] | ||
| Description=Prometheus Node Exporter | ||
| Documentation=https://github.com/prometheus/node_exporter | ||
| After=network-online.target | ||
| Wants=network-online.target | ||
|
|
||
| [Service] | ||
| Type=simple | ||
| User=prometheus | ||
| Group=prometheus | ||
| ExecStart=/usr/bin/prometheus-node-exporter \ | ||
| --web.listen-address=127.0.0.1:9100 \ | ||
| --collector.cpu \ | ||
| --collector.meminfo \ | ||
| --collector.diskstats \ | ||
| --collector.filesystem \ | ||
| --collector.netdev \ | ||
| --collector.loadavg \ | ||
| --no-collector.arp \ | ||
| --no-collector.bcache \ | ||
| --no-collector.bonding \ | ||
| --no-collector.conntrack \ | ||
| --no-collector.cpufreq \ | ||
| --no-collector.edac \ | ||
| --no-collector.entropy \ | ||
| --no-collector.filefd \ | ||
| --no-collector.hwmon \ | ||
| --no-collector.infiniband \ | ||
| --no-collector.ipvs \ | ||
| --no-collector.mdadm \ | ||
| --no-collector.netclass \ | ||
| --no-collector.netstat \ | ||
| --no-collector.nfs \ | ||
| --no-collector.nfsd \ | ||
| --no-collector.pressure \ | ||
| --no-collector.rapl \ | ||
| --no-collector.schedstat \ | ||
| --no-collector.sockstat \ | ||
| --no-collector.softnet \ | ||
| --no-collector.stat \ | ||
| --no-collector.textfile \ | ||
| --no-collector.thermal_zone \ | ||
| --no-collector.time \ | ||
| --no-collector.timex \ | ||
| --no-collector.udp_queues \ | ||
| --no-collector.uname \ | ||
| --no-collector.vmstat \ | ||
| --no-collector.xfs \ | ||
| --no-collector.zfs \ | ||
| --no-collector.systemd \ | ||
| --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/) | ||
| Restart=on-failure | ||
| RestartSec=5s | ||
|
|
||
| [Install] | ||
| WantedBy=minimal.target |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| [Unit] | ||
| Description=Prometheus Process Exporter | ||
| Documentation=https://github.com/ncabatoff/process-exporter | ||
| After=network-online.target searcher-container.service | ||
| Wants=network-online.target | ||
|
|
||
| [Service] | ||
| Type=simple | ||
| User=prometheus | ||
| Group=prometheus | ||
| ExecStart=/usr/bin/prometheus-process-exporter \ | ||
| --web.listen-address=127.0.0.1:9256 \ | ||
| --config.path=/etc/prometheus/process-exporter.yml \ | ||
| --children | ||
| Restart=on-failure | ||
| RestartSec=5s | ||
|
|
||
| [Install] | ||
| WantedBy=minimal.target |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| [Unit] | ||
| Description=Prometheus Monitoring System | ||
| Documentation=https://prometheus.io/docs/introduction/overview/ | ||
| After=network-online.target fetch-config.service | ||
| Wants=network-online.target | ||
| Requires=fetch-config.service | ||
|
|
||
| [Service] | ||
| Type=simple | ||
| User=prometheus | ||
| Group=prometheus | ||
| ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json | ||
| ExecStart=/usr/bin/prometheus \ | ||
| --config.file=/etc/prometheus/prometheus.yml \ | ||
| --storage.tsdb.path=/var/lib/prometheus/ \ | ||
| --storage.tsdb.retention.time=24h \ | ||
| --web.console.templates=/usr/share/prometheus/consoles \ | ||
| --web.console.libraries=/usr/share/prometheus/console_libraries \ | ||
| --web.listen-address=127.0.0.1:9090 | ||
| ExecReload=/bin/kill -HUP $MAINPID | ||
| Restart=on-failure | ||
| RestartSec=5s | ||
|
|
||
| [Install] | ||
| WantedBy=minimal.target |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,157 @@ | ||
| #!/bin/bash | ||
| set -eu -o pipefail | ||
|
|
||
| # Common configuration fetching script for FlashBox (bob-l1 and bob-l2) | ||
| # This script provides shared functionality for configuration management | ||
| # Project-specific configuration should be done via /etc/bob/dynamic-config.sh | ||
|
|
||
| CONFIG_PATH=/etc/bob/config.env | ||
| OBSERVABILITY_CONFIG_PATH=/etc/flashbox/observability-config.json | ||
|
|
||
| # Don't override if config already exists | ||
| if [ -f "$CONFIG_PATH" ]; then | ||
| echo "Config already exists at $CONFIG_PATH, skipping" | ||
| exit 0 | ||
| fi | ||
|
|
||
| # Helper functions | ||
# Fetch one custom instance attribute from the metadata server.
#   $1 - attribute name (appended to the instance/attributes/ path)
# Prints the attribute value on stdout.
# Returns curl's non-zero exit status and prints nothing on stdout when the
# request fails or the server answers with an HTTP error (for example a
# missing attribute). Without --fail the HTTP error page itself would be
# returned as the "value" and end up in URLs built by the caller.
fetch_metadata_value() {
  curl -sS --fail \
    --header "Metadata-Flavor: Google" \
    "http://metadata/computeMetadata/v1/instance/attributes/$1"
}
|
Comment on lines
+18
to
+22
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would not work for Flashbox L1 images on Azure.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As I recall, we are dropping Azure support for the first iteration. If that is not the case, we can revisit this and add Azure support as well. |
||
|
|
||
# Print every dotted-quad IPv4 literal found in the given text, one per line.
#   $1 - text to scan (typically one or more URIs)
# When nothing matches, a single empty line is printed and the function
# still returns success.
get_ips_from_uris() {
  local ipv4_pattern='[0-9]{1,3}(\.[0-9]{1,3}){3}'

  if ! grep -oE "$ipv4_pattern" <<< "$1"; then
    echo ""
  fi
}
|
|
||
# Persist the metrics remote-write settings in both on-disk formats.
#   $1 - remote-write URL (may be empty)
#   $2 - basic-auth username (may be empty)
#   $3 - basic-auth password (may be empty)
# Side effects:
#   - appends four KEY='value' lines to $CONFIG_PATH
#   - (re)creates $OBSERVABILITY_CONFIG_PATH as a JSON document, creating its
#     parent directory if needed
write_observability_config() {
  local metrics_flashbots_url="$1"
  local metrics_flashbots_username="$2"
  local metrics_flashbots_password="$3"

  # Extract IP for firewall rules: first IPv4 literal found in the URL.
  # Stays empty when the URL is empty or contains no IPv4 literal.
  local metrics_endpoint=""

  if [ -n "$metrics_flashbots_url" ]; then
    metrics_endpoint=$(get_ips_from_uris "$metrics_flashbots_url" | head -1)
  fi

  # Append observability config to main config.
  # NOTE(review): values are wrapped in single quotes verbatim, so a value that
  # itself contains a single quote would end the quoting early. The consumers
  # of this file are not visible here - confirm before changing the format.
  cat <<EOF >> "$CONFIG_PATH"
CONFIG_METRICS_FLASHBOTS_URL='${metrics_flashbots_url}'
CONFIG_METRICS_FLASHBOTS_USERNAME='${metrics_flashbots_username}'
CONFIG_METRICS_FLASHBOTS_PASSWORD='${metrics_flashbots_password}'
METRICS_ENDPOINT='${metrics_endpoint}'
EOF

  # Create observability config for Prometheus (always needed for gomplate templating).
  # jq builds the document so that quotes, backslashes or control characters in
  # any value are escaped correctly. The auth flag keeps its original encoding:
  # the string "true" when a username is set, the empty string otherwise.
  mkdir -p "$(dirname "$OBSERVABILITY_CONFIG_PATH")"
  jq -n \
    --arg url "$metrics_flashbots_url" \
    --arg username "$metrics_flashbots_username" \
    --arg password "$metrics_flashbots_password" \
    '{
      remote_write_flashbots_url: $url,
      remote_write_flashbots_username: $username,
      remote_write_flashbots_password: $password,
      remote_write_flashbots_auth: (if $username != "" then "true" else "" end)
    }' > "$OBSERVABILITY_CONFIG_PATH"
  echo "Observability configuration written to $OBSERVABILITY_CONFIG_PATH"
}
|
|
||
# Check for local QEMU development environment: the DMI manufacturer string
# must contain "QEMU" and the serial-console unit file must be present.
if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \
   [ -f /etc/systemd/system/serial-console.service ]; then
  echo "Running in local QEMU dev image, using default test values"

  # Get default gateway (host in QEMU user-mode networking).
  # Only lines whose first field is "default" are considered and only the first
  # of them is used, so GATEWAY is always a single value even when several
  # default routes exist. awk reads its whole input (no early exit), so the
  # pipeline cannot fail under pipefail because of a closed pipe.
  GATEWAY=$(ip route | awk '$1 == "default" && !seen++ {print $3}')
  if [ -z "$GATEWAY" ]; then
    echo "Warning: Could not detect gateway, falling back to 10.0.2.2"
    GATEWAY="10.0.2.2"
  fi

  # Export gateway for custom script
  export GATEWAY

  # Call project-specific configuration if it exists (optional in the dev flow)
  if [ -x /etc/bob/dynamic-config.sh ]; then
    echo "Running project-specific configuration..."
    /etc/bob/dynamic-config.sh qemu "$CONFIG_PATH"
  else
    echo "Warning: No project-specific configuration found at /etc/bob/dynamic-config.sh"
  fi

  # Add empty observability config for local dev
  write_observability_config "" "" ""

  exit 0
fi
|
|
||
# Production configuration using Vault
echo "Fetching configuration from Vault..."

# Get instance metadata
instance_name=$(fetch_metadata_value "name")
vault_addr=$(fetch_metadata_value "vault_addr")
vault_auth_mount=$(fetch_metadata_value "vault_auth_mount_gcp")
vault_kv_path=$(fetch_metadata_value "vault_kv_path")
vault_kv_common_suffix=$(fetch_metadata_value "vault_kv_common_suffix")

# Authenticate with Vault using GCP identity.
# --get makes curl send the --data-urlencode pairs as a query string of a GET
# request; without it they would be sent as a POST body.
gcp_token=$(curl -sS \
  --get \
  --header "Metadata-Flavor: Google" \
  --data-urlencode "audience=http://vault/$instance_name" \
  --data-urlencode "format=full" \
  "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity")

# Build the login body with jq so both values are JSON-escaped
login_payload=$(jq -n \
  --arg role "$instance_name" \
  --arg jwt "$gcp_token" \
  '{role: $role, jwt: $jwt}')

# jq -e exits non-zero when .auth.client_token is null or missing, so a
# rejected login stops here instead of continuing with the literal token "null".
if ! vault_token=$(curl -sS \
    --data "$login_payload" \
    "${vault_addr}/v1/${vault_auth_mount}/login" | \
    jq -er .auth.client_token); then
  echo "Error: Vault login failed for role ${instance_name}" >&2
  exit 1
fi

# Fetch common and instance-specific data.
# A secret without .data.data yields null here, which the merge below tolerates.
common_data=$(curl -sS \
  --header "X-Vault-Token: ${vault_token}" \
  "${vault_addr}/v1/${vault_kv_path}/node/${vault_kv_common_suffix}" |
  jq -c .data.data)

secret_data=$(curl -sS \
  --header "X-Vault-Token: ${vault_token}" \
  "${vault_addr}/v1/${vault_kv_path}/node/${instance_name}" |
  jq -c .data.data)

# Merge data objects (keys from the instance-specific secret win on conflict)
data=$(echo "$common_data $secret_data" | jq -s 'add')
|
|
||
| # Helper to get values from merged data | ||
# Look up one top-level key in the merged Vault data.
#   $1 - key name
# Prints the value (raw for strings, compact JSON otherwise), or an empty
# string when the key is missing, null or false.
# The function is exported with `export -f`, so it also runs inside child
# scripts. Those inherit the exported VAULT_DATA but not the unexported shell
# variable `data`, hence VAULT_DATA is preferred and `data` is only a fallback.
get_data_value() {
  printf '%s\n' "${VAULT_DATA:-${data:-}}" | jq -rc --arg key "$1" '.[$key] // ""'
}
|
|
||
# Hand the merged data and both helper functions to the project-specific script
export VAULT_DATA="$data"
export -f get_data_value get_ips_from_uris

# The project-specific script is mandatory in the Vault flow
if [ ! -x /etc/bob/dynamic-config.sh ]; then
  echo "Error: No project-specific configuration found at /etc/bob/dynamic-config.sh"
  exit 1
fi
echo "Running project-specific configuration..."
/etc/bob/dynamic-config.sh vault "$CONFIG_PATH"

# Read the remote-write settings from the merged data
metrics_flashbots_url=$(get_data_value metrics_flashbots_url)
metrics_flashbots_username=$(get_data_value metrics_flashbots_username)
metrics_flashbots_password=$(get_data_value metrics_flashbots_password)

# Persist them for Prometheus and for the main config file
write_observability_config "$metrics_flashbots_url" "$metrics_flashbots_username" "$metrics_flashbots_password"

echo "Configuration successfully fetched and written to $CONFIG_PATH"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we also monitor lighthouse in bob-l1?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If dedicated monitoring for lighthouse is needed in the bob-l1 image, it should live in the bob-l1 directory setup and extend this configuration if possible. WDYT?