Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions images/flashbox-l2.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Include=shared/mkosi.conf
Include=modules/flashbox/common/mkosi.conf
Include=modules/flashbox/flashbox-l2/mkosi.conf
Include=modules/flashbox/observability/mkosi.conf

[Config]
Profiles=gcp
Expand Down
4 changes: 4 additions & 0 deletions modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ drop_dst_ip() {
#
# `source` is not supported in dash
###########################################################################

# Load observability config if the module is included (metrics endpoint IP)
# The file is optional: the `[ -f ... ]` guard skips the load when it is absent,
# in which case METRICS_ENDPOINT is not defined by this line.
# `.` is used instead of `source` because `source` is not supported in dash.
[ -f /etc/flashbox/observability.env ] && . /etc/flashbox/observability.env

. /etc/bob/firewall-config

###########################################################################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ accept_dst_port $CHAIN_ALWAYS_IN tcp $CVM_REVERSE_PROXY_PORT "CVM reverse-proxy"
accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP"
accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS"

# Observability metrics endpoint (loaded from /etc/flashbox/observability.env)
# The rule is added only when METRICS_ENDPOINT is set and non-empty; the
# `${METRICS_ENDPOINT:-}` form keeps the test safe when the variable is unset.
# When added, it allows outbound TCP to that address on $HTTPS_PORT.
if [ -n "${METRICS_ENDPOINT:-}" ]; then
accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)"
fi

###########################################################################
# (3) MAINTENANCE_IN: Inbound rules for Maintenance Mode
###########################################################################
Expand Down
12 changes: 12 additions & 0 deletions modules/flashbox/observability/mkosi.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Build script for the observability module: builds gomplate from source.
# Strict mode plus tracing: exit on error, error on unset variables, fail a
# pipeline if any stage fails, and echo each command as it runs.
set -euxo pipefail

# Provides make_git_package (defined outside this file).
source scripts/make_git_package.sh

# Build gomplate (template engine for Prometheus config)
# Arguments, in order (presumably: package name, git ref, repository URL,
# build command, "<built artifact>:<install path>" mapping -- confirm against
# scripts/make_git_package.sh).
# The go flags drop local paths (-trimpath), symbol/debug tables (-s -w) and
# the build ID (-buildid=) from the binary.
make_git_package \
"gomplate" \
"v4.3.3" \
"https://github.com/hairyhenderson/gomplate" \
'go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate' \
"build/gomplate:/usr/bin/gomplate"
15 changes: 15 additions & 0 deletions modules/flashbox/observability/mkosi.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[Build]
WithNetwork=true

[Content]
ExtraTrees=modules/flashbox/observability/mkosi.extra
PostInstallationScripts=modules/flashbox/observability/mkosi.postinst
BuildScripts=modules/flashbox/observability/mkosi.build

Packages=prometheus
prometheus-node-exporter
prometheus-process-exporter

BuildPackages=build-essential
git
golang
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
process_names:
# Monitor the searcher container (conmon + all children via --children flag)
- name: "searcher-container"
cmdline:
- 'conmon.*searcher-container'
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
global:
scrape_interval: 15s
evaluation_interval: 15s

# Recording rules for aggregated metrics
rule_files:
- /etc/prometheus/recording_rules.yml

# Scrape configurations
scrape_configs:
# Node exporter on localhost
- job_name: 'node'
static_configs:
- targets: ['localhost:9100']
metric_relabel_configs:
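# Keep only scraped node_exporter series whose name matches the families below;
# everything else is dropped at scrape time (this is not the remote_write filter)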
- source_labels: [__name__]
regex: 'node_(cpu|memory|disk|filesystem|network|vmstat)_.*'
action: keep

# Process exporter for container monitoring
- job_name: 'process'
static_configs:
- targets: ['localhost:9256']

{{- $config := (datasource "config") }}
{{- if $config.remote_write_flashbots_url }}

# Remote write configuration (rendered only when a URL is present in the
# "config" datasource).
# The URL and credentials are emitted through data.ToJSON so they land in the
# YAML as double-quoted scalars: a value containing '#', ': ', quotes or a
# leading YAML indicator character cannot break or alter the rendered file.
remote_write:
  # Flashbots endpoint
  - url: {{ $config.remote_write_flashbots_url | data.ToJSON }}
    write_relabel_configs:
      # Only send flashbox: prefixed metrics
      - source_labels: [__name__]
        regex: 'flashbox:.*'
        action: keep
{{- if $config.remote_write_flashbots_auth }}
    basic_auth:
      username: {{ $config.remote_write_flashbots_username | data.ToJSON }}
      password: {{ $config.remote_write_flashbots_password | data.ToJSON }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
groups:
# Base metrics — local: prefix means they stay inside the TEE
# (remote_write only forwards flashbox:*)
- name: local_container_metrics
interval: 30s
rules:
- record: local:container_cpu_percent
expr: sum(rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m])) * 100

# Forwarded metrics — flashbox: prefix, picked up by remote_write
- name: flashbox_health
interval: 30s
rules:
- record: flashbox:container_alive
expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"}

# Spike-guarded: current 15m avg must be under 80%,
# AND the 10m max ending 5m ago must have been under 70%
- record: flashbox:container_average_cpu_is_under_80_percent
expr: >
(avg_over_time(local:container_cpu_percent[15m]) < bool 80)
* (max_over_time(local:container_cpu_percent[10m] offset 5m) < bool 70)

- record: flashbox:container_oom_kills_count
expr: node_vmstat_oom_kill

- record: flashbox:disk_free_space_is_over_10_percent
expr: >
(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) > bool 0.1

- record: flashbox:disk_free_space_is_over_128_gb
expr: >
(node_filesystem_avail_bytes{mountpoint="/persistent"}) > bool (128 * 1024 * 1024 * 1024)

- record: flashbox:network_is_up
expr: >
(sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m]))
+ sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])))
> bool 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[Unit]
Description=Fetch observability configuration
After=network-online.target
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/bin/fetch-observability-config.sh
RemainAfterExit=yes
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=minimal.target
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think something like

--collector.disable-defaults
--collector.enable-filter="^(cpu|meminfo|diskstats|filesystem|netdev|loadavg|vmstat)$"

might be cleaner and more future proof

Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
[Unit]
Description=Prometheus Node Exporter
Documentation=https://github.com/prometheus/node_exporter
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/usr/bin/prometheus-node-exporter \
--web.listen-address=127.0.0.1:9100 \
--collector.cpu \
--collector.meminfo \
--collector.diskstats \
--collector.filesystem \
--collector.netdev \
--collector.loadavg \
--no-collector.arp \
--no-collector.bcache \
--no-collector.bonding \
--no-collector.conntrack \
--no-collector.cpufreq \
--no-collector.edac \
--no-collector.entropy \
--no-collector.filefd \
--no-collector.hwmon \
--no-collector.infiniband \
--no-collector.ipvs \
--no-collector.mdadm \
--no-collector.netclass \
--no-collector.netstat \
--no-collector.nfs \
--no-collector.nfsd \
--no-collector.pressure \
--no-collector.rapl \
--no-collector.schedstat \
--no-collector.sockstat \
--no-collector.softnet \
--no-collector.stat \
--no-collector.textfile \
--no-collector.thermal_zone \
--no-collector.time \
--no-collector.timex \
--no-collector.udp_queues \
--no-collector.uname \
--collector.vmstat \
--no-collector.xfs \
--no-collector.zfs \
--no-collector.systemd \
--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/)
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=minimal.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[Unit]
Description=Prometheus Process Exporter
Documentation=https://github.com/ncabatoff/process-exporter
After=network-online.target searcher-container.service
Wants=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/usr/bin/prometheus-process-exporter \
--web.listen-address=127.0.0.1:9256 \
--config.path=/etc/prometheus/process-exporter.yml \
--children
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=minimal.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[Unit]
Description=Prometheus Monitoring System
Documentation=https://prometheus.io/docs/introduction/overview/
After=network-online.target fetch-observability-config.service
Wants=network-online.target
Requires=fetch-observability-config.service

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json
ExecStart=/usr/bin/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/var/lib/prometheus/ \
--storage.tsdb.retention.time=24h \
--web.console.templates=/usr/share/prometheus/consoles \
--web.console.libraries=/usr/share/prometheus/console_libraries \
--web.listen-address=127.0.0.1:9090
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=minimal.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[Unit]
After=fetch-observability-config.service
Wants=fetch-observability-config.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/bin/sh
# Strict mode. `-e` and `-u` are available in every POSIX sh. `pipefail` is
# not: on a /bin/sh without it, `set -o pipefail` is a fatal error that would
# abort this script before it writes anything. Probe for it in a subshell
# (so an unsupported option only kills the probe) and enable it when present.
set -eu
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# Fetches observability configuration (metrics endpoint credentials) and writes:
# /etc/flashbox/observability-config.json — consumed by gomplate for Prometheus config
# /etc/flashbox/observability.env — sourced by firewall for metrics endpoint IP
#
# On failure: logs a warning and writes empty defaults. Prometheus runs locally
# without remote_write. This is intentional — observability should never block boot.

OBSERVABILITY_CONFIG_PATH=/etc/flashbox/observability-config.json
OBSERVABILITY_ENV_PATH=/etc/flashbox/observability.env

#######################################
# Persist the observability settings for Prometheus and the firewall.
# Globals:
#   OBSERVABILITY_CONFIG_PATH (read) - JSON file written for the gomplate template
#   OBSERVABILITY_ENV_PATH    (read) - env file written for the firewall script
# Arguments:
#   $1 - remote_write URL (optional, defaults to empty)
#   $2 - basic-auth username (optional, defaults to empty)
#   $3 - basic-auth password (optional, defaults to empty)
# Outputs:
#   One status line on stdout; a warning on stderr if JSON encoding fails.
#######################################
write_config() {
  local url="${1:-}"
  local username="${2:-}"
  local password="${3:-}"
  local metrics_endpoint=""
  local auth_enabled=""

  # Extract IP for firewall rules: the first dotted-quad found in the URL.
  # A URL without an IPv4 literal leaves the endpoint empty. printf is used
  # instead of echo so backslashes in the URL are never interpreted.
  if [ -n "$url" ]; then
    metrics_endpoint=$(printf '%s\n' "$url" \
      | grep -oE '[0-9]{1,3}(\.[0-9]{1,3}){3}' \
      | head -n 1 || true)
  fi

  # The auth flag stays a string ("true" or "") as in the JSON consumers expect.
  if [ -n "$username" ]; then
    auth_enabled="true"
  fi

  # Create the directories of the files actually written below.
  mkdir -p \
    "$(dirname "$OBSERVABILITY_CONFIG_PATH")" \
    "$(dirname "$OBSERVABILITY_ENV_PATH")"

  # JSON config for Prometheus gomplate template.
  # jq does the encoding so quotes, backslashes or control characters in the
  # credentials cannot produce invalid JSON. If encoding fails, fall back to
  # empty defaults rather than aborting: observability must not block boot.
  if ! jq -n \
    --arg url "$url" \
    --arg username "$username" \
    --arg password "$password" \
    --arg auth "$auth_enabled" \
    '{
      remote_write_flashbots_url: $url,
      remote_write_flashbots_username: $username,
      remote_write_flashbots_password: $password,
      remote_write_flashbots_auth: $auth
    }' > "$OBSERVABILITY_CONFIG_PATH"; then
    echo "WARNING: Could not encode observability config, writing empty defaults" >&2
    printf '%s\n' \
      '{"remote_write_flashbots_url":"","remote_write_flashbots_username":"","remote_write_flashbots_password":"","remote_write_flashbots_auth":""}' \
      > "$OBSERVABILITY_CONFIG_PATH"
    metrics_endpoint=""
  fi

  # Env file for firewall (sourced by init-firewall.sh). The value can only
  # contain digits and dots, so single-quoting it is safe.
  printf "METRICS_ENDPOINT='%s'\n" "$metrics_endpoint" > "$OBSERVABILITY_ENV_PATH"

  echo "Observability config written (endpoint: ${metrics_endpoint:-none})"
}

# An existing config file wins: leave it untouched and finish successfully.
[ ! -f "$OBSERVABILITY_CONFIG_PATH" ] || {
  echo "Observability config already exists, skipping"
  exit 0
}

# Local QEMU dev (QEMU reported as system manufacturer AND the serial-console
# unit is present): write an empty config so no remote_write is configured.
if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU"; then
  if [ -f /etc/systemd/system/serial-console.service ]; then
    echo "QEMU dev environment, writing empty observability config"
    write_config "" "" ""
    exit 0
  fi
fi

# Production: fetch from Vault (non-fatal on failure)
echo "Fetching observability config from Vault..."

#######################################
# Read one instance attribute from the metadata server.
# Arguments:
#   $1 - attribute name
# Outputs:
#   The attribute value on stdout.
# Returns:
#   curl's exit status (-f makes HTTP errors non-zero, -s keeps it quiet).
#######################################
fetch_metadata_value() {
  local attribute="$1"
  local attributes_url="http://metadata/computeMetadata/v1/instance/attributes"

  curl -sf --header "Metadata-Flavor: Google" "${attributes_url}/${attribute}"
}

# Read the instance attributes needed to reach Vault. The lookups run in order
# and the `||` chain stops at the first one that fails, so later variables may
# be left unassigned in that case. Any failure is non-fatal: an empty
# observability config is written and the script exits successfully.
if ! instance_name=$(fetch_metadata_value "name") || \
! vault_addr=$(fetch_metadata_value "vault_addr") || \
! vault_auth_mount=$(fetch_metadata_value "vault_auth_mount_gcp") || \
! vault_kv_path=$(fetch_metadata_value "vault_kv_path") || \
! vault_kv_common_suffix=$(fetch_metadata_value "vault_kv_common_suffix"); then
echo "WARNING: Could not fetch GCP metadata, writing empty observability config"
write_config "" "" ""
exit 0
fi

# Authenticate with Vault using GCP identity
# `--get` is required here: without it, `--data-urlencode` makes curl send a
# POST with the parameters in the body, whereas the identity endpoint expects
# `audience` and `format` as query-string parameters of a GET request.
# `|| true` keeps a failed request non-fatal; the empty token is handled below.
gcp_token=$(curl -sf \
  --header "Metadata-Flavor: Google" \
  --get \
  --data-urlencode "audience=http://vault/$instance_name" \
  --data-urlencode "format=full" \
  "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || true

if [ -z "${gcp_token:-}" ]; then
  echo "WARNING: Could not get GCP identity token, writing empty observability config"
  write_config "" "" ""
  exit 0
fi

# Log in to Vault with the instance name as role and the GCP identity JWT.
# - The payload is encoded by jq, so neither value can break the JSON.
# - It is streamed to curl on stdin (`--data @-`) instead of being placed on
#   curl's command line.
# - `// empty` makes jq print nothing when the response carries no client
#   token; plain `jq -r` would print the literal string "null", which would
#   pass the emptiness check below and be used as a token.
# `|| true` keeps any failure non-fatal; the empty token is handled below.
vault_token=$(jq -cn \
  --arg role "$instance_name" \
  --arg jwt "$gcp_token" \
  '{role: $role, jwt: $jwt}' \
  | curl -sf \
    --data @- \
    "${vault_addr}/v1/${vault_auth_mount}/login" \
  | jq -r '.auth.client_token // empty') || true

if [ -z "${vault_token:-}" ]; then
  echo "WARNING: Could not authenticate with Vault, writing empty observability config"
  write_config "" "" ""
  exit 0
fi

# Fetch common data (observability keys live here)
# `// empty` makes jq print nothing when the response has no `.data.data`;
# plain `jq -c` would print the string "null", which would skip the warning
# below. `|| true` keeps a failed request non-fatal.
common_data=$(curl -sf \
  --header "X-Vault-Token: ${vault_token}" \
  "${vault_addr}/v1/${vault_kv_path}/node/${vault_kv_common_suffix}" \
  | jq -c '.data.data // empty') || true

if [ -z "${common_data:-}" ]; then
  echo "WARNING: Could not fetch Vault data, writing empty observability config"
  write_config "" "" ""
  exit 0
fi

#######################################
# Look up one key in the fetched Vault data.
# Globals:
#   common_data (read) - JSON object fetched from Vault
# Arguments:
#   $1 - key name
# Outputs:
#   The raw value on stdout, or an empty line when the key is missing or null.
#######################################
get_value() {
  # printf, not echo: some sh implementations' echo interprets backslash
  # sequences, which would corrupt JSON escapes before jq sees them.
  printf '%s\n' "$common_data" | jq -rc --arg key "$1" '.[$key] // ""'
}

# Write the fetched settings. Each get_value runs inside an argument
# substitution, so a key that is missing (or a failed lookup) simply yields an
# empty argument instead of aborting the script under `set -e`.
write_config \
"$(get_value metrics_flashbots_url)" \
"$(get_value metrics_flashbots_username)" \
"$(get_value metrics_flashbots_password)"
Loading