Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions bob-common/mkosi.build
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,11 @@ build_rust_package \
"input-only-proxy" \
"v0.0.2" \
"https://github.com/flashbots/input-only-proxy"

# Build gomplate (template engine for Prometheus config)
# prometheus.service runs /usr/bin/gomplate to render
# /etc/prometheus/prometheus.yml from prometheus.yml.tmpl.
# Go build flags: -trimpath drops local filesystem paths from the binary,
# -s -w strip the symbol table and debug info, and -buildid= clears the
# build ID.
# NOTE(review): make_git_package is defined outside this hunk; the arguments
# appear to be <name> <git tag> <repo URL> <build command> <src:dest artifact
# mapping> — confirm against its definition.
make_git_package \
"gomplate" \
"v4.3.3" \
"https://github.com/hairyhenderson/gomplate" \
'go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate' \
"build/gomplate:/usr/bin/gomplate"
3 changes: 3 additions & 0 deletions bob-common/mkosi.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ Packages=podman
openssh-sftp-server
udev
libsnappy1v5
prometheus
prometheus-node-exporter
prometheus-process-exporter

BuildPackages=build-essential
git
Expand Down
5 changes: 5 additions & 0 deletions bob-common/mkosi.extra/etc/prometheus/process-exporter.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Process groups exported by prometheus-process-exporter.
process_names:
  # Single group for the searcher container. It is matched on the conmon
  # process whose command line mentions "searcher-container"; the exporter is
  # started with --children so descendants are counted in the same group.
  - name: 'searcher-container'
    cmdline:
      - 'conmon.*searcher-container'
43 changes: 43 additions & 0 deletions bob-common/mkosi.extra/etc/prometheus/prometheus.yml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Prometheus configuration template.
# Rendered by gomplate with a datasource named "config"; the remote_write
# section is only emitted when that datasource provides a Flashbots URL.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Recording rules that derive the aggregated flashbox:* series
rule_files:
  - /etc/prometheus/recording_rules.yml

# Scrape configurations
scrape_configs:
  # Node exporter on localhost
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']
    metric_relabel_configs:
      # Keep only the raw node_* families listed in the regex below (cpu,
      # memory, disk, filesystem, network); every other scraped series is
      # dropped before storage.
      - source_labels: [__name__]
        regex: 'node_(cpu|memory|disk|filesystem|network)_.*'
        action: keep

  # Process exporter for container monitoring
  - job_name: 'process'
    static_configs:
      - targets: ['localhost:9256']

{{- $config := (datasource "config") }}
{{- if $config.remote_write_flashbots_url }}

# Remote write configuration (dynamically configured)
remote_write:
  # Flashbots endpoint.
  # Templated values are emitted as double-quoted scalars so that characters
  # such as '#', ': ' or a leading YAML indicator in a URL or credential
  # cannot truncate or corrupt the rendered file.
  - url: {{ $config.remote_write_flashbots_url | strings.Quote }}
    write_relabel_configs:
      # Only send the aggregated flashbox:* recording-rule series
      - source_labels: [__name__]
        regex: 'flashbox:.*'
        action: keep
    {{- if $config.remote_write_flashbots_auth }}
    basic_auth:
      username: {{ $config.remote_write_flashbots_username | strings.Quote }}
      password: {{ $config.remote_write_flashbots_password | strings.Quote }}
    {{- end }}
{{- end }}
{{- end }}
49 changes: 49 additions & 0 deletions bob-common/mkosi.extra/etc/prometheus/recording_rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Recording rules producing the flashbox:* series.
groups:
  - name: flashbox_aggregated_metrics
    interval: 30s  # evaluation period for the rules in this group
    rules:
      # ---- CPU ----
      # Overall busy percentage: 100 minus the average idle rate.
      - record: flashbox:cpu_usage_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      # Average rate per CPU mode, as a percentage.
      - record: flashbox:cpu_usage_percent_by_mode
        expr: avg(rate(node_cpu_seconds_total[5m])) by (mode) * 100

      # ---- Memory ----
      # Share of total memory that is not available.
      - record: flashbox:memory_usage_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      # Available memory, bytes divided by 1024^3.
      - record: flashbox:memory_available_gb
        expr: node_memory_MemAvailable_bytes / 1024 / 1024 / 1024

      # ---- Disk ----
      # Used-space percentage of the root filesystem (always available).
      - record: flashbox:disk_usage_percent_root
        expr: 100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)

      # Used-space percentage of /persistent; yields no data until it is mounted.
      - record: flashbox:disk_usage_percent_persistent
        expr: 100 - (node_filesystem_avail_bytes{mountpoint="/persistent"} / node_filesystem_size_bytes{mountpoint="/persistent"} * 100)

      # Read throughput, bytes/s divided by 1024^2.
      - record: flashbox:disk_io_read_mb_per_sec
        expr: rate(node_disk_read_bytes_total[5m]) / 1024 / 1024

      # Write throughput, bytes/s divided by 1024^2.
      - record: flashbox:disk_io_write_mb_per_sec
        expr: rate(node_disk_written_bytes_total[5m]) / 1024 / 1024

      # ---- Container health (from the process exporter) ----
      # Process count of the searcher-container group, multiplied by the
      # "process" job's up value for the same instance.
      - record: flashbox:container_alive
        expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"}

      # CPU seconds rate of the group, times 100.
      - record: flashbox:container_cpu_percent
        expr: rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m]) * 100

      # Memory of the group, bytes divided by 1024^2.
      - record: flashbox:container_memory_mb
        expr: namedprocess_namegroup_memory_bytes{groupname=~".*searcher-container.*"} / 1024 / 1024

      # ---- Network (summed counters only, no detailed info) ----
      - record: flashbox:network_receive_mb_total
        expr: sum(node_network_receive_bytes_total) / 1024 / 1024

      - record: flashbox:network_transmit_mb_total
        expr: sum(node_network_transmit_bytes_total) / 1024 / 1024
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[Unit]
After=wait-for-key.service searcher-firewall.service
Requires=wait-for-key.service searcher-firewall.service
Requires=wait-for-key.service
Wants=searcher-firewall.service
Comment on lines -3 to +4
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain the rationale behind this change? What is the scenario that causes this to be locked out?
To my knowledge, this is the execution flow:

  • Image boots (in maintenance mode) and fetches the configuration that includes secrets and firewall configs.
  • searcher-firewall service triggers and sets up the correct firewall rules with the correctly configured IPtables
  • Simultaneously, the wait-for-key service waits for the operator to serve the user's (searcher's) SSH public key via a curl command, so that the SSH command control panel is set up and ready for the searcher only.
  • dropbear service starts and the searcher can execute ssh commands like initialize, toggle, log, etc.


[Service]
ExecStartPre=/usr/bin/chown -R searcher:searcher /home/searcher
Expand Down
56 changes: 56 additions & 0 deletions bob-common/mkosi.extra/etc/systemd/system/node-exporter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Prometheus node exporter, bound to loopback only.
# The cpu, meminfo, diskstats, filesystem, netdev and loadavg collectors are
# enabled explicitly; the collectors listed with --no-collector.* are turned off.
[Unit]
Description=Prometheus Node Exporter
Documentation=https://github.com/prometheus/node_exporter
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=prometheus
Group=prometheus
# systemd performs variable substitution on "$" in command lines, so the
# end-of-string anchor in the mount-point exclude regex is written "$$",
# which systemd passes to the exporter as a single literal "$".
ExecStart=/usr/bin/prometheus-node-exporter \
    --web.listen-address=127.0.0.1:9100 \
    --collector.cpu \
    --collector.meminfo \
    --collector.diskstats \
    --collector.filesystem \
    --collector.netdev \
    --collector.loadavg \
    --no-collector.arp \
    --no-collector.bcache \
    --no-collector.bonding \
    --no-collector.conntrack \
    --no-collector.cpufreq \
    --no-collector.edac \
    --no-collector.entropy \
    --no-collector.filefd \
    --no-collector.hwmon \
    --no-collector.infiniband \
    --no-collector.ipvs \
    --no-collector.mdadm \
    --no-collector.netclass \
    --no-collector.netstat \
    --no-collector.nfs \
    --no-collector.nfsd \
    --no-collector.pressure \
    --no-collector.rapl \
    --no-collector.schedstat \
    --no-collector.sockstat \
    --no-collector.softnet \
    --no-collector.stat \
    --no-collector.textfile \
    --no-collector.thermal_zone \
    --no-collector.time \
    --no-collector.timex \
    --no-collector.udp_queues \
    --no-collector.uname \
    --no-collector.vmstat \
    --no-collector.xfs \
    --no-collector.zfs \
    --no-collector.systemd \
    --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($$|/)
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=multi-user.target
19 changes: 19 additions & 0 deletions bob-common/mkosi.extra/etc/systemd/system/process-exporter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Prometheus process exporter, bound to loopback only and configured from
# /etc/prometheus/process-exporter.yml.
[Unit]
Description=Prometheus Process Exporter
Documentation=https://github.com/ncabatoff/process-exporter
Wants=network-online.target
# Ordering only: start after the network is online and after the searcher
# container unit, without pulling the container unit in.
After=network-online.target
After=searcher-container.service

[Service]
Type=simple
User=prometheus
Group=prometheus
Restart=on-failure
RestartSec=5s
ExecStart=/usr/bin/prometheus-process-exporter --web.listen-address=127.0.0.1:9256 --config.path=/etc/prometheus/process-exporter.yml --children

[Install]
WantedBy=multi-user.target
25 changes: 25 additions & 0 deletions bob-common/mkosi.extra/etc/systemd/system/prometheus.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Prometheus server, bound to loopback only.
# The configuration file is rendered from a gomplate template on every start,
# using the JSON datasource that must be in place once fetch-config.service
# has run.
[Unit]
Description=Prometheus Monitoring System
Documentation=https://prometheus.io/docs/introduction/overview/
After=network-online.target fetch-config.service
Wants=network-online.target
Requires=fetch-config.service

[Service]
Type=simple
User=prometheus
Group=prometheus
# The "+" prefix runs these commands with full privileges instead of as
# User=prometheus, which is needed to write under /etc/prometheus.
# The rendered file may contain the remote-write basic-auth password, so it is
# created with mode 0640 and handed to group "prometheus": readable by the
# service, not by other local users.
ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json --chmod 0640
ExecStartPre=+/usr/bin/chown root:prometheus /etc/prometheus/prometheus.yml
ExecStart=/usr/bin/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/var/lib/prometheus/ \
    --storage.tsdb.retention.time=24h \
    --web.console.templates=/usr/share/prometheus/consoles \
    --web.console.libraries=/usr/share/prometheus/console_libraries \
    --web.listen-address=127.0.0.1:9090
# SIGHUP makes Prometheus re-read the already rendered configuration file; the
# template itself is only re-rendered on (re)start.
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5s

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[Unit]
Description=Searcher Network and Firewall Rules
After=network.target network-setup.service
Requires=network-setup.service
After=network.target network-setup.service fetch-config.service
Requires=network-setup.service fetch-config.service

[Service]
Type=oneshot
Expand Down
Loading