Skip to content

Commit 7cd03f1

Browse files
committed
Performance Alerting
Introduces an early-warning series of Prometheus alerts to attempt to catch performance issues at an early stage of development.

Signed-off-by: Daniel Franz <[email protected]>
1 parent 1333f7b commit 7cd03f1

16 files changed

+338
-234
lines changed

.github/workflows/e2e.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ jobs:
3535
- name: Run e2e tests
3636
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
3737

38+
- name: alerts-check
39+
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40+
# containing the alert name and description.
41+
#
42+
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43+
# finely-tuned alerts.
44+
run: |
45+
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46+
jq -r 'if .state=="firing" then
47+
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48+
elif .state=="pending" then
49+
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50+
end' /tmp/artifacts/alerts.out
51+
fi
52+
3853
- uses: actions/upload-artifact@v4
3954
if: failure()
4055
with:

Makefile

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
.PHONY: prometheus
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
prometheus: PROMETHEUS_VERSION := v0.83.0
prometheus: #EXHELP Deploy Prometheus into specified namespace
# Create the scratch dir inside the recipe shell: a target-specific
# `TMPDIR := $(shell mktemp -d)` is expanded when make reads the file, so it
# would leak a temp dir on every invocation, even for unrelated targets.
# The whole download/deploy sequence is one shell so the EXIT trap cleans up.
	tmpdir="$$(mktemp -d)"; \
	trap 'echo "Cleaning up $$tmpdir"; rm -rf "$$tmpdir"' EXIT; \
	curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/$(PROMETHEUS_VERSION)/kustomization.yaml" > "$$tmpdir/kustomization.yaml"; \
	curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/$(PROMETHEUS_VERSION)/bundle.yaml" > "$$tmpdir/bundle.yaml"; \
	(cd "$$tmpdir" && $(KUSTOMIZE) edit set namespace $(PROMETHEUS_NAMESPACE)) && kubectl create -k "$$tmpdir"
# Wait for the operator pods to appear before waiting on Ready (kubectl wait
# fails immediately when the selector matches nothing), then apply our
# monitoring config once the operator is up.
	kubectl wait --for=create pods -n $(PROMETHEUS_NAMESPACE) -l app.kubernetes.io/name=prometheus-operator --timeout=60s
	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) -l app.kubernetes.io/name=prometheus-operator --timeout=60s
	$(KUSTOMIZE) build config/prometheus | CATALOGD_SERVICE_CERT=$(shell kubectl get certificate -n olmv1-system catalogd-service-cert -o jsonpath={.spec.secretName}) envsubst '$$CATALOGD_SERVICE_CERT' | kubectl apply -f -
	kubectl wait --for=create pods -n $(PROMETHEUS_NAMESPACE) prometheus-prometheus-0 --timeout=60s
	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) prometheus-prometheus-0 --timeout=120s

# The output alerts.out file contains any alerts, pending or firing, collected
# during a test run in json format.
.PHONY: e2e-metrics
e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
e2e-metrics: #EXHELP Request alerts, pending or firing, from prometheus; place in ARTIFACT_PATH if set
# `.data.alerts[]?` iterates the alerts array and yields nothing when the
# field is empty or null — same result as the previous if/else guard, but the
# `.alerts.[]` spelling it replaces only parses on jq >= 1.7.
	curl -X GET http://localhost:30900/api/v1/alerts | jq '.data.alerts[]?' > $(ALERTS_FILE_PATH)

294298
.PHONY: extension-developer-e2e
295299
extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e

config/prometheus/auth_token.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Long-lived service-account token for the `prometheus` ServiceAccount.
# The ServiceMonitors reference this Secret (key `token`) as their scrape
# authorization credentials.
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
  name: prometheus-metrics-token
  # `system` is rewritten to the deploy namespace by kustomize.
  namespace: system
  annotations:
    kubernetes.io/service-account.name: prometheus
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: catalogd-controller-manager-metrics-monitor
5+
namespace: system
6+
spec:
7+
endpoints:
8+
- path: /metrics
9+
port: metrics
10+
interval: 10s
11+
scheme: https
12+
authorization:
13+
credentials:
14+
name: prometheus-metrics-token
15+
key: token
16+
tlsConfig:
17+
# NAMESPACE_PLACEHOLDER replaced by replacements in kustomization.yaml
18+
serverName: catalogd-service.NAMESPACE_PLACEHOLDER.svc
19+
insecureSkipVerify: false
20+
ca:
21+
secret:
22+
# CATALOGD_SERVICE_CERT must be replaced by envsubst
23+
name: ${CATALOGD_SERVICE_CERT}
24+
key: ca.crt
25+
cert:
26+
secret:
27+
name: ${CATALOGD_SERVICE_CERT}
28+
key: tls.crt
29+
keySecret:
30+
name: ${CATALOGD_SERVICE_CERT}
31+
key: tls.key
32+
selector:
33+
matchLabels:
34+
app.kubernetes.io/name: catalogd
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: kubelet
5+
namespace: system
6+
labels:
7+
k8s-app: kubelet
8+
spec:
9+
jobLabel: k8s-app
10+
endpoints:
11+
- port: https-metrics
12+
scheme: https
13+
path: /metrics
14+
interval: 10s
15+
honorLabels: true
16+
tlsConfig:
17+
insecureSkipVerify: true
18+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
19+
metricRelabelings:
20+
- action: keep
21+
sourceLabels: [pod,container]
22+
regex: (operator-controller|catalogd).*;manager
23+
- port: https-metrics
24+
scheme: https
25+
path: /metrics/cadvisor
26+
interval: 10s
27+
honorLabels: true
28+
tlsConfig:
29+
insecureSkipVerify: true
30+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
31+
metricRelabelings:
32+
- action: keep
33+
sourceLabels: [pod,container]
34+
regex: (operator-controller|catalogd).*;manager
35+
selector:
36+
matchLabels:
37+
k8s-app: kubelet
38+
namespaceSelector:
39+
matchNames:
40+
- kube-system

config/prometheus/kustomization.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: olmv1-system
resources:
  - prometheus.yaml
  - catalogd_service_monitor.yaml
  - kubelet_service_monitor.yaml
  - operator_controller_service_monitor.yaml
  - prometheus_rule.yaml
  - auth_token.yaml
  - network_policy.yaml
  - service.yaml
  - rbac
# Substitute the namespace above into the NAMESPACE_PLACEHOLDER segment of
# each ServiceMonitor's tlsConfig.serverName (the second dot-delimited field
# of e.g. `catalogd-service.NAMESPACE_PLACEHOLDER.svc`).
replacements:
  - source:
      kind: ServiceMonitor
      name: catalogd-controller-manager-metrics-monitor
      fieldPath: metadata.namespace
    targets:
      - select:
          kind: ServiceMonitor
          name: catalogd-controller-manager-metrics-monitor
        fieldPaths:
          - spec.endpoints.0.tlsConfig.serverName
        options:
          delimiter: '.'
          index: 1
      - select:
          kind: ServiceMonitor
          name: operator-controller-controller-manager-metrics-monitor
        fieldPaths:
          - spec.endpoints.0.tlsConfig.serverName
        options:
          delimiter: '.'
          index: 1

config/prometheus/network_policy.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# Opens traffic for the prometheus pod in both directions: egress so it can
# scrape metrics endpoints, ingress so the test harness can query its API.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: prometheus
  namespace: system
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  policyTypes:
    - Egress
    - Ingress
  egress:
    - {} # Allows all egress traffic for metrics requests
  ingress:
    - {} # Allows us to query prometheus
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: operator-controller-controller-manager-metrics-monitor
5+
namespace: system
6+
spec:
7+
endpoints:
8+
- path: /metrics
9+
interval: 10s
10+
port: https
11+
scheme: https
12+
authorization:
13+
credentials:
14+
name: prometheus-metrics-token
15+
key: token
16+
tlsConfig:
17+
# NAMESPACE_PLACEHOLDER replaced by replacements in kustomization.yaml
18+
serverName: operator-controller-service.NAMESPACE_PLACEHOLDER.svc
19+
insecureSkipVerify: false
20+
ca:
21+
secret:
22+
name: olmv1-cert
23+
key: ca.crt
24+
cert:
25+
secret:
26+
name: olmv1-cert
27+
key: tls.crt
28+
keySecret:
29+
name: olmv1-cert
30+
key: tls.key
31+
selector:
32+
matchLabels:
33+
control-plane: operator-controller-controller-manager

config/prometheus/prometheus.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Prometheus server instance managed by the prometheus-operator.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
  namespace: system
spec:
  logLevel: debug
  serviceAccountName: prometheus
  scrapeTimeout: 30s
  scrapeInterval: 1m
  securityContext:
    runAsNonRoot: true
    runAsUser: 65534  # conventionally the "nobody" user
    seccompProfile:
      type: RuntimeDefault
  # Empty selectors: pick up every PrometheusRule and ServiceMonitor the
  # operator can see, with no label filtering.
  ruleSelector: {}
  serviceDiscoveryRole: EndpointSlice
  serviceMonitorSelector: {}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# Early-warning performance alerts for the e2e run. The alerts.out artifact
# produced by `make e2e-metrics` is built from whatever fires here.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: controller-alerts
  namespace: system
spec:
  groups:
    # Any controller-runtime panic counter above zero is a hard signal.
    - name: controller-panic
      rules:
        - alert: reconciler-panic
          expr: controller_runtime_reconcile_panics_total{} > 0
          annotations:
            description: "controller of pod {{ $labels.pod }} experienced panic(s); count={{ $value }}"
        - alert: webhook-panic
          expr: controller_runtime_webhook_panics_total{} > 0
          annotations:
            description: "controller webhook of pod {{ $labels.pod }} experienced panic(s); count={{ $value }}"
    # Resource-usage thresholds. `for: 5m` requires the condition to hold
    # before firing; `keep_firing_for: 1d` keeps a transient spike visible
    # for the remainder of the test run once it has fired.
    - name: resource-usage
      rules:
        - alert: oom-events
          expr: container_oom_events_total > 0
          annotations:
            description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}"
        - alert: operator-controller-memory-growth
          expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
        - alert: catalogd-memory-growth
          expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
        - alert: operator-controller-memory-usage
          expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "operator-controller pod using high memory resources for the last 5 minutes: {{ $value | humanize }}B"
        - alert: catalogd-memory-usage
          expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "catalogd pod using high memory resources for the last 5 minutes: {{ $value | humanize }}B"
        - alert: operator-controller-cpu-usage
          expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "operator-controller using high cpu resource for 5 minutes: {{ $value | printf \"%.2f\" }}%"
        - alert: catalogd-cpu-usage
          expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"

0 commit comments

Comments
 (0)