Skip to content

Commit 7cd03f1

Browse files
committed
Performance Alerting
Introduces an early-warning series of Prometheus alerts to attempt to catch performance issues at an early stage of development.

Signed-off-by: Daniel Franz <[email protected]>
1 parent 1333f7b commit 7cd03f1

16 files changed

+338
-234
lines changed

.github/workflows/e2e.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ jobs:
3535
- name: Run e2e tests
3636
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
3737

38+
- name: alerts-check
39+
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40+
# containing the alert name and description.
41+
#
42+
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43+
# finely-tuned alerts.
44+
run: |
45+
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46+
jq -r 'if .state=="firing" then
47+
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48+
elif .state=="pending" then
49+
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50+
end' /tmp/artifacts/alerts.out
51+
fi
52+
3853
- uses: actions/upload-artifact@v4
3954
if: failure()
4055
with:

Makefile

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
.PHONY: prometheus
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
prometheus: PROMETHEUS_VERSION := v0.83.0
prometheus: #EXHELP Deploy Prometheus into specified namespace
# Create the scratch dir inside the recipe shell: a target-specific
# `TMPDIR := $(shell mktemp -d)` is expanded when make reads the file, so it
# would leak a temp dir on every invocation, even for unrelated targets.
# The whole download/deploy sequence is one shell so the EXIT trap cleans up.
	tmpdir="$$(mktemp -d)"; \
	trap 'echo "Cleaning up $$tmpdir"; rm -rf "$$tmpdir"' EXIT; \
	curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/$(PROMETHEUS_VERSION)/kustomization.yaml" > "$$tmpdir/kustomization.yaml"; \
	curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/$(PROMETHEUS_VERSION)/bundle.yaml" > "$$tmpdir/bundle.yaml"; \
	(cd "$$tmpdir" && $(KUSTOMIZE) edit set namespace $(PROMETHEUS_NAMESPACE)) && kubectl create -k "$$tmpdir"
# Wait for the operator pods to appear before waiting on Ready (kubectl wait
# fails immediately when the selector matches nothing), then apply our
# monitoring config once the operator is up.
	kubectl wait --for=create pods -n $(PROMETHEUS_NAMESPACE) -l app.kubernetes.io/name=prometheus-operator --timeout=60s
	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) -l app.kubernetes.io/name=prometheus-operator --timeout=60s
	$(KUSTOMIZE) build config/prometheus | CATALOGD_SERVICE_CERT=$(shell kubectl get certificate -n olmv1-system catalogd-service-cert -o jsonpath={.spec.secretName}) envsubst '$$CATALOGD_SERVICE_CERT' | kubectl apply -f -
	kubectl wait --for=create pods -n $(PROMETHEUS_NAMESPACE) prometheus-prometheus-0 --timeout=60s
	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) prometheus-prometheus-0 --timeout=120s

# The output alerts.out file contains any alerts, pending or firing, collected
# during a test run in json format.
.PHONY: e2e-metrics
e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
e2e-metrics: #EXHELP Request alerts, pending or firing, from prometheus; place in ARTIFACT_PATH if set
# `.data.alerts[]?` iterates the alerts array and yields nothing when the
# field is empty or null — same result as the previous if/else guard, but the
# `.alerts.[]` spelling it replaces only parses on jq >= 1.7.
	curl -X GET http://localhost:30900/api/v1/alerts | jq '.data.alerts[]?' > $(ALERTS_FILE_PATH)

294298
.PHONY: extension-developer-e2e
295299
extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e

config/prometheus/auth_token.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Long-lived service-account token for the `prometheus` ServiceAccount.
# The ServiceMonitors reference this Secret (key `token`) as their scrape
# authorization credentials.
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
  name: prometheus-metrics-token
  # `system` is rewritten to the deploy namespace by kustomize.
  namespace: system
  annotations:
    kubernetes.io/service-account.name: prometheus
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: catalogd-controller-manager-metrics-monitor
5+
namespace: system
6+
spec:
7+
endpoints:
8+
- path: /metrics
9+
port: metrics
10+
interval: 10s
11+
scheme: https
12+
authorization:
13+
credentials:
14+
name: prometheus-metrics-token
15+
key: token
16+
tlsConfig:
17+
# NAMESPACE_PLACEHOLDER replaced by replacements in kustomization.yaml
18+
serverName: catalogd-service.NAMESPACE_PLACEHOLDER.svc
19+
insecureSkipVerify: false
20+
ca:
21+
secret:
22+
# CATALOGD_SERVICE_CERT must be replaced by envsubst
23+
name: ${CATALOGD_SERVICE_CERT}
24+
key: ca.crt
25+
cert:
26+
secret:
27+
name: ${CATALOGD_SERVICE_CERT}
28+
key: tls.crt
29+
keySecret:
30+
name: ${CATALOGD_SERVICE_CERT}
31+
key: tls.key
32+
selector:
33+
matchLabels:
34+
app.kubernetes.io/name: catalogd
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: kubelet
5+
namespace: system
6+
labels:
7+
k8s-app: kubelet
8+
spec:
9+
jobLabel: k8s-app
10+
endpoints:
11+
- port: https-metrics
12+
scheme: https
13+
path: /metrics
14+
interval: 10s
15+
honorLabels: true
16+
tlsConfig:
17+
insecureSkipVerify: true
18+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
19+
metricRelabelings:
20+
- action: keep
21+
sourceLabels: [pod,container]
22+
regex: (operator-controller|catalogd).*;manager
23+
- port: https-metrics
24+
scheme: https
25+
path: /metrics/cadvisor
26+
interval: 10s
27+
honorLabels: true
28+
tlsConfig:
29+
insecureSkipVerify: true
30+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
31+
metricRelabelings:
32+
- action: keep
33+
sourceLabels: [pod,container]
34+
regex: (operator-controller|catalogd).*;manager
35+
selector:
36+
matchLabels:
37+
k8s-app: kubelet
38+
namespaceSelector:
39+
matchNames:
40+
- kube-system

config/prometheus/kustomization.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: olmv1-system
resources:
  - prometheus.yaml
  - catalogd_service_monitor.yaml
  - kubelet_service_monitor.yaml
  - operator_controller_service_monitor.yaml
  - prometheus_rule.yaml
  - auth_token.yaml
  - network_policy.yaml
  - service.yaml
  - rbac
# Substitute the namespace above into the NAMESPACE_PLACEHOLDER segment of
# each ServiceMonitor's tlsConfig.serverName (the second dot-delimited field
# of e.g. `catalogd-service.NAMESPACE_PLACEHOLDER.svc`).
replacements:
  - source:
      kind: ServiceMonitor
      name: catalogd-controller-manager-metrics-monitor
      fieldPath: metadata.namespace
    targets:
      - select:
          kind: ServiceMonitor
          name: catalogd-controller-manager-metrics-monitor
        fieldPaths:
          - spec.endpoints.0.tlsConfig.serverName
        options:
          delimiter: '.'
          index: 1
      - select:
          kind: ServiceMonitor
          name: operator-controller-controller-manager-metrics-monitor
        fieldPaths:
          - spec.endpoints.0.tlsConfig.serverName
        options:
          delimiter: '.'
          index: 1

config/prometheus/network_policy.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# Opens traffic for the prometheus pod in both directions: egress so it can
# scrape metrics endpoints, ingress so the test harness can query its API.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: prometheus
  namespace: system
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  policyTypes:
    - Egress
    - Ingress
  egress:
    - {} # Allows all egress traffic for metrics requests
  ingress:
    - {} # Allows us to query prometheus
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: ServiceMonitor
3+
metadata:
4+
name: operator-controller-controller-manager-metrics-monitor
5+
namespace: system
6+
spec:
7+
endpoints:
8+
- path: /metrics
9+
interval: 10s
10+
port: https
11+
scheme: https
12+
authorization:
13+
credentials:
14+
name: prometheus-metrics-token
15+
key: token
16+
tlsConfig:
17+
# NAMESPACE_PLACEHOLDER replaced by replacements in kustomization.yaml
18+
serverName: operator-controller-service.NAMESPACE_PLACEHOLDER.svc
19+
insecureSkipVerify: false
20+
ca:
21+
secret:
22+
name: olmv1-cert
23+
key: ca.crt
24+
cert:
25+
secret:
26+
name: olmv1-cert
27+
key: tls.crt
28+
keySecret:
29+
name: olmv1-cert
30+
key: tls.key
31+
selector:
32+
matchLabels:
33+
control-plane: operator-controller-controller-manager

config/prometheus/prometheus.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Prometheus server instance managed by the prometheus-operator.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
  namespace: system
spec:
  logLevel: debug
  serviceAccountName: prometheus
  scrapeTimeout: 30s
  scrapeInterval: 1m
  securityContext:
    runAsNonRoot: true
    runAsUser: 65534  # conventionally the "nobody" user
    seccompProfile:
      type: RuntimeDefault
  # Empty selectors: pick up every PrometheusRule and ServiceMonitor the
  # operator can see, with no label filtering.
  ruleSelector: {}
  serviceDiscoveryRole: EndpointSlice
  serviceMonitorSelector: {}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# Early-warning performance alerts for the e2e run. The alerts.out artifact
# produced by `make e2e-metrics` is built from whatever fires here.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: controller-alerts
  namespace: system
spec:
  groups:
    # Any controller-runtime panic counter above zero is a hard signal.
    - name: controller-panic
      rules:
        - alert: reconciler-panic
          expr: controller_runtime_reconcile_panics_total{} > 0
          annotations:
            description: "controller of pod {{ $labels.pod }} experienced panic(s); count={{ $value }}"
        - alert: webhook-panic
          expr: controller_runtime_webhook_panics_total{} > 0
          annotations:
            description: "controller webhook of pod {{ $labels.pod }} experienced panic(s); count={{ $value }}"
    # Resource-usage thresholds. `for: 5m` requires the condition to hold
    # before firing; `keep_firing_for: 1d` keeps a transient spike visible
    # for the remainder of the test run once it has fired.
    - name: resource-usage
      rules:
        - alert: oom-events
          expr: container_oom_events_total > 0
          annotations:
            description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}"
        - alert: operator-controller-memory-growth
          expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
        - alert: catalogd-memory-growth
          expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
        - alert: operator-controller-memory-usage
          expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "operator-controller pod using high memory resources for the last 5 minutes: {{ $value | humanize }}B"
        - alert: catalogd-memory-usage
          expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "catalogd pod using high memory resources for the last 5 minutes: {{ $value | humanize }}B"
        - alert: operator-controller-cpu-usage
          expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "operator-controller using high cpu resource for 5 minutes: {{ $value | printf \"%.2f\" }}%"
        - alert: catalogd-cpu-usage
          expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
          for: 5m
          keep_firing_for: 1d
          annotations:
            description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"

0 commit comments

Comments
 (0)