diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index d0dd6b8f9..e5d4c0858 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -35,6 +35,21 @@ jobs:
       - name: Run e2e tests
         run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
 
+      - name: alerts-check
+        # Read the alerts file written during the e2e run and print one GH actions annotation per
+        # alert (name and description): an error for firing alerts, a warning for pending ones.
+        #
+        # NOTE: Leaving this as annotating-only instead of failing the run until we have some more
+        # finely-tuned alerts.
+        run: |
+          if [[ -s /tmp/artifacts/alerts.out ]]; then
+            jq -r 'if .state=="firing" then
+              "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
+            elif .state=="pending" then
+              "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
+            else empty end' /tmp/artifacts/alerts.out
+          fi
+
       - uses: actions/upload-artifact@v4
         if: failure()
         with:
diff --git a/Makefile b/Makefile
index e429f88a3..6550c96fc 100644
--- a/Makefile
+++ b/Makefile
@@ -277,19 +277,23 @@ test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-me
 .PHONY: prometheus
 prometheus: PROMETHEUS_NAMESPACE := olmv1-system
 prometheus: PROMETHEUS_VERSION := v0.83.0
+# The scratch dir is created inside the recipe so mktemp only runs when this target is built; curl -f fails on HTTP errors.
 prometheus: #EXHELP Deploy Prometheus into specified namespace
-	./hack/test/setup-monitoring.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE)
-
-# The metrics.out file contains raw json data of the metrics collected during a test run.
-# In an upcoming PR, this query will be replaced with one that checks for alerts from
-# prometheus. Prometheus will gather metrics we currently query for over the test run,
-# and provide alerts from the metrics based on the rules that we set.
+	tmpdir="$$(mktemp -d)" && trap 'echo "Cleaning up $${tmpdir}"; rm -rf "$${tmpdir}"' EXIT && \
+	curl -fsSL "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/$(PROMETHEUS_VERSION)/kustomization.yaml" > "$${tmpdir}/kustomization.yaml" && \
+	curl -fsSL "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/$(PROMETHEUS_VERSION)/bundle.yaml" > "$${tmpdir}/bundle.yaml" && \
+	(cd "$${tmpdir}" && $(KUSTOMIZE) edit set namespace $(PROMETHEUS_NAMESPACE)) && kubectl create -k "$${tmpdir}"
+	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) -l app.kubernetes.io/name=prometheus-operator
+	$(KUSTOMIZE) build config/prometheus | CATALOGD_SERVICE_CERT=$(shell kubectl get certificate -n olmv1-system catalogd-service-cert -o jsonpath={.spec.secretName}) envsubst '$$CATALOGD_SERVICE_CERT' | kubectl apply -f -
+	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) -l app.kubernetes.io/name=prometheus-operator --timeout=60s
+	kubectl wait --for=create pods -n $(PROMETHEUS_NAMESPACE) prometheus-prometheus-0 --timeout=60s
+	kubectl wait --for=condition=Ready pods -n $(PROMETHEUS_NAMESPACE) prometheus-prometheus-0 --timeout=120s
+
+# The output alerts.out file contains any alerts, pending or firing, collected during a test run in json format.
 .PHONY: e2e-metrics
-e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set
-	curl -X POST \
-	-H "Content-Type: application/x-www-form-urlencoded" \
-	--data 'query={pod=~"operator-controller-controller-manager-.*|catalogd-controller-manager-.*"}' \
-	http://localhost:30900/api/v1/query > $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/metrics.out
+e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out
+e2e-metrics: #EXHELP Request active (pending or firing) alerts from prometheus; place in ARTIFACT_PATH if set
+	curl -fsS http://localhost:30900/api/v1/alerts | jq '.data.alerts[]?' > "$(ALERTS_FILE_PATH)"
 
 .PHONY: extension-developer-e2e
 extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e
diff --git a/config/prometheus/auth_token.yaml b/config/prometheus/auth_token.yaml
new file mode 100644
index 000000000..e0939c4e0
--- /dev/null
+++ b/config/prometheus/auth_token.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Secret
+type: kubernetes.io/service-account-token
+metadata:
+  name: prometheus-metrics-token
+  namespace: system
+  annotations:
+    kubernetes.io/service-account.name: prometheus
diff --git a/config/prometheus/catalogd_service_monitor.yaml b/config/prometheus/catalogd_service_monitor.yaml
new file mode 100644
index 000000000..524734b00
--- /dev/null
+++ b/config/prometheus/catalogd_service_monitor.yaml
@@ -0,0 +1,34 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: catalogd-controller-manager-metrics-monitor
+  namespace: system
+spec:
+  endpoints:
+    - path: /metrics
+      port: metrics
+      interval: 10s
+      scheme: https
+      authorization:
+        credentials:
+          name: prometheus-metrics-token
+          key: token
+      tlsConfig:
+        # NAMESPACE_PLACEHOLDER replaced by replacements in kustomization.yaml
+        serverName: catalogd-service.NAMESPACE_PLACEHOLDER.svc
+        insecureSkipVerify: false
+        ca:
+          secret:
+            # CATALOGD_SERVICE_CERT must be replaced by envsubst
+            name: ${CATALOGD_SERVICE_CERT}
+            key: ca.crt
+        cert:
+          secret:
+            name: ${CATALOGD_SERVICE_CERT}
+            key: tls.crt
+        keySecret:
+          name: ${CATALOGD_SERVICE_CERT}
+          key: tls.key
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: catalogd
diff --git a/config/prometheus/kubelet_service_monitor.yaml b/config/prometheus/kubelet_service_monitor.yaml
new file mode 100644
index 000000000..6c540c581
--- /dev/null
+++ b/config/prometheus/kubelet_service_monitor.yaml
@@ -0,0 +1,40 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: kubelet
+  namespace: system
+  labels:
+    k8s-app: kubelet
+spec:
+  jobLabel: k8s-app
+  endpoints:
+  - port: https-metrics
+    scheme: https
+    path: /metrics
+    interval: 10s
+    honorLabels: true
+    tlsConfig:
+      insecureSkipVerify: true
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    metricRelabelings:
+    - action: keep
+      sourceLabels: [pod,container]
+      regex: (operator-controller|catalogd).*;manager
+  - port: https-metrics
+    scheme: https
+    path: /metrics/cadvisor
+    interval: 10s
+    honorLabels: true
+    tlsConfig:
+      insecureSkipVerify: true
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    metricRelabelings:
+    - action: keep
+      sourceLabels: [pod,container]
+      regex: (operator-controller|catalogd).*;manager
+  selector:
+    matchLabels:
+      k8s-app: kubelet
+  namespaceSelector:
+    matchNames:
+    - kube-system
diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml
new file mode 100644
index 000000000..96a0503d3
--- /dev/null
+++ b/config/prometheus/kustomization.yaml
@@ -0,0 +1,35 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: olmv1-system
+resources:
+- prometheus.yaml
+- catalogd_service_monitor.yaml
+- kubelet_service_monitor.yaml
+- operator_controller_service_monitor.yaml
+- prometheus_rule.yaml
+- auth_token.yaml
+- network_policy.yaml
+- service.yaml
+- rbac
+replacements:
+- source:
+    kind: ServiceMonitor
+    name: catalogd-controller-manager-metrics-monitor
+    fieldPath: metadata.namespace # value copied into each target listed below
+  targets:
+  - select:
+      kind: ServiceMonitor
+      name: catalogd-controller-manager-metrics-monitor
+    fieldPaths:
+    - spec.endpoints.0.tlsConfig.serverName
+    options:
+      delimiter: '.'
+      index: 1 # second '.'-separated segment, i.e. NAMESPACE_PLACEHOLDER in serverName
+  - select:
+      kind: ServiceMonitor
+      name: operator-controller-controller-manager-metrics-monitor
+    fieldPaths:
+    - spec.endpoints.0.tlsConfig.serverName
+    options:
+      delimiter: '.'
+      index: 1 # second '.'-separated segment, i.e. NAMESPACE_PLACEHOLDER in serverName
diff --git a/config/prometheus/network_policy.yaml b/config/prometheus/network_policy.yaml
new file mode 100644
index 000000000..5fe716799
--- /dev/null
+++ b/config/prometheus/network_policy.yaml
@@ -0,0 +1,16 @@
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: prometheus
+  namespace: system
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus
+  policyTypes:
+    - Egress
+    - Ingress
+  egress:
+    - {} # Allows all egress traffic for metrics requests
+  ingress:
+    - {} # Allows us to query prometheus
diff --git a/config/prometheus/operator_controller_service_monitor.yaml b/config/prometheus/operator_controller_service_monitor.yaml
new file mode 100644
index 000000000..b35c5de75
--- /dev/null
+++ b/config/prometheus/operator_controller_service_monitor.yaml
@@ -0,0 +1,33 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: operator-controller-controller-manager-metrics-monitor
+  namespace: system
+spec:
+  endpoints:
+    - path: /metrics
+      interval: 10s
+      port: https
+      scheme: https
+      authorization:
+        credentials:
+          name: prometheus-metrics-token # Secret declared in auth_token.yaml
+          key: token
+      tlsConfig:
+        # NAMESPACE_PLACEHOLDER replaced by replacements in kustomization.yaml
+        serverName: operator-controller-service.NAMESPACE_PLACEHOLDER.svc
+        insecureSkipVerify: false
+        ca:
+          secret:
+            name: olmv1-cert
+            key: ca.crt
+        cert:
+          secret:
+            name: olmv1-cert
+            key: tls.crt
+        keySecret:
+          name: olmv1-cert
+          key: tls.key
+  selector:
+    matchLabels:
+      control-plane: operator-controller-controller-manager
diff --git a/config/prometheus/prometheus.yaml b/config/prometheus/prometheus.yaml
new file mode 100644
index 000000000..9686f63ad
--- /dev/null
+++ b/config/prometheus/prometheus.yaml
@@ -0,0 +1,18 @@
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: prometheus
+  namespace: system
+spec:
+  logLevel: debug
+  serviceAccountName: prometheus # ServiceAccount declared in rbac/prometheus_service_account.yaml
+  scrapeTimeout: 30s
+  scrapeInterval: 1m
+  securityContext:
+    runAsNonRoot: true
+    runAsUser: 65534
+    seccompProfile:
+      type: RuntimeDefault
+  ruleSelector: {}
+  serviceDiscoveryRole: EndpointSlice
+  serviceMonitorSelector: {}
diff --git a/config/prometheus/prometheus_rule.yaml b/config/prometheus/prometheus_rule.yaml
new file mode 100644
index 000000000..16e4bfd1a
--- /dev/null
+++ b/config/prometheus/prometheus_rule.yaml
@@ -0,0 +1,59 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: controller-alerts
+  namespace: system
+spec:
+  groups:
+  - name: controller-panic # alerts on any non-zero controller-runtime panic counter
+    rules:
+    - alert: reconciler-panic
+      expr: controller_runtime_reconcile_panics_total{} > 0
+      annotations:
+        description: "controller of pod {{ $labels.pod }} experienced panic(s); count={{ $value }}"
+    - alert: webhook-panic
+      expr: controller_runtime_webhook_panics_total{} > 0
+      annotations:
+        description: "controller webhook of pod {{ $labels.pod }} experienced panic(s); count={{ $value }}"
+  - name: resource-usage # OOM, memory and CPU alerts for the "manager" containers
+    rules:
+    - alert: oom-events
+      expr: container_oom_events_total > 0
+      annotations:
+        description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}"
+    - alert: operator-controller-memory-growth
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
+    - alert: catalogd-memory-growth
+      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec"
+    - alert: operator-controller-memory-usage
+      expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller pod using high memory resources for the last 5 minutes: {{ $value | humanize }}B"
+    - alert: catalogd-memory-usage
+      expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd pod using high memory resources for the last 5 minutes: {{ $value | humanize }}B"
+    - alert: operator-controller-cpu-usage
+      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller using high cpu resource for 5 minutes: {{ $value | printf \"%.2f\" }}%"
+    - alert: catalogd-cpu-usage
+      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
diff --git a/config/prometheus/rbac/kustomization.yaml b/config/prometheus/rbac/kustomization.yaml
new file mode 100644
index 000000000..566195983
--- /dev/null
+++ b/config/prometheus/rbac/kustomization.yaml
@@ -0,0 +1,4 @@
+resources:
+- prometheus_service_account.yaml
+- prometheus_cluster_role.yaml
+- prometheus_cluster_rolebinding.yaml
diff --git a/config/prometheus/rbac/prometheus_cluster_role.yaml b/config/prometheus/rbac/prometheus_cluster_role.yaml
new file mode 100644
index 000000000..176c3b389
--- /dev/null
+++ b/config/prometheus/rbac/prometheus_cluster_role.yaml
@@ -0,0 +1,29 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+- apiGroups: [""]
+  resources:
+  - nodes
+  - nodes/metrics
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]
+- apiGroups:
+  - discovery.k8s.io
+  resources:
+  - endpointslices
+  verbs: ["get", "list", "watch"]
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - ingresses
+  verbs: ["get", "list", "watch"]
+- nonResourceURLs: ["/metrics"]
+  verbs: ["get"]
diff --git a/config/prometheus/rbac/prometheus_cluster_rolebinding.yaml b/config/prometheus/rbac/prometheus_cluster_rolebinding.yaml
new file mode 100644
index 000000000..bd93b45c7
--- /dev/null
+++ b/config/prometheus/rbac/prometheus_cluster_rolebinding.yaml
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+- kind: ServiceAccount
+  name: prometheus
+  namespace: system
diff --git a/config/prometheus/rbac/prometheus_service_account.yaml b/config/prometheus/rbac/prometheus_service_account.yaml
new file mode 100644
index 000000000..df06091c9
--- /dev/null
+++ b/config/prometheus/rbac/prometheus_service_account.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: system
diff --git a/config/prometheus/service.yaml b/config/prometheus/service.yaml
new file mode 100644
index 000000000..0d041e008
--- /dev/null
+++ b/config/prometheus/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-service
+  namespace: system
+spec:
+  type: NodePort
+  ports:
+  - name: web
+    nodePort: 30900 # same port the Makefile's e2e-metrics target requests on localhost
+    port: 9090
+    protocol: TCP
+    targetPort: web
+  selector:
+    prometheus: prometheus
diff --git
a/hack/test/setup-monitoring.sh b/hack/test/setup-monitoring.sh deleted file mode 100755 index 3435988b2..000000000 --- a/hack/test/setup-monitoring.sh +++ /dev/null @@ -1,223 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -help="setup-monitoring.sh is used to set up prometheus monitoring for e2e testing. - -Usage: - setup-monitoring.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [KUSTOMIZE] -" - -if [[ "$#" -ne 3 ]]; then - echo "Illegal number of arguments passed" - echo "${help}" - exit 1 -fi - -NAMESPACE=$1 -PROMETHEUS_VERSION=$2 -KUSTOMIZE=$3 - -TMPDIR=$(mktemp -d) -trap 'echo "Cleaning up ${TMPDIR}"; rm -rf "${TMPDIR}"' EXIT -curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/${PROMETHEUS_VERSION}/kustomization.yaml" > "${TMPDIR}/kustomization.yaml" -curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/${PROMETHEUS_VERSION}/bundle.yaml" > "${TMPDIR}/bundle.yaml" -(cd ${TMPDIR} && ${KUSTOMIZE} edit set namespace ${NAMESPACE}) && kubectl create -k "${TMPDIR}" -kubectl wait --for=condition=Ready pods -n ${NAMESPACE} -l app.kubernetes.io/name=prometheus-operator - -kubectl apply -f - << EOF -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus - namespace: ${NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: ["get", "list", "watch"] -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole 
- name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus - namespace: ${NAMESPACE} -EOF - -kubectl apply -f - << EOF -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: prometheus - namespace: ${NAMESPACE} -spec: - logLevel: debug - serviceAccountName: prometheus - scrapeTimeout: 30s - scrapeInterval: 1m - securityContext: - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - serviceDiscoveryRole: EndpointSlice - serviceMonitorSelector: {} -EOF - -kubectl apply -f - << EOF -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: prometheus - namespace: ${NAMESPACE} -spec: - podSelector: - matchLabels: - app.kubernetes.io/name: prometheus - policyTypes: - - Egress - - Ingress - egress: - - {} # Allows all egress traffic for metrics requests - ingress: - - {} # Allows us to query prometheus -EOF - -# Give the operator time to create the pod -kubectl wait --for=create pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=60s -kubectl wait --for=condition=Ready pods -n ${NAMESPACE} prometheus-prometheus-0 --timeout=120s - -# Authentication token for the scrape requests -kubectl apply -f - <