From 2ecc15dcaae92976a3cc710bc3cd9149839de8ac Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:51:52 -0500 Subject: [PATCH] mixin: Ignore cache delete errors for cache error alerts (#10287) Delete operations issued as part of cache invalidation are expected to fail when the key doesn't exist. Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 1 + .../templates/metamonitoring/mixin-alerts.yaml | 4 ++-- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 4 ++-- operations/mimir-mixin-compiled/alerts.yaml | 4 ++-- operations/mimir-mixin/alerts/alerts.libsonnet | 9 +++++---- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a04f2bdf4d7..042c546291a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ ### Mixin * [BUGFIX] Dashboards: fix how we switch between classic and native histograms. #10018 +* [BUGFIX] Alerts: Ignore cache errors performing `delete` operations since these are expected to fail when keys don't exist. 
#10287 ### Jsonnet diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 8f5d18333dc..8413eb2972e 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -119,11 +119,11 @@ spec: expr: | ( sum by(cluster, namespace, name, operation) ( - rate(thanos_cache_operation_failures_total{operation!="add"}[1m]) + rate(thanos_cache_operation_failures_total{operation!~"add|delete"}[1m]) ) / sum by(cluster, namespace, name, operation) ( - rate(thanos_cache_operations_total{operation!="add"}[1m]) + rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) ) ) * 100 > 5 for: 5m diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index d4776cf7c22..ae306fa3dd6 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -107,11 +107,11 @@ groups: expr: | ( sum by(cluster, namespace, name, operation) ( - rate(thanos_cache_operation_failures_total{operation!="add"}[1m]) + rate(thanos_cache_operation_failures_total{operation!~"add|delete"}[1m]) ) / sum by(cluster, namespace, name, operation) ( - rate(thanos_cache_operations_total{operation!="add"}[1m]) + rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) ) ) * 100 > 5 for: 5m diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 275dd27111d..9c622d12c95 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -107,11 +107,11 @@ groups: expr: | ( sum by(cluster, namespace, name, 
operation) ( - rate(thanos_cache_operation_failures_total{operation!="add"}[1m]) + rate(thanos_cache_operation_failures_total{operation!~"add|delete"}[1m]) ) / sum by(cluster, namespace, name, operation) ( - rate(thanos_cache_operations_total{operation!="add"}[1m]) + rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) ) ) * 100 > 5 for: 5m diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 75c7e1ca78b..e967ace3984 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -202,16 +202,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { alert: $.alertName('CacheRequestErrors'), - // Specifically exclude "add" operations which are used for cache invalidation and "locking" since - // they are expected to sometimes fail in normal operation (such as when a "lock" already exists). + // Specifically exclude "add" and "delete" operations which are used for cache invalidation and "locking" + // since they are expected to sometimes fail in normal operation (such as when a "lock" already exists or + // the key being invalidated does not exist). expr: ||| ( sum by(%(group_by)s, name, operation) ( - rate(thanos_cache_operation_failures_total{operation!="add"}[%(range_interval)s]) + rate(thanos_cache_operation_failures_total{operation!~"add|delete"}[%(range_interval)s]) ) / sum by(%(group_by)s, name, operation) ( - rate(thanos_cache_operations_total{operation!="add"}[%(range_interval)s]) + rate(thanos_cache_operations_total{operation!~"add|delete"}[%(range_interval)s]) ) ) * 100 > 5 ||| % {