From 07ec4d57fed0a0fd67fe8dcfdd3fe0b5e5254b96 Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Thu, 11 Jun 2026 09:00:27 +0900 Subject: [PATCH 1/2] fix(queryeviction): increment eviction metric before cancelling the victim The eviction loop in QueryEvictor.running() called victim.Cancel() before incrementing cortex_query_evictions_total. Cancel() is the externally observable commit point of an eviction: the cancellation callback is where tests (and any future hook) synchronize on the event. Because the counter was incremented only after that signal fired, an observer woken by the cancellation could read a stale counter value - there was no happens-before edge between the increment and the observation. This is exactly the flake in TestPrometheusMetrics_IncrementedCorrectly (expected: 3, actual: 2) and the same window exists for the assertions in TestCPUCheckedBeforeHeap. The counter is atomic, so -race never flags it; it is purely an ordering gap (reproduced naturally on master at roughly 10/3200 runs under CPU contention with -cpu 1,2). Move the Inc() before victim.Cancel() so the accounting is complete before the eviction becomes observable. There is no failure path between the two calls (Cancel is an infallible, idempotent context.CancelFunc), so the number of increments is unchanged - only which side of the cancellation window they land on. The warn log intentionally stays after Cancel(): logging is not bookkeeping, and a slow logger must not delay resource relief. Add TestEvictionsTotal_IncrementedBeforeCancelObserved, which reads the counter inside the cancel callback (on the evictor goroutine, at the instant the signal fires). It fails deterministically without this fix (expected: 1, actual: 0) and pins the invariant "account before signal". Co-Authored-By: Claude Fable 5 Signed-off-by: Sandy Chen --- CHANGELOG.md | 1 + pkg/util/queryeviction/evictor.go | 6 +++-- pkg/util/queryeviction/evictor_test.go | 32 ++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0756d48d25..01b98d63d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ * [BUGFIX] Query Frontend: Fix native histogram responses not being handled correctly in `minTime()` sort ordering for split_by_interval merge. #7555 * [BUGFIX] Distributor: Release the push worker pool goroutines on shutdown by stopping the async executor during the stopping phase when `-distributor.num-push-workers` is set. #7602 * [BUGFIX] Querier: Fix unbounded resource leak in the bucket-scan blocks finder (used when the bucket index is disabled). Per-tenant metadata fetchers, their Prometheus registries, and on-disk meta caches are now evicted once a tenant is no longer active, instead of being retained for the lifetime of the process. #7573 +* [BUGFIX] Querier: Increment `cortex_query_evictions_total` and deregister the victim before cancelling it in the query evictor, fixing flaky `TestPrometheusMetrics_IncrementedCorrectly` and double-counted evictions of still-unwinding queries. #7616 ## 1.21.0 2026-04-24 diff --git a/pkg/util/queryeviction/evictor.go b/pkg/util/queryeviction/evictor.go index 690d3fb875..7d084665e0 100644 --- a/pkg/util/queryeviction/evictor.go +++ b/pkg/util/queryeviction/evictor.go @@ -99,6 +99,10 @@ func (e *QueryEvictor) running(ctx context.Context) error { // Evict each victim. for _, victim := range victims { metricValue := e.registry.metric(victim.Stats) + // Account the eviction before cancelling the victim: Cancel is the + // externally observable commit point, so observers synchronized on + // the cancellation must already see it in evictionsTotal. + e.evictionsTotal.WithLabelValues(string(breachedResource)).Inc() victim.Cancel() level.Warn(e.logger).Log( @@ -112,8 +116,6 @@ func (e *QueryEvictor) running(ctx context.Context) error { "metric", e.cfg.EvictionMetric, "metric_value", metricValue, ) - - e.evictionsTotal.WithLabelValues(string(breachedResource)).Inc() } // Enter cooldown. diff --git a/pkg/util/queryeviction/evictor_test.go b/pkg/util/queryeviction/evictor_test.go index a6bbe92491..3d5fc54f8c 100644 --- a/pkg/util/queryeviction/evictor_test.go +++ b/pkg/util/queryeviction/evictor_test.go @@ -2,6 +2,7 @@ package queryeviction import ( "context" + "sync" "testing" "time" @@ -210,6 +211,37 @@ func TestPrometheusMetrics_IncrementedCorrectly(t *testing.T) { assert.Equal(t, float64(3), promtest.ToFloat64(evictor.evictionsTotal.WithLabelValues(string(resource.CPU)))) } +func TestEvictionsTotal_IncrementedBeforeCancelObserved(t *testing.T) { + reg := NewQueryRegistry(testMetricFunc) + evictor := startEvictor(t, newMockMonitor(0.95, 0.0), reg, testEvictorConfig(0.9, 0, 0)) + + ctx, cancel := context.WithCancel(context.Background()) + stats := &querier_stats.QueryStats{} + stats.AddFetchedSamples(1000) + + // Read the eviction counter inside the cancel callback, i.e. on the evictor + // goroutine at the instant the cancellation signal fires: the eviction must + // already be accounted in evictionsTotal before it becomes externally + // observable. The sync.Once keeps the callback idempotent so a re-picked + // victim cannot close the channel twice. + var ( + once sync.Once + counterAtCancel float64 + ) + evicted := make(chan struct{}) + reg.Register(func() { + once.Do(func() { + counterAtCancel = promtest.ToFloat64(evictor.evictionsTotal.WithLabelValues(string(resource.CPU))) + cancel() + close(evicted) + }) + }, stats, "up", "user1", "") + _ = ctx + + waitEvicted(t, evicted) + assert.Equal(t, float64(1), counterAtCancel) +} + func TestNewQueryEvictor_ReturnsNilWhenDisabled(t *testing.T) { cfg := configs.EvictionConfig{CheckInterval: time.Second, EvictionMetric: "fetched_samples", MaxEvictionsPerCycle: 1} evictor := NewQueryEvictor(newMockMonitor(0, 0), NewQueryRegistry(testMetricFunc), cfg, log.NewNopLogger(), prometheus.NewPedanticRegistry(), "test") From fce96a085ef7f0b43fb67656d839f789d098943f Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Thu, 11 Jun 2026 09:01:23 +0900 Subject: [PATCH 2/2] fix(queryeviction): deregister evicted queries to prevent double accounting Evicted victims were never removed from the QueryRegistry by the evictor; the only cleanup is the deferred Deregister in trackedQuery.Exec, which runs when the query goroutine eventually unwinds. Until then the already-cancelled victim remained in the evictable candidate set, so a later eviction cycle could re-pick it: cortex_query_evictions_total was double-counted for a single query, a MaxEvictionsPerCycle slot was burned on a no-op Cancel instead of relieving pressure, and in tests the second Cancel callback panicked on a double channel close. Deregister the victim before cancelling it, keeping all bookkeeping (metric, registry) ahead of the externally observable cancellation signal. Deregister is an idempotent locked delete, so the deferred Deregister in trackedQuery.Exec remains a safe no-op when the query finishes unwinding. Note the registry semantic shift: registry.Len() (and the registered_queries debug log field) now means "evictable candidates" - it no longer includes cancelled-but-still-unwinding victims. Add TestEvictedVictim_RemovedFromRegistry, which fails deterministically without this fix (expected: 0, actual: 1): the victim stayed registered after its eviction was observed. Co-Authored-By: Claude Fable 5 Signed-off-by: Sandy Chen --- pkg/util/queryeviction/evictor.go | 5 +++++ pkg/util/queryeviction/evictor_test.go | 29 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/pkg/util/queryeviction/evictor.go b/pkg/util/queryeviction/evictor.go index 7d084665e0..a6c778411e 100644 --- a/pkg/util/queryeviction/evictor.go +++ b/pkg/util/queryeviction/evictor.go @@ -103,6 +103,11 @@ func (e *QueryEvictor) running(ctx context.Context) error { // externally observable commit point, so observers synchronized on // the cancellation must already see it in evictionsTotal. e.evictionsTotal.WithLabelValues(string(breachedResource)).Inc() + // Retire the victim before cancelling it so later cycles can never + // re-pick (and double-count) an already-cancelled query that is + // still unwinding. trackedQuery.Exec's own deferred Deregister + // remains a safe no-op. + e.registry.Deregister(victim.QueryID) victim.Cancel() level.Warn(e.logger).Log( diff --git a/pkg/util/queryeviction/evictor_test.go b/pkg/util/queryeviction/evictor_test.go index 3d5fc54f8c..699902a8f3 100644 --- a/pkg/util/queryeviction/evictor_test.go +++ b/pkg/util/queryeviction/evictor_test.go @@ -242,6 +242,35 @@ func TestEvictionsTotal_IncrementedBeforeCancelObserved(t *testing.T) { assert.Equal(t, float64(1), counterAtCancel) } +func TestEvictedVictim_RemovedFromRegistry(t *testing.T) { + reg := NewQueryRegistry(testMetricFunc) + + ctx, cancel := context.WithCancel(context.Background()) + stats := &querier_stats.QueryStats{} + stats.AddFetchedSamples(1000) + + // The sync.Once keeps the callback idempotent: if the evictor wrongly + // re-picked the still-registered victim on a later cycle, the second + // Cancel would otherwise panic on the double close. + var once sync.Once + evicted := make(chan struct{}) + reg.Register(func() { + once.Do(func() { + cancel() + close(evicted) + }) + }, stats, "up", "user1", "") + _ = ctx + + startEvictor(t, newMockMonitor(0.95, 0.0), reg, testEvictorConfig(0.9, 0, 0)) + waitEvicted(t, evicted) + + // The evictor deregisters the victim before cancelling it, so once the + // eviction is observable the victim must already have left the registry + // and can never be re-picked (and double-counted) while still unwinding. + assert.Equal(t, 0, reg.Len()) +} + func TestNewQueryEvictor_ReturnsNilWhenDisabled(t *testing.T) { cfg := configs.EvictionConfig{CheckInterval: time.Second, EvictionMetric: "fetched_samples", MaxEvictionsPerCycle: 1} evictor := NewQueryEvictor(newMockMonitor(0, 0), NewQueryRegistry(testMetricFunc), cfg, log.NewNopLogger(), prometheus.NewPedanticRegistry(), "test")