From e315ebb4c1c3dca30bf7c62eeefdef0b7ac8e5ad Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 24 Oct 2024 15:39:38 +0200 Subject: [PATCH 001/105] Poc: cost attribution proposal 2 --- cmd/mimir/config-descriptor.json | 66 ++++ cmd/mimir/help-all.txt.tmpl | 12 + .../config/mimir.yaml | 9 +- pkg/api/api.go | 6 + pkg/blockbuilder/tsdb.go | 2 +- pkg/costattribution/manager.go | 173 +++++++++ pkg/costattribution/manager_test.go | 193 ++++++++++ pkg/costattribution/tracker.go | 345 ++++++++++++++++++ pkg/costattribution/tracker_test.go | 163 +++++++++ pkg/distributor/allcase.txt | 90 +++++ pkg/distributor/distributor.go | 60 +-- pkg/distributor/distributor_test.go | 175 +++++---- pkg/distributor/validate.go | 22 +- pkg/distributor/validate_test.go | 14 +- .../activeseries/active_labels_test.go | 6 +- .../active_native_histogram_postings_test.go | 32 +- .../activeseries/active_postings_test.go | 21 +- pkg/ingester/activeseries/active_series.go | 112 ++++-- .../activeseries/active_series_test.go | 210 ++++++----- pkg/ingester/ingester.go | 63 +++- .../ingester_early_compaction_test.go | 2 +- pkg/ingester/ingester_ingest_storage_test.go | 2 +- pkg/ingester/ingester_test.go | 152 +++++--- pkg/ingester/user_tsdb.go | 4 +- pkg/mimir/mimir.go | 8 + pkg/mimir/modules.go | 26 +- pkg/storage/soft_append_error_processor.go | 6 +- .../benchmarks/comparison_test.go | 2 +- pkg/streamingpromql/benchmarks/ingester.go | 2 +- pkg/util/validation/limits.go | 33 ++ pkg/util/validation/limits_test.go | 6 + 31 files changed, 1670 insertions(+), 347 deletions(-) create mode 100644 pkg/costattribution/manager.go create mode 100644 pkg/costattribution/manager_test.go create mode 100644 pkg/costattribution/tracker.go create mode 100644 pkg/costattribution/tracker_test.go create mode 100644 pkg/distributor/allcase.txt diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 3ac7f5c294b..1334a1b047e 100644 --- a/cmd/mimir/config-descriptor.json +++ 
b/cmd/mimir/config-descriptor.json @@ -4368,6 +4368,50 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_labels", + "required": false, + "desc": "Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-labels", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_labels_per_user", + "required": false, + "desc": "Maximum number of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 2, + "fieldFlag": "validation.max-cost-attribution-labels-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_cardinality_per_user", + "required": false, + "desc": "Maximum cardinality of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 10000, + "fieldFlag": "validation.max-cost-attribution-cardinality-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cooldown", + "required": false, + "desc": "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. 
A reset will occur once the cardinality falls below the limit.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.cost-attribution-cooldown", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", @@ -19639,6 +19683,28 @@ "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled", "fieldType": "boolean", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.", + "fieldValue": null, + "fieldDefaultValue": 1200000000000, + "fieldFlag": "cost-attribution.eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_registry_path", + "required": false, + "desc": "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "cost-attribution.registry-path", + "fieldType": "string", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 08bc71314d3..0324a354ceb 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1283,6 +1283,10 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution.eviction-interval duration + [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. 
(default 20m0s) + -cost-attribution.registry-path string + [experimental] Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -3317,10 +3321,18 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-cooldown duration + [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit. + -validation.cost-attribution-labels comma-separated-list-of-strings + [experimental] Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. 
(default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-cardinality-per-user int + [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) + -validation.max-cost-attribution-labels-per-user int + [experimental] Maximum number of cost attribution labels allowed per user. (default 2) -validation.max-label-names-per-info-series int Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80) -validation.max-label-names-per-series int diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 5d245999115..31702611891 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,4 +1,6 @@ multitenancy_enabled: false +cost_attribution_registry_path: "/usage-metrics" +cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -184,5 +186,10 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 + cost_attribution_labels: "container" + max_cost_attribution_labels_per_user: 2 + max_cost_attribution_cardinality_per_user: 100 + cost_attribution_cooldown: 20m + runtime_config: - file: ./config/runtime.yaml + file: ./config/runtime.yaml \ No newline at end of file diff --git a/pkg/api/api.go b/pkg/api/api.go index e2f6da5735c..131b50643c1 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -20,6 +20,7 @@ import ( "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/server" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/grafana/mimir/pkg/alertmanager" "github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb" @@ -280,6 +281,11 @@ func (a *API) RegisterDistributor(d 
*distributor.Distributor, pushConfig distrib a.RegisterRoute("/distributor/ha_tracker", d.HATracker, false, true, "GET") } +// RegisterCostAttribution registers a Prometheus HTTP handler for the cost attribution metrics. +func (a *API) RegisterCostAttribution(customRegistryPath string, reg *prometheus.Registry) { + a.RegisterRoute(customRegistryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), false, false, "GET") +} + // Ingester is defined as an interface to allow for alternative implementations // of ingesters to be passed into the API.RegisterIngester() method. type Ingester interface { diff --git a/pkg/blockbuilder/tsdb.go b/pkg/blockbuilder/tsdb.go index ee2d610fe78..97cf6ede36d 100644 --- a/pkg/blockbuilder/tsdb.go +++ b/pkg/blockbuilder/tsdb.go @@ -50,7 +50,7 @@ type TSDBBuilder struct { var softErrProcessor = mimir_storage.NewSoftAppendErrorProcessor( func() {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, - func() {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, + func([]mimirpb.LabelAdapter) {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, ) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go new file mode 100644 index 00000000000..0c60ed54505 --- /dev/null +++ b/pkg/costattribution/manager.go @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "context" + "sort" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + + 
"github.com/grafana/mimir/pkg/util/validation" +) + +const ( + TrackerLabel = "tracker" + TenantLabel = "tenant" + defaultTrackerName = "cost-attribution" + missingValue = "__missing__" + overflowValue = "__overflow__" +) + +type Manager struct { + services.Service + logger log.Logger + inactiveTimeout time.Duration + limits *validation.Overrides + + mtx sync.RWMutex + trackersByUserID map[string]*Tracker + reg *prometheus.Registry + cleanupInterval time.Duration + metricsExportInterval time.Duration +} + +func NewManager(cleanupInterval, exportInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) { + m := &Manager{ + trackersByUserID: make(map[string]*Tracker), + limits: limits, + mtx: sync.RWMutex{}, + inactiveTimeout: inactiveTimeout, + logger: logger, + reg: reg, + cleanupInterval: cleanupInterval, + metricsExportInterval: exportInterval, + } + + m.Service = services.NewBasicService(nil, m.running, nil).WithName("cost attribution manager") + if err := reg.Register(m); err != nil { + return nil, err + } + return m, nil +} + +func (m *Manager) running(ctx context.Context) error { + t := time.NewTicker(m.cleanupInterval) + defer t.Stop() + + for { + select { + case <-t.C: + if err := m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix()); err != nil { + return err + } + case <-ctx.Done(): + return nil + } + } +} + +func (m *Manager) EnabledForUser(userID string) bool { + if m == nil { + return false + } + return len(m.limits.CostAttributionLabels(userID)) > 0 +} + +func (m *Manager) TrackerForUser(userID string) *Tracker { + if !m.EnabledForUser(userID) { + return nil + } + + m.mtx.Lock() + defer m.mtx.Unlock() + + if tracker, exists := m.trackersByUserID[userID]; exists { + return tracker + } + + tracker := newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), 
m.logger) + m.trackersByUserID[userID] = tracker + return tracker +} + +func (m *Manager) Collect(out chan<- prometheus.Metric) { + m.mtx.RLock() + defer m.mtx.RUnlock() + for _, tracker := range m.trackersByUserID { + tracker.Collect(out) + } +} + +func (m *Manager) Describe(chan<- *prometheus.Desc) { +} + +func (m *Manager) deleteUserTracker(userID string) { + m.mtx.Lock() + defer m.mtx.Unlock() + delete(m.trackersByUserID, userID) +} + +func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { + m.mtx.RLock() + userIDs := make([]string, 0, len(m.trackersByUserID)) + for userID := range m.trackersByUserID { + userIDs = append(userIDs, userID) + } + m.mtx.RUnlock() + + for _, userID := range userIDs { + if !m.EnabledForUser(userID) { + m.deleteUserTracker(userID) + continue + } + + invalidKeys := m.inactiveObservationsForUser(userID, deadline) + cat := m.TrackerForUser(userID) + for _, key := range invalidKeys { + cat.cleanupTrackerAttribution(key) + } + + if cat != nil && cat.cooldownUntil != nil && cat.cooldownUntil.Load() < deadline { + if len(cat.observed) <= cat.MaxCardinality() { + cat.state = OverflowComplete + m.deleteUserTracker(userID) + } else { + cat.cooldownUntil.Store(deadline + cat.cooldownDuration) + } + } + } + return nil +} + +func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []string { + cat := m.TrackerForUser(userID) + newTrackedLabels := m.limits.CostAttributionLabels(userID) + sort.Slice(newTrackedLabels, func(i, j int) bool { + return newTrackedLabels[i] < newTrackedLabels[j] + }) + + if !cat.CompareCALabels(newTrackedLabels) { + m.mtx.Lock() + cat = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.trackersByUserID[userID] = cat + m.mtx.Unlock() + return nil + } else { + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + if cat.MaxCardinality() != maxCardinality { + 
cat.UpdateMaxCardinality(maxCardinality) + } + + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) + if cooldown != cat.CooldownDuration() { + cat.UpdateCooldownDuration(cooldown) + } + } + + return cat.InactiveObservations(deadline) +} diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go new file mode 100644 index 00000000000..1e67704b287 --- /dev/null +++ b/pkg/costattribution/manager_test.go @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + + "github.com/grafana/mimir/pkg/util/validation" +) + +func getMockLimits(idx int) (*validation.Overrides, error) { + baseLimits := map[string]*validation.Limits{ + "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, + "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, + "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, + "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, + } + + switch idx { + case 1: + baseLimits["user1"].CostAttributionLabels = []string{} + case 2: + baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} + case 3: + baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 + case 4: + baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 + case 5: + baseLimits["user1"].CostAttributionLabels = []string{"department"} + } + + return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) +} + +func newTestManager() *Manager { + logger := log.NewNopLogger() + limits, _ := getMockLimits(0) + reg := 
prometheus.NewRegistry() + manager, err := NewManager(5*time.Second, time.Second, 10*time.Second, logger, limits, reg) + if err != nil { + panic(err) + } + return manager +} + +func Test_NewManager(t *testing.T) { + manager := newTestManager() + assert.NotNil(t, manager) + assert.NotNil(t, manager.trackersByUserID) + assert.Equal(t, 10*time.Second, manager.inactiveTimeout) +} + +func Test_CreateDeleteTracker(t *testing.T) { + manager := newTestManager() + + t.Run("Tracker existence and attributes", func(t *testing.T) { + user1Tracker := manager.TrackerForUser("user1") + assert.NotNil(t, user1Tracker) + assert.True(t, user1Tracker.CompareCALabels([]string{"team"})) + assert.Equal(t, 5, user1Tracker.MaxCardinality()) + + assert.Nil(t, manager.TrackerForUser("user2")) + + user3Tracker := manager.TrackerForUser("user3") + assert.NotNil(t, user3Tracker) + assert.True(t, user3Tracker.CompareCALabels([]string{"department", "service"})) + assert.Equal(t, 2, user3Tracker.MaxCardinality()) + }) + + t.Run("Metrics tracking", func(t *testing.T) { + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "bar"), 1, "invalid-metrics-name", time.Unix(6, 0)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="bar",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Purge inactive attributions", func(t *testing.T) { + manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Disabling user cost attribution", func(t *testing.T) { + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 1, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Updating user cardinality and labels", func(t *testing.T) { + manager.limits, _ = getMockLimits(2) + manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix()) + assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.True(t, manager.TrackerForUser("user3").CompareCALabels([]string{"feature", "team"})) + + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{feature="__missing__",reason="invalid-metrics-name",team="foo",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="cost-attribution"} 2 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) +} + +func Test_PurgeInactiveAttributionsUntil(t *testing.T) { + manager := newTestManager() + + manager.TrackerForUser("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) + + t.Run("Purge before inactive timeout", func(t *testing.T) { + manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix()) + assert.Equal(t, 2, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Purge after inactive timeout", func(t *testing.T) { + // disable cost attribution for user1 to test purging + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix()) + + // User3's tracker should remain since it's active, user1's tracker should be removed + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") + assert.Nil(t, manager.TrackerForUser("user1"), "Expected user1 tracker to be purged") + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Purge all trackers", func(t *testing.T) { + // Trigger a purge that should remove all inactive trackers + manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix()) + + // Tracker would stay at 1 since user1's tracker is disabled + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") + + // No metrics should remain after all purged + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go new file mode 100644 index 00000000000..0a232195848 --- /dev/null +++ b/pkg/costattribution/tracker.go @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "bytes" + "sort" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "go.uber.org/atomic" +) + +type TrackerState int + +const ( + Normal TrackerState = iota + Overflow + OverflowComplete +) + +const sep = rune(0x80) + +type Observation struct { + lastUpdate *atomic.Int64 + activeSerie *atomic.Float64 + receivedSample *atomic.Float64 + discardSamplemtx sync.Mutex + discardedSample map[string]*atomic.Float64 + totalDiscarded *atomic.Float64 +} + +type Tracker struct { + userID string + caLabels []string + caLabelMap map[string]int + maxCardinality int + activeSeriesPerUserAttribution *prometheus.Desc + receivedSamplesAttribution *prometheus.Desc + discardedSampleAttribution 
*prometheus.Desc + overflowLabels []string + obseveredMtx sync.RWMutex + observed map[string]*Observation + hashBuffer []byte + state TrackerState + overflowCounter *Observation + cooldownUntil *atomic.Int64 + cooldownDuration int64 + logger log.Logger +} + +func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *Tracker { + sort.Slice(trackedLabels, func(i, j int) bool { + return trackedLabels[i] < trackedLabels[j] + }) + + // Create a map for fast lookup, and overflow labels to export when overflow happens + caLabelMap := make(map[string]int, len(trackedLabels)) + overflowLabels := make([]string, len(trackedLabels)+2) + for i, label := range trackedLabels { + caLabelMap[label] = i + overflowLabels[i] = overflowValue + } + + overflowLabels[len(trackedLabels)] = userID + overflowLabels[len(trackedLabels)+1] = overflowValue + + tracker := &Tracker{ + userID: userID, + caLabels: trackedLabels, + caLabelMap: caLabelMap, + maxCardinality: limit, + observed: make(map[string]*Observation), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64(cooldown.Seconds()), + logger: logger, + overflowLabels: overflowLabels, + } + + tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", + "The total number of samples that were discarded per attribution.", + append(trackedLabels, TenantLabel, "reason"), + prometheus.Labels{TrackerLabel: defaultTrackerName}) + + tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", + "The total number of samples that were received per attribution.", + append(trackedLabels, TenantLabel), + prometheus.Labels{TrackerLabel: defaultTrackerName}) + + tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", + "The total number of active series per user and attribution.", append(trackedLabels, TenantLabel), + prometheus.Labels{TrackerLabel: 
defaultTrackerName})
+
+	return tracker
+}
+
+// CompareCALabels reports whether currentLabels describes the same set of
+// cost attribution labels this tracker was built with: the lengths must match
+// and every entry must be a key of caLabelMap. Order is not compared.
+// A nil tracker only matches an empty label list.
+func (t *Tracker) CompareCALabels(currentLabels []string) bool {
+	if t == nil {
+		return len(currentLabels) == 0
+	}
+	if len(t.caLabels) != len(currentLabels) {
+		return false
+	}
+	for _, v := range currentLabels {
+		if _, exists := t.caLabelMap[v]; !exists {
+			return false
+		}
+	}
+	return true
+}
+
+// MaxCardinality returns the configured cardinality limit, or 0 for a nil tracker.
+func (t *Tracker) MaxCardinality() int {
+	if t == nil {
+		return 0
+	}
+	return t.maxCardinality
+}
+
+// CooldownDuration returns the configured cooldown, or 0 for a nil tracker.
+// handleOverflow adds this value to the timestamp of the update that triggered
+// overflow; the Increment*/Decrement* entry points pass now.Unix(), so the
+// value is effectively interpreted in seconds.
+func (t *Tracker) CooldownDuration() int64 {
+	if t == nil {
+		return 0
+	}
+	return t.cooldownDuration
+}
+
+// bufferPool recycles the scratch buffers updateCounters uses to build stream keys.
+var bufferPool = sync.Pool{
+	New: func() interface{} {
+		return new(bytes.Buffer)
+	},
+}
+
+// cleanupTrackerAttribution removes the observation stored under key, if any.
+// It is a no-op on a nil tracker.
+func (t *Tracker) cleanupTrackerAttribution(key string) {
+	if t == nil {
+		return
+	}
+	t.obseveredMtx.Lock()
+	defer t.obseveredMtx.Unlock()
+	delete(t.observed, key)
+}
+
+// IncrementActiveSeries adds one active series to the stream identified by the
+// cost attribution label values found in lbs. It is a no-op on a nil tracker.
+func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) {
+	if t == nil {
+		return
+	}
+	t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil)
+}
+
+// DecrementActiveSeries removes one active series from the stream identified by
+// the cost attribution label values found in lbs. It is a no-op on a nil tracker.
+func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) {
+	if t == nil {
+		return
+	}
+	t.updateCounters(lbs, now.Unix(), -1, 0, 0, nil)
+}
+
+// Collect sends the tracker's current metrics to out.
+//
+// In Overflow state it emits one aggregated series per metric from the overflow
+// counter. In Normal state it emits one series per observed stream, skipping
+// active-series and received-sample values that are not positive, plus one
+// discarded-samples series per recorded reason.
+//
+// The read lock is held for the whole call because state, overflowCounter and
+// observed are all written by updateCounters under the write lock.
+// It is a no-op on a nil tracker.
+func (t *Tracker) Collect(out chan<- prometheus.Metric) {
+	if t == nil {
+		return
+	}
+	t.obseveredMtx.RLock()
+	defer t.obseveredMtx.RUnlock()
+
+	switch t.state {
+	case Overflow:
+		// Active series and received samples use every overflow label except the
+		// last one; discarded samples use the full slice.
+		// NOTE(review): presumably the last overflow label fills the reason
+		// dimension - confirm against the constructor.
+		out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...)
+		out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...)
+		out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...)
+	case Normal:
+		// Collect metrics for all observed streams
+		for key, o := range t.observed {
+			// The stream key is the label values joined by sep; the user ID is
+			// appended as the trailing label value.
+			keys := strings.Split(key, string(sep))
+			keys = append(keys, t.userID)
+			if o.activeSerie.Load() > 0 {
+				out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...)
+			}
+			if o.receivedSample.Load() > 0 {
+				out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)
+			}
+			o.discardSamplemtx.Lock()
+			for reason, discarded := range o.discardedSample {
+				out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)
+			}
+			o.discardSamplemtx.Unlock()
+		}
+	}
+}
+
+// IncrementDiscardedSamples records value discarded samples, attributed to
+// reason, for the stream identified by lbs. It is a no-op on a nil tracker.
+func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) {
+	if t == nil {
+		return
+	}
+	t.updateCounters(lbs, now.Unix(), 0, 0, value, &reason)
+}
+
+// IncrementReceivedSamples records value received samples for the stream
+// identified by lbs. It is a no-op on a nil tracker.
+func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) {
+	if t == nil {
+		return
+	}
+	t.updateCounters(lbs, now.Unix(), 0, value, 0, nil)
+}
+
+// updateCounters derives the stream key from lbls and applies the given
+// increments to that stream and, when applicable, to the overflow counter.
+//
+// The key is the value of each cost attribution label, in caLabelMap index
+// order, joined by sep; labels that are absent or empty are replaced by
+// missingValue. reason is only consulted when discardedSampleIncrement > 0.
+// It is a no-op on a nil tracker.
+func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) {
+	if t == nil {
+		return
+	}
+	labelValues := make([]string, len(t.caLabels))
+	lbls.Range(func(l labels.Label) {
+		if idx, ok := t.caLabelMap[l.Name]; ok {
+			labelValues[idx] = l.Value
+		}
+	})
+	for i := 0; i < len(labelValues); i++ {
+		if labelValues[i] == "" {
+			labelValues[i] = missingValue
+		}
+	}
+
+	buf := bufferPool.Get().(*bytes.Buffer)
+	buf.Reset()
+	defer bufferPool.Put(buf)
+
+	// Build the stream key
+	for i, value := range labelValues {
+		if i > 0 {
+			buf.WriteRune(sep)
+		}
+		buf.WriteString(value)
+	}
+
+	t.obseveredMtx.Lock()
+	defer t.obseveredMtx.Unlock()
+
+	// buf.String() copies, so the key stays valid after the buffer is pooled again.
+	t.updateOverflow(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason)
+}
+
+// handleObservation updates or creates a new stream observation in the 'observed' map.
+//
+// A new stream is only created while fewer than 2*maxCardinality observations
+// exist; past that cap the per-stream update is dropped. The caller must hold
+// obseveredMtx for writing (updateCounters does).
+func (t *Tracker) handleObservation(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) {
+	if o, known := t.observed[stream]; known && o.lastUpdate != nil {
+		// Update the timestamp if needed
+		if o.lastUpdate.Load() < ts {
+			o.lastUpdate.Store(ts)
+		}
+		if activeSeriesIncrement != 0 {
+			o.activeSerie.Add(activeSeriesIncrement)
+		}
+		if receivedSampleIncrement > 0 {
+			o.receivedSample.Add(receivedSampleIncrement)
+		}
+		if discardedSampleIncrement > 0 && reason != nil {
+			o.discardSamplemtx.Lock()
+			// Accumulate into the existing per-reason counter; replacing it
+			// would throw away everything counted so far for this reason.
+			if counter, ok := o.discardedSample[*reason]; ok {
+				counter.Add(discardedSampleIncrement)
+			} else {
+				o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement)
+			}
+			o.discardSamplemtx.Unlock()
+		}
+	} else if len(t.observed) < t.maxCardinality*2 {
+		// Create a new observation for the stream
+		t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason)
+	}
+}
+
+// updateOverflow applies one update: first to the per-stream observation, then
+// to the overflow accounting. The caller must hold obseveredMtx for writing.
+func (t *Tracker) updateOverflow(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) {
+	// Update the stream in the observed map
+	t.handleObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason)
+	t.handleOverflow(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement)
+}
+
+// handleOverflow checks if the tracker has exceeded its max cardinality and updates overflow state if necessary.
+//
+// On the Normal -> Overflow transition the overflow counter is seeded with the
+// sum of active series across all observed streams and cooldownUntil is set to
+// ts + cooldownDuration. While in Overflow every update is also added to the
+// overflow counter. The caller must hold obseveredMtx for writing.
+func (t *Tracker) handleOverflow(ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) {
+	// Transition to overflow mode if maximum cardinality is exceeded.
+	previousState := t.state
+	if t.state == Normal && len(t.observed) > t.maxCardinality {
+		t.state = Overflow
+		// Initialize the overflow counter.
+		t.overflowCounter = &Observation{
+			lastUpdate:     atomic.NewInt64(ts),
+			activeSerie:    atomic.NewFloat64(0),
+			receivedSample: atomic.NewFloat64(0),
+			totalDiscarded: atomic.NewFloat64(0),
+		}
+
+		// Aggregate active series from all streams into the overflow counter.
+		for _, o := range t.observed {
+			if o != nil {
+				t.overflowCounter.activeSerie.Add(o.activeSerie.Load())
+			}
+		}
+		t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration)
+	}
+
+	if t.state == Overflow {
+		// if already in overflow mode, update the overflow counter. If it was normal mode, the active series are already applied.
+		if previousState == Overflow && activeSeriesIncrement != 0 {
+			t.overflowCounter.activeSerie.Add(activeSeriesIncrement)
+		}
+		if receivedSampleIncrement > 0 {
+			t.overflowCounter.receivedSample.Add(receivedSampleIncrement)
+		}
+		if discardedSampleIncrement > 0 {
+			t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement)
+		}
+	}
+}
+
+// createNewObservation creates a new observation in the 'observed' map.
+func (t *Tracker) createNewObservation(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) {
+	// Start the stream's counters at the values carried by this first update.
+	obs := &Observation{
+		lastUpdate:       atomic.NewInt64(ts),
+		activeSerie:      atomic.NewFloat64(activeSeriesIncrement),
+		receivedSample:   atomic.NewFloat64(receivedSampleIncrement),
+		discardedSample:  map[string]*atomic.Float64{},
+		discardSamplemtx: sync.Mutex{},
+	}
+	t.observed[stream] = obs
+
+	// A discard is only recorded when it is positive and carries a reason.
+	if discardedSampleIncrement > 0 && reason != nil {
+		obs.discardSamplemtx.Lock()
+		obs.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement)
+		obs.discardSamplemtx.Unlock()
+	}
+}
+
+// InactiveObservations returns the keys of all observations whose last update
+// is at or before deadline. It only lists them; nothing is removed here.
+// A nil tracker yields nil.
+func (t *Tracker) InactiveObservations(deadline int64) []string {
+	if t == nil {
+		return nil
+	}
+
+	t.obseveredMtx.RLock()
+	defer t.obseveredMtx.RUnlock()
+
+	var stale []string
+	for key, obs := range t.observed {
+		// Entries without a usable timestamp are never reported.
+		if obs == nil || obs.lastUpdate == nil {
+			continue
+		}
+		if obs.lastUpdate.Load() > deadline {
+			continue
+		}
+		stale = append(stale, key)
+	}
+
+	return stale
+}
+
+// UpdateMaxCardinality replaces the cardinality limit. It is a no-op on a nil tracker.
+func (t *Tracker) UpdateMaxCardinality(limit int) {
+	if t != nil {
+		t.maxCardinality = limit
+	}
+}
+
+// UpdateCooldownDuration replaces the cooldown duration. It is a no-op on a nil tracker.
+func (t *Tracker) UpdateCooldownDuration(cooldownDuration int64) {
+	if t != nil {
+		t.cooldownDuration = cooldownDuration
+	}
+}
diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go
new file mode 100644
index 00000000000..82de4e8b64c
--- /dev/null
+++ b/pkg/costattribution/tracker_test.go
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+package costattribution
+
+import (
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/prometheus/prometheus/model/labels"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// Test_GetCALabels checks that the tracker for user1 reports "team" as its
+// only cost attribution label.
+func Test_GetCALabels(t *testing.T) {
+	cat := newTestManager().TrackerForUser("user1")
+	assert.True(t, cat.CompareCALabels([]string{"team"}), "Expected cost attribution labels mismatch")
+}
+
+// Test_GetMaxCardinality checks the max cardinality the test manager gives user1.
+func Test_GetMaxCardinality(t *testing.T) {
+	cat := newTestManager().TrackerForUser("user1")
+	assert.Equal(t, 5, cat.MaxCardinality(), "Expected max cardinality mismatch")
+}
+
+// Test_CreateCleanupTracker feeds updates into the user4 tracker, then checks
+// the exported metrics after each step: initial state, purge of inactive
+// attributions, and deletion of the whole tracker.
+func Test_CreateCleanupTracker(t *testing.T) {
+	tManager := newTestManager()
+	cat := tManager.TrackerForUser("user4")
+
+	reg := prometheus.NewRegistry()
+	err := reg.Register(tManager)
+	require.NoError(t, err)
+
+	// The expected metrics below are keyed by "platform" only, so the "team"
+	// and "tenant" values passed here do not create separate streams.
+	cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0))
+	cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0))
+	cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3"), time.Unix(3, 0))
+	cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0))
+	cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0))
+
+	cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0))
+
+	expectedMetrics := `
+		# HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution.
+		# TYPE cortex_discarded_attributed_samples_total counter
+		cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2
+		# HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution.
+		# TYPE cortex_ingester_attributed_active_series gauge
+		cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1
+		cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1
+		# HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution.
+		# TYPE cortex_received_attributed_samples_total counter
+		cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5
+	`
+
+	metricNames := []string{
+		"cortex_discarded_attributed_samples_total",
+		"cortex_received_attributed_samples_total",
+		"cortex_ingester_attributed_active_series",
+	}
+	assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...))
+	// "foo" was last updated at t=4, "bar" at t=6, so only "foo" is inactive at deadline 5.
+	assert.Equal(t, []string{"foo"}, cat.InactiveObservations(5))
+	tManager.purgeInactiveAttributionsUntil(5)
+
+	expectedMetrics = `
+		# HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution.
+		# TYPE cortex_ingester_attributed_active_series gauge
+		cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1
+	`
+	assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...))
+	tManager.deleteUserTracker("user4")
+	assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...))
+}
+
+// Test_UpdateCounters checks that the user3 tracker switches from Normal to
+// Overflow on the third distinct stream, stays there, and sets cooldownUntil
+// from the timestamp of the update that triggered the switch.
+func Test_UpdateCounters(t *testing.T) {
+	cat := newTestManager().TrackerForUser("user3")
+	lbls1 := labels.FromStrings("department", "foo", "service", "bar")
+	lbls2 := labels.FromStrings("department", "bar", "service", "baz")
+	lbls3 := labels.FromStrings("department", "baz", "service", "foo")
+
+	cat.updateCounters(lbls1, 1, 1, 0, 0, nil)
+	assert.Equal(t, Normal, cat.state, "First observation, should not overflow")
+
+	cat.updateCounters(lbls2, 2, 1, 0, 0, nil)
+	assert.Equal(t, Normal, cat.state, "Second observation, should not overflow")
+
+	cat.updateCounters(lbls3, 3, 1, 0, 0, nil)
+	assert.Equal(t, Overflow, cat.state, "Third observation, should overflow")
+
+	cat.updateCounters(lbls3, 4, 1, 0, 0, nil)
+	assert.Equal(t, Overflow, cat.state, "Fourth observation, should stay overflow")
+
+	// The transition happened at ts=3; the later update at ts=4 must not move it.
+	assert.Equal(t, int64(3+cat.cooldownDuration), cat.cooldownUntil.Load(), "CooldownUntil should be updated correctly")
+}
+
+// Test_GetInactiveObservations checks which observation keys are reported as
+// inactive for a range of deadlines.
+func Test_GetInactiveObservations(t *testing.T) {
+	// Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5.
+	cat := newTestManager().TrackerForUser("user1")
+
+	// Create three observations, each with a different last update timestamp.
+	observations := []labels.Labels{
+		labels.FromStrings("team", "foo"),
+		labels.FromStrings("team", "bar"),
+		labels.FromStrings("team", "baz"),
+	}
+	// Simulate samples discarded with different timestamps.
+	cat.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0))
+	cat.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0))
+	cat.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0))
+
+	// Ensure that all three observations were successfully added to the tracker.
+	require.Len(t, cat.observed, 3)
+
+	// A deadline of 0 is earlier than every last update, so nothing is inactive yet.
+	purged := cat.InactiveObservations(0)
+	require.Len(t, purged, 0)
+
+	// Deadline 10: only "foo" (last updated at 1) is inactive.
+	purged = cat.InactiveObservations(10)
+	assert.ElementsMatch(t, []string{"foo"}, purged)
+
+	// Deadline 15: "bar" (last updated at 12) is now inactive as well.
+	purged = cat.InactiveObservations(15)
+	assert.ElementsMatch(t, []string{"foo", "bar"}, purged)
+
+	// Deadline 25 is past every last update, so all three keys are reported.
+	purged = cat.InactiveObservations(25)
+	assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged)
+}
+
+// Test_UpdateMaxCardinality checks that UpdateMaxCardinality is visible through MaxCardinality.
+func Test_UpdateMaxCardinality(t *testing.T) {
+	// user1 original max cardinality is 5
+	cat := newTestManager().TrackerForUser("user1")
+	cat.UpdateMaxCardinality(2)
+	assert.Equal(t, 2, cat.MaxCardinality(), "Expected max cardinality update to 2")
+}
+
+// Test_Concurrency issues 100 concurrent active-series increments spread over
+// 26 distinct "team" values and checks the resulting tracker state and the
+// aggregated overflow metric.
+func Test_Concurrency(t *testing.T) {
+	m := newTestManager()
+	cat := m.TrackerForUser("user1")
+
+	var wg sync.WaitGroup
+	for i := 0; i < 100; i++ {
+		wg.Add(1)
+		go func(i int) {
+			defer wg.Done()
+			lbls := labels.FromStrings("team", string(rune('A'+(i%26))))
+			cat.updateCounters(lbls, int64(i), 1, 0, 0, nil)
+		}(i)
+	}
+	wg.Wait()
+
+	// All writers have finished, so the tracker fields can be read directly here.
+	// Data races are only detected when the test is run with -race.
+	assert.True(t, len(cat.observed) > 0, "Observed set should not be empty after concurrent updates")
+	assert.LessOrEqual(t, len(cat.observed), 2*cat.MaxCardinality(), "Observed count should not exceed 2 times of max cardinality")
+	assert.Equal(t, Overflow, cat.state, "Expected state to be Overflow")
+
+	// Every one of the 100 increments must be accounted for in the overflow series.
+	expectedMetrics := `
+	# HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution.
+	# TYPE cortex_ingester_attributed_active_series gauge
+	cortex_ingester_attributed_active_series{team="__overflow__",tenant="user1",tracker="cost-attribution"} 100
+`
+	assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series"))
+}
diff --git a/pkg/distributor/allcase.txt b/pkg/distributor/allcase.txt
new file mode 100644
index 00000000000..5efb38bff35
--- /dev/null
+++ b/pkg/distributor/allcase.txt
@@ -0,0 +1,90 @@
+goos: darwin
+goarch: amd64
+pkg: github.com/grafana/mimir/pkg/distributor
+cpu: Intel(R) Core(TM) i5-1038NG7 CPU @ 2.00GHz
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 292 4093113 ns/op 1137807 B/op 5058 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 295 4286668 ns/op 1136742 B/op 5057 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 258 4621600 ns/op 1137652 B/op 5058 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 300 4381770 ns/op 1137330 B/op 5058 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 306 3978604 ns/op 1138153 B/op 5058 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 303 3889851 ns/op 1136827 B/op 5058 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 217 5309972 ns/op 1218313 B/op 6059 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 223 5308695 ns/op 1218015 B/op 6059 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 225 5686183 ns/op 1220126 B/op 6060 allocs/op
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 222 5320854 ns/op 1219277 B/op 6059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 224 5362158 ns/op 1218447 B/op 6059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 222 5352613 ns/op 1218641 B/op 6060 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 720 1637728 ns/op 324601 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 668 1699484 ns/op 324867 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 704 1650014 ns/op 324865 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 697 1678209 ns/op 324811 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 712 1679228 ns/op 324811 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 720 1650075 ns/op 325052 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 312 3780976 ns/op 1571034 B/op 7090 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 308 3830179 ns/op 1572930 B/op 7104 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 308 3778948 ns/op 1567952 B/op 7089 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 288 4163770 ns/op 1559790 B/op 7088 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 310 3775677 ns/op 1565793 B/op 7093 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 309 4826310 ns/op 1566713 B/op 7091 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 652 1911060 ns/op 165520 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 657 1825805 ns/op 167283 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 631 1823762 ns/op 166046 B/op 81 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 639 1800926 ns/op 167361 B/op 84 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 645 1801281 ns/op 165645 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 646 1813022 ns/op 166700 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1327 906046 ns/op 2407 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1261 894881 ns/op 2523 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1237 905868 ns/op 2527 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1347 883890 ns/op 2510 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1269 880076 ns/op 2520 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1333 884934 ns/op 2484 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 6823420 ns/op 1201064 B/op 5059 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 5941364 ns/op 1201755 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 6066547 ns/op 1200638 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 193 5998870 ns/op 1201690 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 201 5828347 ns/op 1201056 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 193 5906302 ns/op 1200750 B/op 5059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 291 4090687 ns/op 1590964 B/op 8098 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 291 4113064 ns/op 1589749 B/op 8091 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 265 4166235 ns/op 1583910 B/op 8096 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 283 4157170 ns/op 1583275 B/op 8099 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 237 4237111 ns/op 1586094 B/op 8093 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 285 4207373 ns/op 1585480 B/op 8095 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 550 2176540 ns/op 183504 B/op 1081 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 502 2186461 ns/op 183481 B/op 1080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 526 2187088 ns/op 181204 
B/op 1080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 504 2205968 ns/op 182120 B/op 1079 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 531 2192123 ns/op 182981 B/op 1079 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 525 2195721 ns/op 182929 B/op 1080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1226 986827 ns/op 2559 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1179 980126 ns/op 2446 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1222 971585 ns/op 2496 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1154 983680 ns/op 2541 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1215 959667 ns/op 2529 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1222 983919 ns/op 2558 B/op 45 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 181 10726471 ns/op 1226302 B/op 7062 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 170 7175109 ns/op 1224269 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 184 6481711 ns/op 1225092 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 182 6501399 ns/op 1224896 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 181 7033662 ns/op 1225391 B/op 7060 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 177 6617141 ns/op 1224477 B/op 7060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4385703 ns/op 1162346 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 272 4401598 ns/op 1161965 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4378841 ns/op 1161221 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 266 4438176 ns/op 1161650 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 268 4528658 ns/op 1161541 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 264 4430113 ns/op 1161600 B/op 7059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 6302555 ns/op 1243108 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 5960008 ns/op 1241662 B/op 8059 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 199 6671300 ns/op 1243085 B/op 8061 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 202 5823528 ns/op 1241662 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 208 5834922 ns/op 1241914 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 5758215 ns/op 1242172 B/op 8060 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 554 2115840 ns/op 348972 B/op 
6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 564 2145631 ns/op 348762 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 566 2088044 ns/op 349132 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 562 2152042 ns/op 349683 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 544 2103713 ns/op 348848 B/op 6055 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 531 2125180 ns/op 349253 B/op 6055 allocs/op +PASS +ok github.com/grafana/mimir/pkg/distributor 176.572s diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 7bf589f7bc4..3594123435d 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -34,20 +34,8 @@ import ( "github.com/grafana/dskit/services" "github.com/grafana/dskit/tenant" "github.com/grafana/dskit/user" - "github.com/opentracing/opentracing-go" - "github.com/opentracing/opentracing-go/ext" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/model/relabel" - "github.com/prometheus/prometheus/scrape" - "go.uber.org/atomic" - "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" - "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" @@ -60,6 +48,18 @@ import ( "github.com/grafana/mimir/pkg/util/pool" "github.com/grafana/mimir/pkg/util/spanlogger" "github.com/grafana/mimir/pkg/util/validation" + "github.com/opentracing/opentracing-go" + 
"github.com/opentracing/opentracing-go/ext" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "github.com/prometheus/prometheus/scrape" + "go.uber.org/atomic" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" ) func init() { @@ -112,6 +112,7 @@ type Distributor struct { distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 + costAttributionMgr *costattribution.Manager // For handling HA replicas. HATracker haTracker @@ -328,7 +329,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -349,6 +350,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove ingesterPool: NewPool(cfg.PoolConfig, ingestersRing, cfg.IngesterClientFactory, log), healthyInstancesCount: atomic.NewUint32(0), limits: limits, + costAttributionMgr: costAttributionMgr, ingestionRate: 
util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -742,13 +744,16 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. -func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { + +func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Samples) == 0 { return nil } + cat := d.costAttributionMgr.TrackerForUser(userID) + if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0]) + return validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -762,7 +767,7 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese } timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s); err != nil { + if err := validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } @@ -782,13 +787,14 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. 
-func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { +func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Histograms) == 0 { return nil } + cat := d.costAttributionMgr.TrackerForUser(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { return err } @@ -801,6 +807,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim timestamps := make(map[int64]struct{}, min(len(ts.Histograms), 100)) currPos := 0 histogramsUpdated := false + for idx := range ts.Histograms { if _, ok := timestamps[ts.Histograms[idx].Timestamp]; ok { // A sample with the same timestamp has already been validated, so we skip it. @@ -809,7 +816,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) if err != nil { return err } @@ -873,10 +880,10 @@ func (d *Distributor) validateExemplars(ts *mimirpb.PreallocTimeseries, userID s // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. 
func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) (bool, error) { - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation); err != nil { + cat := d.costAttributionMgr.TrackerForUser(userID) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } - now := model.TimeFromUnixNano(nowt.UnixNano()) totalSamplesAndHistograms := len(ts.Samples) + len(ts.Histograms) @@ -966,7 +973,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), time.Now()) + tnow := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), tnow) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -980,6 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, tnow) } return err @@ -1237,6 +1246,9 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { + if len(req.Timeseries) > 0 { + 
d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1817,9 +1829,11 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int + for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) + d.costAttributionMgr.TrackerForUser(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 76a27fff797..f115e9626f6 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -54,6 +54,7 @@ import ( "google.golang.org/grpc/metadata" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -2114,7 +2115,7 @@ func mkLabels(n int, extra ...string) []mimirpb.LabelAdapter { ret[i+1] = mimirpb.LabelAdapter{Name: fmt.Sprintf("name_%d", i), Value: fmt.Sprintf("value_%d", i)} } for i := 0; i < len(extra); i += 2 { - ret[i+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} + ret[i/2+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} } slices.SortFunc(ret, func(a, b mimirpb.LabelAdapter) int { switch { @@ -2147,7 +2148,7 @@ 
func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2168,7 +2169,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2188,7 +2189,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(31) + metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2209,7 +2210,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long name. - metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx") + metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx", "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2230,7 +2231,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long value. 
- metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1)) + metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1), "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2250,7 +2251,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().Add(time.Hour).UnixNano() / int64(time.Millisecond), @@ -2261,7 +2262,7 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a sample whose timestamp is too far in the future", }, - "all samples go to metric_relabel_configs": { + "all samples go to metric relabel configs": { prepareConfig: func(limits *validation.Limits) { limits.MetricRelabelConfigs = []*relabel.Config{ { @@ -2278,7 +2279,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2291,78 +2292,110 @@ func BenchmarkDistributor_Push(b *testing.B) { }, } - for testName, testData := range tests { - b.Run(testName, func(b *testing.B) { - // Create an in-memory KV store for the ring with 1 ingester registered. 
- kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) - b.Cleanup(func() { assert.NoError(b, closer.Close()) }) + costAttributionCases := []struct { + state string + customRegistry *prometheus.Registry + cfg func(limits *validation.Limits) + }{ + { + state: "disabled", + customRegistry: nil, + cfg: func(_ *validation.Limits) {}, + }, + { + state: "enabled", + customRegistry: prometheus.NewRegistry(), + cfg: func(limits *validation.Limits) { + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + }, + } - err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, - func(_ interface{}) (interface{}, bool, error) { - d := &ring.Desc{} - d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) - return d, true, nil - }, - ) - require.NoError(b, err) - - ingestersRing, err := ring.New(ring.Config{ - KVStore: kv.Config{Mock: kvStore}, - HeartbeatTimeout: 60 * time.Minute, - ReplicationFactor: 1, - }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) - }) + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for testName, testData := range tests { + b.Run(fmt.Sprintf("scenario=%s", testName), func(b *testing.B) { + // Create an in-memory KV store for the ring with 1 ingester registered. 
+ kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + b.Cleanup(func() { assert.NoError(b, closer.Close()) }) - test.Poll(b, time.Second, 1, func() interface{} { - return ingestersRing.InstancesCount() - }) + err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, + func(_ interface{}) (interface{}, bool, error) { + d := &ring.Desc{} + d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) + return d, true, nil + }, + ) + require.NoError(b, err) + + ingestersRing, err := ring.New(ring.Config{ + KVStore: kv.Config{Mock: kvStore}, + HeartbeatTimeout: 60 * time.Minute, + ReplicationFactor: 1, + }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) + }) - // Prepare the distributor configuration. - var distributorCfg Config - var clientConfig client.Config - limits := validation.Limits{} - flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) - distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" + test.Poll(b, time.Second, 1, func() interface{} { + return ingestersRing.InstancesCount() + }) - limits.IngestionRate = float64(rate.Inf) // Unlimited. - testData.prepareConfig(&limits) + // Prepare the distributor configuration. + var distributorCfg Config + var clientConfig client.Config + limits := validation.Limits{} + flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) + distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" - distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { - return &noopIngester{}, nil - }) + limits.IngestionRate = float64(rate.Inf) // Unlimited. 
+ testData.prepareConfig(&limits) - overrides, err := validation.NewOverrides(limits, nil) - require.NoError(b, err) + distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { + return &noopIngester{}, nil + }) - // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) + caCase.cfg(&limits) + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(b, err) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) - }) + // Initialize the cost attribution manager + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } - // Prepare the series to remote write before starting the benchmark. - metrics, samples := testData.prepareSeries() + // Start the distributor. + distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) - // Run the benchmark. - b.ReportAllocs() - b.ResetTimer() + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) + }) - for n := 0; n < b.N; n++ { - _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + // Prepare the series to remote write before starting the benchmark. 
+ metrics, samples := testData.prepareSeries() - if testData.expectedErr == "" && err != nil { - b.Fatalf("no error expected but got %v", err) - } - if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { - b.Fatalf("expected %v error but got %v", testData.expectedErr, err) - } + // Run the benchmark. + b.ReportAllocs() + b.ResetTimer() + + for n := 0; n < b.N; n++ { + _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + + if testData.expectedErr == "" && err != nil { + b.Fatalf("no error expected but got %v", err) + } + if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { + b.Fatalf("expected %v error but got %v", testData.expectedErr, err) + } + } + }) } }) } @@ -5627,7 +5660,7 @@ func prepare(t testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []* require.NoError(t, err) reg := prometheus.NewPedanticRegistry() - d, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) + d, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, d)) t.Cleanup(func() { @@ -8263,7 +8296,7 @@ func TestCheckStartedMiddleware(t *testing.T) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - distributor, err := New(distributorConfig, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorConfig, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "user") diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index ab9426513ad..8b9849ba730 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ 
-16,6 +16,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/extract" "github.com/grafana/mimir/pkg/util/globalerror" @@ -238,15 +239,17 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -257,20 +260,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. 
// The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -284,6 +290,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { + 
cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -291,6 +298,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -392,14 +400,16 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. -func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, 
removeNonASCIIChars(unsafeMetricName)) } @@ -408,11 +418,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -424,17 +436,21 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, strings.ToValidUTF8(l.Value, ""), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, 
reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index df4de2dd60f..c84ed0b58a8 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -60,6 +60,7 @@ func (vm validateMetadataCfg) MaxMetadataLength(_ string) int { } func TestValidateLabels(t *testing.T) { + ts := time.Now() reg := prometheus.NewPedanticRegistry() s := newSampleValidationMetrics(reg) @@ -222,7 +223,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, nil, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -416,17 +417,17 @@ func TestValidateMetadata(t *testing.T) { } func TestValidateLabelDuplication(t *testing.T) { + ts := time.Now() var cfg validateLabelsCfg cfg.maxLabelNameLength = 10 cfg.maxLabelNamesPerSeries = 10 cfg.maxLabelValueLength = 10 userID := "testUser" - actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, false) + }, false, false, nil, ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -443,7 +444,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: 
model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false) + }, false, false, nil, ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -594,7 +595,6 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { registry := prometheus.NewRegistry() metrics := newSampleValidationMetrics(registry) - for _, limit := range []int{0, 1, 2} { for name, h := range testCases { t.Run(fmt.Sprintf("limit-%d-%s", limit, name), func(t *testing.T) { @@ -602,7 +602,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) if limit == 1 { require.Error(t, err) @@ -649,7 +649,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) require.Equal(t, testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index aa7f928d7dd..6fdf3e00bc4 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) memPostings := index.NewMemPostings() for i, l := range series { @@ -51,10 +51,10 @@ 
func TestIsLabelValueActive(t *testing.T) { // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) require.True(t, valid) result, err := IsLabelValueActive(ctx, reader, activeSeries, "a", "1") diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 665f5787c61..2b95020c68d 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -34,10 +34,10 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { if i+1 == 3 || i+1 == 4 { buckets = 10 // Native histogram with 10 buckets. 
} - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -62,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -70,10 +70,10 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { if i == 2 || i == 3 { buckets = i * 10 // Native histogram with i*10 buckets. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 5, allActive) @@ -106,17 +106,18 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 if i+1 == 4 { buckets = -1 // Make ref==4 not a native histogram to check that Seek skips it. 
} - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -145,14 +146,15 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -181,14 +183,14 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a2345841d11..84c71634e72 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,13 +26,14 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -57,13 +58,14 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -88,13 +90,14 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 71044b5e348..e7895404a22 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -13,10 +13,12 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -44,10 +46,11 @@ type ActiveSeries struct { stripes [numStripes]seriesStripe deleted deletedSeries - // matchersMutex protects 
matchers and lastMatchersUpdate. - matchersMutex sync.RWMutex - matchers *asmodel.Matchers - lastMatchersUpdate time.Time + // configMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat + configMutex sync.RWMutex + matchers *asmodel.Matchers + cat *costattribution.Tracker + lastConfigUpdate time.Time // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -63,8 +66,8 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 - + oldestEntryTs atomic.Int64 + cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -84,50 +87,61 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout} +func NewActiveSeries( + asm *asmodel.Matchers, + timeout time.Duration, + cat *costattribution.Tracker, +) *ActiveSeries { + c := &ActiveSeries{ + matchers: asm, timeout: timeout, cat: cat, + } // Stripes are pre-allocated so that we only read on them and no lock is required. 
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, cat) } return c } func (c *ActiveSeries) CurrentMatcherNames() []string { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.matchers.MatcherNames() } +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { + currentCTC, currentCAT := c.CurrentConfig() + // TODO: I think here to check the pointer is not equal is already enough, if we recreate tracker, it is for a good reason, otherwise, nothing changed + return ctCfg.String() != currentCTC.String() || caCfg != currentCAT //|| !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) +} + func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() + c.configMutex.Lock() + defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm - c.lastMatchersUpdate = now + c.lastConfigUpdate = now } -func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() - return c.matchers.Config() +func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattribution.Tracker) { + c.configMutex.RLock() + defer c.configMutex.RUnlock() + return c.matchers.Config(), c.cat } // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. 
-func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int) { +func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes - created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { deletedStripeID := deleted.ref % numStripes - c.stripes[deletedStripeID].remove(deleted.ref) + c.stripes[deletedStripeID].remove(deleted.ref, idx) } } } @@ -149,19 +163,19 @@ func (c *ActiveSeries) PostDeletion(deleted map[chunks.HeadSeriesRef]labels.Labe // Purge purges expired entries and returns true if enough time has passed since // last reload. This should be called periodically to avoid unbounded memory // growth. -func (c *ActiveSeries) Purge(now time.Time) bool { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() +func (c *ActiveSeries) Purge(now time.Time, idx tsdb.IndexReader) bool { + c.configMutex.Lock() + defer c.configMutex.Unlock() purgeTime := now.Add(-c.timeout) - c.purge(purgeTime) + c.purge(purgeTime, idx) - return !c.lastMatchersUpdate.After(purgeTime) + return !c.lastConfigUpdate.After(purgeTime) } // purge removes expired entries from the cache. -func (c *ActiveSeries) purge(keepUntil time.Time) { +func (c *ActiveSeries) purge(keepUntil time.Time, idx tsdb.IndexReader) { for s := 0; s < numStripes; s++ { - c.stripes[s].purge(keepUntil) + c.stripes[s].purge(keepUntil, idx) } } @@ -196,8 +210,8 @@ func (c *ActiveSeries) Active() (total, totalNativeHistograms, totalNativeHistog // of buckets in those active native histogram series. This method does not purge // expired entries, so Purge should be called periodically. 
func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, totalNativeHistograms int, totalMatchingNativeHistograms []int, totalNativeHistogramBuckets int, totalMatchingNativeHistogramBuckets []int) { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() totalMatching = make([]int, len(c.matchers.MatcherNames())) totalMatchingNativeHistograms = make([]int, len(c.matchers.MatcherNames())) @@ -212,9 +226,9 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } -func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { +func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef, idx tsdb.IndexReader) { stripeID := storage.SeriesRef(ref) % numStripes - c.stripes[stripeID].remove(storage.SeriesRef(ref)) + c.stripes[stripeID].remove(storage.SeriesRef(ref), idx) } func (c *ActiveSeries) Clear() { @@ -394,6 +408,9 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } + // here if we have a cost attribution label, we can split the serie count based on the value of the label + // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly + s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -415,10 +432,13 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. 
-func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries) { +func (s *seriesStripe) reinitialize( + asm *asmodel.Matchers, + deleted *deletedSeries, + cat *costattribution.Tracker, +) { s.mu.Lock() defer s.mu.Unlock() - s.deleted = deleted s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} @@ -429,9 +449,10 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) + s.cat = cat } -func (s *seriesStripe) purge(keepUntil time.Time) { +func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { keepUntilNanos := keepUntil.UnixNano() if oldest := s.oldestEntryTs.Load(); oldest > 0 && keepUntilNanos <= oldest { // Nothing to do. @@ -449,12 +470,21 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) + buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { if entry.deleted { s.deleted.purge(ref) } + + if idx != nil { + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(buf.Labels(), keepUntil) + } delete(s.refs, ref) continue } @@ -489,7 +519,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { // This is mostly the same logic from purge() but we decrement counters for a single entry instead of incrementing for each entry. 
// Note: we might remove the oldest series here, but the worst thing can happen is that we let run a useless purge() cycle later, // so this method doesn't update the oldestEntryTs. -func (s *seriesStripe) remove(ref storage.SeriesRef) { +func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.mu.Lock() defer s.mu.Unlock() @@ -502,6 +532,14 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- + if idx != nil { + buf := labels.NewScratchBuilder(10) + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(buf.Labels(), time.Now()) + } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index cf821c5bca5..ca36450f823 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -37,10 +37,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. 
- - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - - valid := c.Purge(time.Now()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -50,8 +48,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveBuckets) assert.Empty(t, activeMatchingBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -62,8 +60,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -74,8 +72,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -86,8 +84,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, 
time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -98,8 +96,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 5, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -111,8 +109,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 8, allActiveBuckets) // more buckets for a histogram - c.UpdateSeries(ls3, ref3, time.Now(), 7) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 7, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -124,8 +122,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 10, allActiveBuckets) // changing a metric from histogram to float - c.UpdateSeries(ls4, ref4, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -150,7 +148,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -162,7 +160,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // ref5 is created with the same labelset as ls1, it shouldn't be accounted as different series. - c.UpdateSeries(ls1, ref5, time.Now(), -1) + c.UpdateSeries(ls1, ref5, time.Now(), -1, nil) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) assert.Equal(t, 1, allActiveHistograms) @@ -173,7 +171,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -204,19 +202,19 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // The expected number of series is the total number of series minus the ttl // because the first ttl series should be purged exp := len(series) - (ttl) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, 
allActive) @@ -231,7 +229,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) } @@ -243,7 +241,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") ref6 := storage.SeriesRef(6) // same as ls2 - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -257,8 +255,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -272,8 +270,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -287,8 +285,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, 
time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -302,8 +300,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -317,8 +315,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -332,8 +330,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 3, allActiveBuckets) - c.UpdateSeries(ls5, ref5, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls5, ref5, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -348,8 +346,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 8, allActiveBuckets) // changing a metric from float to histogram - c.UpdateSeries(ls3, 
ref3, time.Now(), 6) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 6, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -364,8 +362,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 14, allActiveBuckets) // fewer (zero) buckets for a histogram - c.UpdateSeries(ls4, ref4, time.Now(), 0) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 0, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -397,7 +395,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -412,7 +410,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // ls2 is pushed again, this time with ref6 - c.UpdateSeries(ls2, ref6, time.Now(), -1) + c.UpdateSeries(ls2, ref6, time.Now(), -1, nil) // Numbers don't change. allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -427,7 +425,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -448,7 +446,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) c.Clear() @@ -488,12 +486,11 @@ func labelsWithHashCollision() (labels.Labels, labels.Labels) { func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -517,22 +514,22 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ deletedRef: deletedLabels, }) - c.purge(time.Unix(int64(ttl), 0)) + 
c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) exp := len(series) - (ttl) // Purge is not intended to purge - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -563,13 +560,13 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute) + c := NewActiveSeries(asm, 5*time.Minute, nil) exp := len(series) - ttl expMatchingSeries := 0 for i, s := range series { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) // if this series is matching, and they're within the ttl tmp := asm.Matches(s) @@ -578,11 +575,11 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { } } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. 
- c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -596,28 +593,28 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) - c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 1, allActive) - c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) // This will *not* update the series, since there is already newer timestamp. 
- c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1) + c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -632,30 +629,30 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) assert.Equal(t, []int{1}, activeMatching) c.ReloadMatchers(asm, currentTime) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.False(t, valid) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls1, ref1, currentTime, -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -666,8 +663,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. 
currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls3, ref3, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls3, ref3, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -681,8 +678,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls4, ref4, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls4, ref4, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -698,15 +695,15 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -717,10 +714,10 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. 
currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -736,16 +733,15 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) currentTime := time.Now() - - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -757,11 +753,11 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -790,7 +786,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. 
- c = NewActiveSeries(&asmodel.Matchers{}, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -824,7 +820,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo nextSeriesID = 0 } - c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1) + c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1, nil) } }(i) } @@ -841,7 +837,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo case <-stopPurge: return default: - c.Purge(future()) + c.Purge(future(), nil) } // Throttle, but keep high pressure from Purge(). @@ -928,10 +924,10 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { - c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) + c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) now++ } } @@ -953,7 +949,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} @@ -968,13 +964,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Prepare series for ix, s := range series { if ix < numExpiresSeries { - c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1) + c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1, nil) } else { - c.UpdateSeries(s, refs[ix], currentTime, -1) + c.UpdateSeries(s, refs[ix], currentTime, -1, nil) } } - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, 
_, _, _, _ := c.ActiveWithMatchers() assert.Equal(b, numSeries, allActive) @@ -982,13 +978,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Purge is going to purge everything currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) if twice { - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 08edb6ab54c..2b3561a3530 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -52,6 +52,7 @@ import ( "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester/activeseries" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" @@ -314,6 +315,8 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService + costAttributionMgr *costattribution.Manager + tsdbMetrics *tsdbMetrics forceCompactTrigger chan requestWithUsersAndCallback @@ -368,8 +371,9 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), + tsdbs: make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), + bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), @@ -382,7 +386,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. 
-func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -391,6 +395,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -783,10 +788,13 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() { + newCostAttributionTracker := i.costAttributionMgr.TrackerForUser(userID) + if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - valid := userDB.activeSeries.Purge(now) + + idx, _ := userDB.Head().Index() + valid := userDB.activeSeries.Purge(now, idx) if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. 
i.metrics.activeSeriesLoading.WithLabelValues(userID).Set(1) @@ -1159,7 +1167,6 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Note that we don't .Finish() the span in this method on purpose spanlog := spanlogger.FromContext(ctx, i.logger) spanlog.DebugLog("event", "acquired append lock") - var ( startAppend = time.Now() @@ -1190,48 +1197,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() 
softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, - func() { + func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1239,30 +1254,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { 
stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + 
i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1377,7 +1397,6 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, errProcessor *mimir_storage.SoftAppendErrorProcessor, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { - // Fetch limits once per push request both to avoid processing half the request differently. var ( nativeHistogramsIngestionEnabled = i.limits.NativeHistogramsIngestionEnabled(userID) @@ -1390,6 +1409,11 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels + + // idx is used to decrease active series count in case of error for cost attribution. + idx, _ := i.getTSDB(userID).Head().Index() + // TODO: deal with the error here + for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). 
@@ -1405,8 +1429,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre allOutOfBoundsHistograms(ts.Histograms, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) - stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) + stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1424,10 +1449,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // ignore native histograms in the condition and statitics as well if outOfOrderWindow <= 0 && minAppendTimeAvailable && len(ts.Exemplars) == 0 && len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { - stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -1548,7 +1572,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } if activeSeries != nil && stats.succeededSamplesCount > oldSucceededSamplesCount { - activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets) + activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets, idx) } if len(ts.Exemplars) > 0 && i.limits.MaxGlobalExemplarsPerUser(userID) > 0 { @@ -2642,8 +2666,12 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD } userDB := &userTSDB{ - userID: 
userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout), + userID: userID, + activeSeries: activeseries.NewActiveSeries( + asmodel.NewMatchers(matchersConfig), + i.cfg.ActiveSeriesMetrics.IdleTimeout, + i.costAttributionMgr.TrackerForUser(userID), + ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3243,7 +3271,12 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now } // Purge the active series so that the next call to Active() will return the up-to-date count. - db.activeSeries.Purge(now) + idx, err := db.Head().Index() + if err != nil { + level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) + continue + } + db.activeSeries.Purge(now, idx) // Estimate the number of series that would be dropped from the TSDB Head if we would // compact the head up until "now - active series idle timeout". diff --git a/pkg/ingester/ingester_early_compaction_test.go b/pkg/ingester/ingester_early_compaction_test.go index 531d8a673f0..822e13374ee 100644 --- a/pkg/ingester/ingester_early_compaction_test.go +++ b/pkg/ingester/ingester_early_compaction_test.go @@ -129,7 +129,7 @@ func TestIngester_compactBlocksToReduceInMemorySeries_ShouldTriggerCompactionOnl require.Len(t, listBlocksInDir(t, userBlocksDir), 0) // Use a trick to track all series we've written so far as "inactive". - ingester.getTSDB(userID).activeSeries.Purge(now.Add(30 * time.Minute)) + ingester.getTSDB(userID).activeSeries.Purge(now.Add(30*time.Minute), nil) // Pre-condition check. 
require.Equal(t, uint64(10), ingester.getTSDB(userID).Head().NumSeries()) diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 4a529321155..fcf79dd4bc7 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -650,7 +650,7 @@ func createTestIngesterWithIngestStorage(t testing.TB, ingesterCfg *Config, over require.NoError(t, services.StopAndAwaitTerminated(ctx, prw)) }) - ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, reg, util_test.NewTestingLogger(t)) + ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, nil, reg, util_test.NewTestingLogger(t)) require.NoError(t, err) return ingester, kafkaCluster, prw diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 6d03bc83535..b4bbe219fd7 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -60,6 +60,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -3589,53 +3590,114 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) { } func BenchmarkIngesterPush(b *testing.B) { - registry := prometheus.NewRegistry() - ctx := user.InjectOrgID(context.Background(), userID) + costAttributionCases := []struct { + state string + limitsCfg func(*validation.Limits) + customRegistry *prometheus.Registry + }{ + { + state: "enabled", + limitsCfg: func(*validation.Limits) {}, + customRegistry: nil, + }, + { + state: "disabled", + limitsCfg: func(limits *validation.Limits) { + if limits == nil { + return + } + limits.CostAttributionLabels = []string{"cpu"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + customRegistry: prometheus.NewRegistry(), + }, + } - // Create a mocked ingester - cfg := 
defaultIngesterTestConfig(b) + tests := []struct { + name string + limitsCfg func() validation.Limits + }{ + { + name: "ingester push succeeded", + limitsCfg: func() validation.Limits { + limitsCfg := defaultLimitsTestConfig() + limitsCfg.NativeHistogramsIngestionEnabled = true + return limitsCfg + }, + }, + } - ingester, err := prepareIngesterWithBlocksStorage(b, cfg, nil, registry) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - defer services.StopAndAwaitTerminated(context.Background(), ingester) //nolint:errcheck + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for _, t := range tests { + b.Run(fmt.Sprintf("scenario=%s", t.name), func(b *testing.B) { + registry := prometheus.NewRegistry() + ctx := user.InjectOrgID(context.Background(), userID) - // Wait until the ingester is healthy - test.Poll(b, 100*time.Millisecond, 1, func() interface{} { - return ingester.lifecycler.HealthyInstancesCount() - }) + // Create a mocked ingester + cfg := defaultIngesterTestConfig(b) - // Push a single time series to set the TSDB min time. 
- metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} - startTime := util.TimeToMillis(time.Now()) + limitCfg := t.limitsCfg() + caCase.limitsCfg(&limitCfg) - currTimeReq := mimirpb.ToWriteRequest( - metricLabelAdapters, - []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, - nil, - nil, - mimirpb.API, - ) - _, err = ingester.Push(ctx, currTimeReq) - require.NoError(b, err) + overrides, err := validation.NewOverrides(limitCfg, nil) + require.NoError(b, err) - const ( - series = 10 - samples = 1 - ) + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } + + ingester, err := prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - allLabels, allSamples := benchmarkData(series) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) + }) - b.ResetTimer() - for iter := 0; iter < b.N; iter++ { - // Bump the timestamp on each of our test samples each time round the loop - for j := 0; j < samples; j++ { - for i := range allSamples { - allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + // Wait until the ingester is healthy + test.Poll(b, 100*time.Millisecond, 1, func() interface{} { + return ingester.lifecycler.HealthyInstancesCount() + }) + + // Push a single time series to set the TSDB min time. 
+ metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + startTime := util.TimeToMillis(time.Now()) + + currTimeReq := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, + nil, + nil, + mimirpb.API, + ) + _, err = ingester.Push(ctx, currTimeReq) + require.NoError(b, err) + + // Benchmark 5000 series with 10 samples each. + const ( + series = 5000 + samples = 10 + ) + + allLabels, allSamples := benchmarkData(series) + + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + // Bump the timestamp on each of our test samples each time round the loop + for j := 0; j < samples; j++ { + for i := range allSamples { + allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + } + _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + require.NoError(b, err) + } + } + }) } - _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) - require.NoError(b, err) - } + }) } } @@ -6232,10 +6294,14 @@ func prepareIngesterWithBlocksStorageAndLimits(t testing.TB, ingesterCfg Config, } func prepareIngesterWithBlockStorageAndOverrides(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { - return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer) + return prepareIngesterWithBlockStorageOverridesAndCostAttribution(t, ingesterCfg, overrides, ingestersRing, dataDir, bucketDir, registerer, nil) +} + +func prepareIngesterWithBlockStorageOverridesAndCostAttribution(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { + return
prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer, cam) } -func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { +func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { // Create a data dir if none has been provided. if dataDir == "" { dataDir = t.TempDir() @@ -6256,7 +6322,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, cam, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } @@ -6462,7 +6528,7 @@ func TestIngester_OpenExistingTSDBOnStartup(t *testing.T) { // setup the tsdbs dir testData.setup(t, tempDir) - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) startErr := services.StartAndAwaitRunning(context.Background(), ingester) @@ -7622,7 +7688,7 @@ func TestHeadCompactionOnStartup(t *testing.T) { 
ingesterCfg.BlocksStorageConfig.Bucket.S3.Endpoint = "localhost" ingesterCfg.BlocksStorageConfig.TSDB.Retention = 2 * 24 * time.Hour // Make sure that no newly created blocks are deleted. - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingester)) diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 5a3ed82c28c..2f31f41892e 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,12 +619,14 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 + idx, _ := u.Head().Index() + // TODO: deal with the err here u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) { count++ } else { - u.activeSeries.Delete(refs[i]) + u.activeSeries.Delete(refs[i], idx) } } }) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 7bcd3eac250..31baea29e7e 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -52,6 +52,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -148,6 +149,9 @@ type Config struct { Common CommonConfig `yaml:"common"` TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` 
+ CostAttributionRegistryPath string `yaml:"cost_attribution_registry_path" category:"experimental"` } // RegisterFlags registers flags. @@ -173,6 +177,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") + f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) @@ -739,6 +745,8 @@ type Mimir struct { BlockBuilderScheduler *blockbuilderscheduler.BlockBuilderScheduler ContinuousTestManager *continuoustest.Manager BuildInfoHandler http.Handler + + CostAttributionManager *costattribution.Manager } // New makes a new Mimir. 
diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 697501af98f..127a771b889 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -43,6 +43,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -80,6 +81,7 @@ const ( OverridesExporter string = "overrides-exporter" Server string = "server" ActiveGroupsCleanupService string = "active-groups-cleanup-service" + CostAttributionService string = "cost-attribution-service" Distributor string = "distributor" DistributorService string = "distributor-service" Ingester string = "ingester" @@ -462,7 +464,9 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.Cfg.Distributor.PreferAvailabilityZone = t.Cfg.Querier.PreferAvailabilityZone t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.ActiveGroupsCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing, canJoinDistributorsRing, t.Registerer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, + t.ActiveGroupsCleanup, t.CostAttributionManager, t.IngesterRing, t.IngesterPartitionInstanceRing, + canJoinDistributorsRing, t.Registerer, util_log.Logger) if err != nil { return } @@ -644,6 +648,18 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { return t.ActiveGroupsCleanup, nil } +func (t *Mimir) initCostAttributionService() (services.Service, error) { + // The cost attribution service is only initialized if the custom registry path is provided.
+ if t.Cfg.CostAttributionRegistryPath != "" { + reg := prometheus.NewRegistry() + var err error + t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.API.RegisterCostAttribution(t.Cfg.CostAttributionRegistryPath, reg) + return t.CostAttributionManager, err + } + return nil, nil +} + func (t *Mimir) tsdbIngesterConfig() { t.Cfg.Ingester.BlocksStorageConfig = t.Cfg.BlocksStorage } @@ -655,7 +671,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionManager, t.Registerer, util_log.Logger) if err != nil { return } @@ -1138,6 +1154,7 @@ func (t *Mimir) setupModuleManager() error { mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(ActiveGroupsCleanupService, t.initActiveGroupsCleanupService, modules.UserInvisibleModule) + mm.RegisterModule(CostAttributionService, t.initCostAttributionService, modules.UserInvisibleModule) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) mm.RegisterModule(Ingester, t.initIngester) @@ -1178,9 +1195,10 @@ func (t *Mimir) setupModuleManager() error { Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, - DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, + DistributorService: {IngesterRing, 
IngesterPartitionRing, Overrides, Vault, CostAttributionService}, + CostAttributionService: {API, Overrides}, Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, - IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, + IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, Querier: {TenantFederation, Vault}, diff --git a/pkg/storage/soft_append_error_processor.go b/pkg/storage/soft_append_error_processor.go index 0f02131537d..6fdda3ae588 100644 --- a/pkg/storage/soft_append_error_processor.go +++ b/pkg/storage/soft_append_error_processor.go @@ -22,7 +22,7 @@ type SoftAppendErrorProcessor struct { errTooOldSample func(int64, []mimirpb.LabelAdapter) sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter) errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter) - maxSeriesPerUser func() + maxSeriesPerUser func(labels []mimirpb.LabelAdapter) maxSeriesPerMetric func(labels []mimirpb.LabelAdapter) errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter) errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter) @@ -39,7 +39,7 @@ func NewSoftAppendErrorProcessor( errTooOldSample func(int64, []mimirpb.LabelAdapter), sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter), errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter), - maxSeriesPerUser func(), + maxSeriesPerUser func([]mimirpb.LabelAdapter), maxSeriesPerMetric func(labels []mimirpb.LabelAdapter), errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter), errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter), @@ -89,7 +89,7 @@ func (e *SoftAppendErrorProcessor) ProcessErr(err error, ts int64, labels []mimi e.errDuplicateSampleForTimestamp(ts, labels) return true case 
errors.Is(err, globalerror.MaxSeriesPerUser): - e.maxSeriesPerUser() + e.maxSeriesPerUser(labels) return true case errors.Is(err, globalerror.MaxSeriesPerMetric): e.maxSeriesPerMetric(labels) diff --git a/pkg/streamingpromql/benchmarks/comparison_test.go b/pkg/streamingpromql/benchmarks/comparison_test.go index 5b26a5d6c45..4b147583d31 100644 --- a/pkg/streamingpromql/benchmarks/comparison_test.go +++ b/pkg/streamingpromql/benchmarks/comparison_test.go @@ -237,7 +237,7 @@ func createIngesterQueryable(t testing.TB, address string) storage.Queryable { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, ingestersRing, nil, false, nil, logger) + d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, nil, ingestersRing, nil, false, nil, logger) require.NoError(t, err) queryMetrics := stats.NewQueryMetrics(nil) diff --git a/pkg/streamingpromql/benchmarks/ingester.go b/pkg/streamingpromql/benchmarks/ingester.go index 6f3b5f04a9a..9107b66f64f 100644 --- a/pkg/streamingpromql/benchmarks/ingester.go +++ b/pkg/streamingpromql/benchmarks/ingester.go @@ -96,7 +96,7 @@ func startBenchmarkIngester(rootDataDir string) (*ingester.Ingester, string, fun return services.StopAndAwaitTerminated(context.Background(), ingestersRing) }) - ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, log.NewNopLogger()) + ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, nil, log.NewNopLogger()) if err != nil { cleanup() return nil, "", nil, fmt.Errorf("could not create ingester: %w", err) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index b12fc465fd0..9fc26f99b71 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -62,6 +62,8 @@ const ( QueryIngestersWithinFlag = "querier.query-ingesters-within" AlertmanagerMaxGrafanaConfigSizeFlag = 
"alertmanager.max-grafana-config-size-bytes" AlertmanagerMaxGrafanaStateSizeFlag = "alertmanager.max-grafana-state-size-bytes" + costAttributionLabelsFlag = "validation.cost-attribution-labels" + maxCostAttributionLabelsPerUserFlag = "validation.max-cost-attribution-labels-per-user" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -70,6 +72,7 @@ const ( var ( errInvalidIngestStorageReadConsistency = fmt.Errorf("invalid ingest storage read consistency (supported values: %s)", strings.Join(api.ReadConsistencies, ", ")) errInvalidMaxEstimatedChunksPerQueryMultiplier = errors.New("invalid value for -" + MaxEstimatedChunksPerQueryMultiplierFlag + ": must be 0 or greater than or equal to 1") + errCostAttributionLabelsLimitExceeded = errors.New("invalid value for -" + costAttributionLabelsFlag + ": exceeds the limit defined by -" + maxCostAttributionLabelsPerUserFlag) ) // LimitError is a marker interface for the errors that do not comply with the specified limits. @@ -187,6 +190,12 @@ type Limits struct { LabelValuesMaxCardinalityLabelNamesPerRequest int `yaml:"label_values_max_cardinality_label_names_per_request" json:"label_values_max_cardinality_label_names_per_request"` ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` + // Cost attribution and limit. 
+ CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionLabelsPerUser int `yaml:"max_cost_attribution_labels_per_user" json:"max_cost_attribution_labels_per_user" category:"experimental"` + MaxCostAttributionCardinalityPerUser int `yaml:"max_cost_attribution_cardinality_per_user" json:"max_cost_attribution_cardinality_per_user" category:"experimental"` + CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` + // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` @@ -300,6 +309,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. 
Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.") + f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user.") + f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") + f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable") @@ -476,6 +489,10 @@ func (l *Limits) validate() error { return errInvalidIngestStorageReadConsistency } + if len(l.CostAttributionLabels) > l.MaxCostAttributionLabelsPerUser { + return errCostAttributionLabelsLimitExceeded + } + return nil } @@ -797,6 +814,22 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } +func (o *Overrides) CostAttributionLabels(userID string) []string { + return o.getOverridesForUser(userID).CostAttributionLabels +} + +func (o *Overrides) MaxCostAttributionLabelsPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionLabelsPerUser +} + +func (o *Overrides) CostAttributionCooldown(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).CostAttributionCooldown) +} + +func (o *Overrides) MaxCostAttributionCardinalityPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionCardinalityPerUser +} + // IngestionTenantShardSize returns the ingesters shard size for a given user. 
func (o *Overrides) IngestionTenantShardSize(userID string) int { return o.getOverridesForUser(userID).IngestionTenantShardSize diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 9dc82df2d05..c56cb1ab026 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -1076,6 +1076,12 @@ metric_relabel_configs: cfg: `ingest_storage_read_consistency: xyz`, expectedErr: errInvalidIngestStorageReadConsistency.Error(), }, + "should fail when cost_attribution_labels exceed max_cost_attribution_labels_per_user": { + cfg: ` +cost_attribution_labels: label1, label2, label3, +max_cost_attribution_labels_per_user: 2`, + expectedErr: errCostAttributionLabelsLimitExceeded.Error(), + }, } for testName, testData := range tests { From f04c28f6eb68c74044618dd98d96a97c90e2b009 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 17 Dec 2024 21:58:35 +0100 Subject: [PATCH 002/105] refectory --- .../config/mimir.yaml | 9 +-- .../configuration-parameters/index.md | 39 ++++++++++++ pkg/costattribution/manager.go | 17 +++-- pkg/costattribution/manager_test.go | 21 ++++--- pkg/costattribution/tracker.go | 46 +++++++++----- pkg/costattribution/tracker_test.go | 13 +++- pkg/distributor/distributor.go | 47 +++++++------- pkg/distributor/validate.go | 1 + pkg/ingester/activeseries/active_series.go | 63 +++++++++---------- pkg/ingester/ingester.go | 30 ++++----- pkg/ingester/user_tsdb.go | 7 ++- 11 files changed, 178 insertions(+), 115 deletions(-) diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 31702611891..5d245999115 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,6 +1,4 @@ multitenancy_enabled: false -cost_attribution_registry_path: "/usage-metrics" -cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -186,10 +184,5 @@ limits: 
ha_replica_label: ha_replica ha_max_clusters: 10 - cost_attribution_labels: "container" - max_cost_attribution_labels_per_user: 2 - max_cost_attribution_cardinality_per_user: 100 - cost_attribution_cooldown: 20m - runtime_config: - file: ./config/runtime.yaml \ No newline at end of file + file: ./config/runtime.yaml diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index f9a147e7bca..137e88d3fc1 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -455,6 +455,18 @@ overrides_exporter: # (experimental) Enables optimized marshaling of timeseries. # CLI flag: -timeseries-unmarshal-caching-optimization-enabled [timeseries_unmarshal_caching_optimization_enabled: | default = true] + +# (experimental) Time interval at which inactive cost attributions are evicted +# from the counter, ensuring they are not included in the cost attribution +# cardinality per user limit. +# CLI flag: -cost-attribution.eviction-interval +[cost_attribution_eviction_interval: | default = 20m] + +# (experimental) Defines a custom path for the registry. When specified, Mimir +# will expose cost attribution metrics through this custom path, if not +# specified, cost attribution metrics won't be exposed. +# CLI flag: -cost-attribution.registry-path +[cost_attribution_registry_path: | default = ""] ``` ### common @@ -3569,6 +3581,33 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -querier.active-series-results-max-size-bytes [active_series_results_max_size_bytes: | default = 419430400] +# (experimental) Defines labels for cost attribution, applied to metrics like +# cortex_distributor_attributed_received_samples_total. Set to an empty string +# to disable. 
Example: 'team,service' will produce metrics such as +# cortex_distributor_attributed_received_samples_total{team='frontend', +# service='api'}. +# CLI flag: -validation.cost-attribution-labels +[cost_attribution_labels: | default = ""] + +# (experimental) Maximum number of cost attribution labels allowed per user. +# CLI flag: -validation.max-cost-attribution-labels-per-user +[max_cost_attribution_labels_per_user: | default = 2] + +# (experimental) Maximum cardinality of cost attribution labels allowed per +# user. +# CLI flag: -validation.max-cost-attribution-cardinality-per-user +[max_cost_attribution_cardinality_per_user: | default = 10000] + +# (experimental) Cooldown period for cost attribution labels. Specifies the +# duration the cost attribution remains in overflow before attempting a reset. +# If the cardinality remains above the limit after this period, the system will +# stay in overflow mode and extend the cooldown. Setting this value to 0 +# disables the cooldown, causing the system to continuously check whether the +# cardinality has dropped below the limit. A reset will occur once the +# cardinality falls below the limit. +# CLI flag: -validation.cost-attribution-cooldown +[cost_attribution_cooldown: | default = 0s] + # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. 
# CLI flag: -ruler.evaluation-delay-duration diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 0c60ed54505..b39da15689b 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -157,16 +157,15 @@ func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []s m.trackersByUserID[userID] = cat m.mtx.Unlock() return nil - } else { - maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - if cat.MaxCardinality() != maxCardinality { - cat.UpdateMaxCardinality(maxCardinality) - } + } + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + if cat.MaxCardinality() != maxCardinality { + cat.UpdateMaxCardinality(maxCardinality) + } - cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) - if cooldown != cat.CooldownDuration() { - cat.UpdateCooldownDuration(cooldown) - } + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) + if cooldown != cat.CooldownDuration() { + cat.UpdateCooldownDuration(cooldown) } return cat.InactiveObservations(deadline) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 1e67704b287..4fbf42fe35a 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -93,7 +93,8 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Purge inactive attributions", func(t *testing.T) { - manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + err := manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + assert.NoError(t, err) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
# TYPE cortex_discarded_attributed_samples_total counter @@ -103,8 +104,10 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Disabling user cost attribution", func(t *testing.T) { - manager.limits, _ = getMockLimits(1) - manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + var err error + manager.limits, err = getMockLimits(1) + assert.NoError(t, err) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) expectedMetrics := ` @@ -116,8 +119,10 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Updating user cardinality and labels", func(t *testing.T) { - manager.limits, _ = getMockLimits(2) - manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix()) + var err error + manager.limits, err = getMockLimits(2) + assert.NoError(t, err) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) assert.True(t, manager.TrackerForUser("user3").CompareCALabels([]string{"feature", "team"})) @@ -151,7 +156,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) t.Run("Purge before inactive timeout", func(t *testing.T) { - manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix()) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix())) assert.Equal(t, 2, len(manager.trackersByUserID)) expectedMetrics := ` @@ -166,7 +171,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge after inactive timeout", func(t *testing.T) { // disable cost attribution for user1 to test purging manager.limits, _ = getMockLimits(1) - manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix()) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix())) // User3's tracker should 
remain since it's active, user1's tracker should be removed assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") @@ -182,7 +187,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge all trackers", func(t *testing.T) { // Trigger a purge that should remove all inactive trackers - manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix()) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix())) // Tracker would stay at 1 since user1's tracker is disabled assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 0a232195848..8f245c1a6e1 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -42,6 +42,7 @@ type Tracker struct { activeSeriesPerUserAttribution *prometheus.Desc receivedSamplesAttribution *prometheus.Desc discardedSampleAttribution *prometheus.Desc + failedActiveSeriesDecrement *prometheus.Desc overflowLabels []string obseveredMtx sync.RWMutex observed map[string]*Observation @@ -49,6 +50,7 @@ type Tracker struct { state TrackerState overflowCounter *Observation cooldownUntil *atomic.Int64 + totalFailedActiveSeries *atomic.Float64 cooldownDuration int64 logger log.Logger } @@ -70,15 +72,16 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
overflowLabels[len(trackedLabels)+1] = overflowValue tracker := &Tracker{ - userID: userID, - caLabels: trackedLabels, - caLabelMap: caLabelMap, - maxCardinality: limit, - observed: make(map[string]*Observation), - hashBuffer: make([]byte, 0, 1024), - cooldownDuration: int64(cooldown.Seconds()), - logger: logger, - overflowLabels: overflowLabels, + userID: userID, + caLabels: trackedLabels, + caLabelMap: caLabelMap, + maxCardinality: limit, + observed: make(map[string]*Observation), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64(cooldown.Seconds()), + logger: logger, + overflowLabels: overflowLabels, + totalFailedActiveSeries: atomic.NewFloat64(0), } tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", @@ -94,7 +97,9 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", "The total number of active series per user and attribution.", append(trackedLabels, TenantLabel), prometheus.Labels{TrackerLabel: defaultTrackerName}) - + tracker.failedActiveSeriesDecrement = prometheus.NewDesc("cortex_ingester_attributed_active_series_failure", + "The total number of failed active series decrement per user and tracker.", []string{TenantLabel}, + prometheus.Labels{TrackerLabel: defaultTrackerName}) return tracker } @@ -149,11 +154,11 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil) } -func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), -1, 0, 0, nil) + t.updateCounters(lbs, -1, -1, 0, 0, nil) } func (t *Tracker) Collect(out chan<- prometheus.Metric) { @@ -182,6 +187,9 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { o.discardSamplemtx.Unlock() } } + if 
t.totalFailedActiveSeries.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.failedActiveSeriesDecrement, prometheus.CounterValue, t.totalFailedActiveSeries.Load(), t.userID) + } } func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { @@ -198,6 +206,13 @@ func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now t.updateCounters(lbs, now.Unix(), 0, value, 0, nil) } +func (t *Tracker) IncrementActiveSeriesFailure(value float64) { + if t == nil { + return + } + t.totalFailedActiveSeries.Add(value) +} + func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { labelValues := make([]string, len(t.caLabels)) lbls.Range(func(l labels.Label) { @@ -248,8 +263,11 @@ func (t *Tracker) handleObservation(stream string, ts int64, activeSeriesIncreme o.discardSamplemtx.Unlock() } } else if len(t.observed) < t.maxCardinality*2 { - // Create a new observation for the stream - t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + // If the ts is negative, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call + // Otherwise create a new observation for the stream + if ts >= 0 { + t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + } } } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 82de4e8b64c..bc08b5ccb27 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -35,11 +35,11 @@ func Test_CreateCleanupTracker(t *testing.T) { cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", 
"2"), time.Unix(2, 0)) - cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3"), time.Unix(3, 0)) + cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) - cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + cat.IncrementActiveSeriesFailure(1) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. @@ -49,6 +49,9 @@ func Test_CreateCleanupTracker(t *testing.T) { # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_ingester_attributed_active_series_failure The total number of failed active series decrement per user and tracker. + # TYPE cortex_ingester_attributed_active_series_failure counter + cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 @@ -58,15 +61,19 @@ func Test_CreateCleanupTracker(t *testing.T) { "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total", "cortex_ingester_attributed_active_series", + "cortex_ingester_attributed_active_series_failure", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) assert.Equal(t, []string{"foo"}, cat.InactiveObservations(5)) - tManager.purgeInactiveAttributionsUntil(5) + assert.NoError(t, tManager.purgeInactiveAttributionsUntil(5)) expectedMetrics = ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_ingester_attributed_active_series_failure The total number of failed active series decrement per user and tracker. 
+ # TYPE cortex_ingester_attributed_active_series_failure counter + cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 ` assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) tManager.deleteUserTracker("user4") diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 3594123435d..a14bf4b52f4 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -34,6 +34,19 @@ import ( "github.com/grafana/dskit/services" "github.com/grafana/dskit/tenant" "github.com/grafana/dskit/user" + "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go/ext" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "github.com/prometheus/prometheus/scrape" + "go.uber.org/atomic" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/cardinality" "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" @@ -48,18 +61,6 @@ import ( "github.com/grafana/mimir/pkg/util/pool" "github.com/grafana/mimir/pkg/util/spanlogger" "github.com/grafana/mimir/pkg/util/validation" - "github.com/opentracing/opentracing-go" - "github.com/opentracing/opentracing-go/ext" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/model/relabel" - "github.com/prometheus/prometheus/scrape" - "go.uber.org/atomic" - "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" ) func init() { @@ -745,15 +746,14 @@ func (d *Distributor) checkSample(ctx 
context.Context, userID, cluster, replica // May alter timeseries data in-place. // The returned error may retain the series labels. -func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { +func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Samples) == 0 { return nil } cat := d.costAttributionMgr.TrackerForUser(userID) - if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) + return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -767,7 +767,7 @@ func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimes } timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, s, cat); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } @@ -787,14 +787,14 @@ func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimes // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. 
-func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { +func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Histograms) == 0 { return nil } cat := d.costAttributionMgr.TrackerForUser(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { return err } @@ -807,7 +807,6 @@ func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTi timestamps := make(map[int64]struct{}, min(len(ts.Histograms), 100)) currPos := 0 histogramsUpdated := false - for idx := range ts.Histograms { if _, ok := timestamps[ts.Histograms[idx].Timestamp]; ok { // A sample with the same timestamp has already been validated, so we skip it. 
@@ -816,7 +815,7 @@ func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTi } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) if err != nil { return err } @@ -884,6 +883,7 @@ func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeser if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } + now := model.TimeFromUnixNano(nowt.UnixNano()) totalSamplesAndHistograms := len(ts.Samples) + len(ts.Histograms) @@ -973,8 +973,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - tnow := time.Now() - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), tnow) + now := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), now) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -988,7 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, tnow) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1829,7 +1829,6 @@ func 
tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int - for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 8b9849ba730..5b6775cdf9f 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -443,6 +443,7 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabelValue, ts) m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, strings.ToValidUTF8(l.Value, ""), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index e7895404a22..6c06a62e162 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -49,9 +49,10 @@ type ActiveSeries struct { // configMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat configMutex sync.RWMutex matchers *asmodel.Matchers - cat *costattribution.Tracker lastConfigUpdate time.Time + cat *costattribution.Tracker + // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. 
timeout time.Duration @@ -67,7 +68,6 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -76,6 +76,8 @@ type seriesStripe struct { activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + + cat *costattribution.Tracker } // seriesEntry holds a timestamp for single series. @@ -87,14 +89,8 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries( - asm *asmodel.Matchers, - timeout time.Duration, - cat *costattribution.Tracker, -) *ActiveSeries { - c := &ActiveSeries{ - matchers: asm, timeout: timeout, cat: cat, - } +func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.Tracker) *ActiveSeries { + c := &ActiveSeries{matchers: asm, timeout: timeout, cat: cat} // Stripes are pre-allocated so that we only read on them and no lock is required. 
for i := 0; i < numStripes; i++ { @@ -112,8 +108,7 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { currentCTC, currentCAT := c.CurrentConfig() - // TODO: I think here to check the pointer is not equal is already enough, if we recreate tracker, it is for a good reason, otherwise, nothing changed - return ctCfg.String() != currentCTC.String() || caCfg != currentCAT //|| !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) + return ctCfg.String() != currentCTC.String() || caCfg != currentCAT } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { @@ -137,6 +132,7 @@ func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattri // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes + created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { @@ -408,8 +404,6 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } - // here if we have a cost attribution label, we can split the serie count based on the value of the label - // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true @@ -432,11 +426,7 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. 
-func (s *seriesStripe) reinitialize( - asm *asmodel.Matchers, - deleted *deletedSeries, - cat *costattribution.Tracker, -) { +func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.Tracker) { s.mu.Lock() defer s.mu.Unlock() s.deleted = deleted @@ -474,17 +464,20 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { + // cost attribution is enabled, if it's not nil, we need to decrement the active series count, otherwise means received error when get idx, + // we need to increment the active series failure count. + if s.cat != nil { + if idx == nil { + s.cat.IncrementActiveSeriesFailure(1) + } else if err := idx.Series(ref, &buf, nil); err != nil { + s.cat.IncrementActiveSeriesFailure(1) + } else { + s.cat.DecrementActiveSeries(buf.Labels()) + } + } if entry.deleted { s.deleted.purge(ref) } - - if idx != nil { - if err := idx.Series(ref, &buf, nil); err != nil { - //TODO: think about what to do here - _ = err - } - s.cat.DecrementActiveSeries(buf.Labels(), keepUntil) - } delete(s.refs, ref) continue } @@ -532,13 +525,17 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { } s.active-- - if idx != nil { - buf := labels.NewScratchBuilder(10) - if err := idx.Series(ref, &buf, nil); err != nil { - //TODO: think about what to do here - _ = err + if s.cat != nil { + if idx == nil { + s.cat.IncrementActiveSeriesFailure(1) + } else { + buf := labels.NewScratchBuilder(128) + if err := idx.Series(ref, &buf, nil); err != nil { + s.cat.IncrementActiveSeriesFailure(1) + } else { + s.cat.DecrementActiveSeries(buf.Labels()) + } } - s.cat.DecrementActiveSeries(buf.Labels(), time.Now()) } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 2b3561a3530..763ce527c5c 100644 --- a/pkg/ingester/ingester.go +++ 
b/pkg/ingester/ingester.go @@ -371,9 +371,8 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), - + tsdbs: make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), @@ -793,7 +792,12 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - idx, _ := userDB.Head().Index() + // If the userDB idx is unavailable, pass nil pointer to Purge methode, and record it as a failure in metrics when decrementing active series. + idx, err := userDB.Head().Index() + if err != nil { + level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) + idx = nil + } valid := userDB.activeSeries.Purge(now, idx) if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. @@ -1167,6 +1171,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Note that we don't .Finish() the span in this method on purpose spanlog := spanlogger.FromContext(ctx, i.logger) spanlog.DebugLog("event", "acquired append lock") + var ( startAppend = time.Now() @@ -1411,8 +1416,10 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var nonCopiedLabels labels.Labels // idx is used to decrease active series count in case of error for cost attribution. 
- idx, _ := i.getTSDB(userID).Head().Index() - // TODO: deal with the error here + idx, err := i.getTSDB(userID).Head().Index() + if err != nil { + idx = nil + } for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request @@ -1429,7 +1436,6 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre allOutOfBoundsHistograms(ts.Histograms, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) - stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 @@ -2666,12 +2672,8 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD } userDB := &userTSDB{ - userID: userID, - activeSeries: activeseries.NewActiveSeries( - asmodel.NewMatchers(matchersConfig), - i.cfg.ActiveSeriesMetrics.IdleTimeout, - i.costAttributionMgr.TrackerForUser(userID), - ), + userID: userID, + activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.TrackerForUser(userID)), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3274,7 +3276,7 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now idx, err := db.Head().Index() if err != nil { level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) - continue + idx = nil } db.activeSeries.Purge(now, idx) diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 2f31f41892e..61c1aa244ee 100644 --- a/pkg/ingester/user_tsdb.go +++ 
b/pkg/ingester/user_tsdb.go @@ -619,8 +619,11 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 - idx, _ := u.Head().Index() - // TODO: deal with the err here + idx, err := u.Head().Index() + if err != nil { + idx = nil + } + u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) { From 2c422d1538374c698660cc1f63713ff05652dd74 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 14:09:31 +0100 Subject: [PATCH 003/105] add experimental features in about-versioning.md --- docs/sources/mimir/configure/about-versioning.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 3eb5a0de095..4ffe5f05536 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -45,7 +45,17 @@ We do not guarantee backwards compatibility for experimental features and flags. Experimental configuration and flags are subject to change. The following features are currently experimental: - +- Cost attribution + - Configure labels for cost attribution + - `-validation.cost-attribution-labels` + - Configure cost attribution limits, such as label cardinality and the maximum number of cost attribution labels + - `-validation.max-cost-attribution-labels-per-user` + - `-validation.max-cost-attribution-cardinality-per-user` + - Configure cooldown periods and eviction intervals for cost attribution + - `-validation.cost-attribution-cooldown` + - `-cost-attribution.eviction-interval` + - Configure the metrics endpoint dedicated to cost attribution + - `-cost-attribution.registry-path` - Alertmanager - Enable a set of experimental API endpoints to help support the migration of the Grafana Alertmanager to the Mimir Alertmanager. 
- `-alertmanager.grafana-alertmanager-compatibility-enabled` From d2eab6b54e76aa986689d8ab2a83b31e15201cea Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 14:18:14 +0100 Subject: [PATCH 004/105] change const variable to private --- pkg/costattribution/manager.go | 4 ++-- pkg/costattribution/tracker.go | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index b39da15689b..7fe9f494b82 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -16,8 +16,8 @@ import ( ) const ( - TrackerLabel = "tracker" - TenantLabel = "tenant" + trackerLabel = "tracker" + tenantLabel = "tenant" defaultTrackerName = "cost-attribution" missingValue = "__missing__" overflowValue = "__overflow__" diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 8f245c1a6e1..b17389b6df8 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -86,20 +86,20 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", "The total number of samples that were discarded per attribution.", - append(trackedLabels, TenantLabel, "reason"), - prometheus.Labels{TrackerLabel: defaultTrackerName}) + append(trackedLabels, tenantLabel, "reason"), + prometheus.Labels{trackerLabel: defaultTrackerName}) tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", "The total number of samples that were received per attribution.", - append(trackedLabels, TenantLabel), - prometheus.Labels{TrackerLabel: defaultTrackerName}) + append(trackedLabels, tenantLabel), + prometheus.Labels{trackerLabel: defaultTrackerName}) tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", - "The total number of active series per user and attribution.", append(trackedLabels, TenantLabel), - prometheus.Labels{TrackerLabel: defaultTrackerName}) + "The total number of active series per user and attribution.", append(trackedLabels, tenantLabel), + prometheus.Labels{trackerLabel: defaultTrackerName}) tracker.failedActiveSeriesDecrement = prometheus.NewDesc("cortex_ingester_attributed_active_series_failure", - "The total number of failed active series decrement per user and tracker.", []string{TenantLabel}, - prometheus.Labels{TrackerLabel: defaultTrackerName}) + "The total number of failed active series decrement per user and tracker.", []string{tenantLabel}, + prometheus.Labels{trackerLabel: defaultTrackerName}) return tracker } From 1f392821a3e24ac01813682d28c44c71c62e7814 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 14:30:41 +0100 Subject: [PATCH 005/105] make timer service --- .../mimir/configure/about-versioning.md | 1 + pkg/costattribution/manager.go | 18 +++--------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/docs/sources/mimir/configure/about-versioning.md 
b/docs/sources/mimir/configure/about-versioning.md index 4ffe5f05536..05751699ead 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -45,6 +45,7 @@ We do not guarantee backwards compatibility for experimental features and flags. Experimental configuration and flags are subject to change. The following features are currently experimental: + - Cost attribution - Configure labels for cost attribution - `-validation.cost-attribution-labels` diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 7fe9f494b82..54ac7325a38 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -48,27 +48,15 @@ func NewManager(cleanupInterval, exportInterval, inactiveTimeout time.Duration, metricsExportInterval: exportInterval, } - m.Service = services.NewBasicService(nil, m.running, nil).WithName("cost attribution manager") + m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager") if err := reg.Register(m); err != nil { return nil, err } return m, nil } -func (m *Manager) running(ctx context.Context) error { - t := time.NewTicker(m.cleanupInterval) - defer t.Stop() - - for { - select { - case <-t.C: - if err := m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix()); err != nil { - return err - } - case <-ctx.Done(): - return nil - } - } +func (m *Manager) iteration(_ context.Context) error { + return m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix()) } func (m *Manager) EnabledForUser(userID string) bool { From 9b4337d14400f3af48fcfa0c0620867213b4c00f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 14:32:19 +0100 Subject: [PATCH 006/105] rename TrackerForUser to Tracker --- pkg/costattribution/manager.go | 6 ++--- pkg/costattribution/manager_test.go | 30 ++++++++++++------------- pkg/costattribution/tracker_test.go | 14 ++++++------ 
pkg/distributor/distributor.go | 12 +++++----- pkg/ingester/ingester.go | 34 ++++++++++++++--------------- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 54ac7325a38..a6345b7dd0c 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -66,7 +66,7 @@ func (m *Manager) EnabledForUser(userID string) bool { return len(m.limits.CostAttributionLabels(userID)) > 0 } -func (m *Manager) TrackerForUser(userID string) *Tracker { +func (m *Manager) Tracker(userID string) *Tracker { if !m.EnabledForUser(userID) { return nil } @@ -115,7 +115,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { } invalidKeys := m.inactiveObservationsForUser(userID, deadline) - cat := m.TrackerForUser(userID) + cat := m.Tracker(userID) for _, key := range invalidKeys { cat.cleanupTrackerAttribution(key) } @@ -133,7 +133,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { } func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []string { - cat := m.TrackerForUser(userID) + cat := m.Tracker(userID) newTrackedLabels := m.limits.CostAttributionLabels(userID) sort.Slice(newTrackedLabels, func(i, j int) bool { return newTrackedLabels[i] < newTrackedLabels[j] diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 4fbf42fe35a..c07887bcdc2 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -62,23 +62,23 @@ func Test_CreateDeleteTracker(t *testing.T) { manager := newTestManager() t.Run("Tracker existence and attributes", func(t *testing.T) { - user1Tracker := manager.TrackerForUser("user1") + user1Tracker := manager.Tracker("user1") assert.NotNil(t, user1Tracker) assert.True(t, user1Tracker.CompareCALabels([]string{"team"})) assert.Equal(t, 5, user1Tracker.MaxCardinality()) - assert.Nil(t, manager.TrackerForUser("user2")) + assert.Nil(t, 
manager.Tracker("user2")) - user3Tracker := manager.TrackerForUser("user3") + user3Tracker := manager.Tracker("user3") assert.NotNil(t, user3Tracker) assert.True(t, user3Tracker.CompareCALabels([]string{"department", "service"})) assert.Equal(t, 2, user3Tracker.MaxCardinality()) }) t.Run("Metrics tracking", func(t *testing.T) { - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "bar"), 1, "invalid-metrics-name", time.Unix(6, 0)) - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(12, 0)) - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) + manager.Tracker("user1").IncrementDiscardedSamples(labels.FromStrings("team", "bar"), 1, "invalid-metrics-name", time.Unix(6, 0)) + manager.Tracker("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
@@ -124,9 +124,9 @@ func Test_CreateDeleteTracker(t *testing.T) { assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) - assert.True(t, manager.TrackerForUser("user3").CompareCALabels([]string{"feature", "team"})) + assert.True(t, manager.Tracker("user3").CompareCALabels([]string{"feature", "team"})) - manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) + manager.Tracker("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter @@ -136,9 +136,9 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) - manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) + manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) + manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) + manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter @@ -151,9 +151,9 @@ func Test_CreateDeleteTracker(t *testing.T) { func Test_PurgeInactiveAttributionsUntil(t *testing.T) { manager := newTestManager() - manager.TrackerForUser("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) - manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) - manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) + manager.Tracker("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) + manager.Tracker("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.Tracker("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) t.Run("Purge before inactive timeout", func(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix())) @@ -175,7 +175,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { // User3's tracker should remain since it's active, user1's tracker should be removed assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") - assert.Nil(t, manager.TrackerForUser("user1"), "Expected user1 tracker to be purged") + assert.Nil(t, manager.Tracker("user1"), "Expected user1 tracker to be purged") expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index bc08b5ccb27..9947a917fe6 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -16,18 +16,18 @@ import ( ) func Test_GetCALabels(t *testing.T) { - cat := newTestManager().TrackerForUser("user1") + cat := newTestManager().Tracker("user1") assert.True(t, cat.CompareCALabels([]string{"team"}), "Expected cost attribution labels mismatch") } func Test_GetMaxCardinality(t *testing.T) { - cat := newTestManager().TrackerForUser("user1") + cat := newTestManager().Tracker("user1") assert.Equal(t, 5, cat.MaxCardinality(), "Expected max cardinality mismatch") } func Test_CreateCleanupTracker(t *testing.T) { tManager := newTestManager() - cat := tManager.TrackerForUser("user4") + cat := tManager.Tracker("user4") reg := prometheus.NewRegistry() err := reg.Register(tManager) @@ -81,7 +81,7 @@ func Test_CreateCleanupTracker(t *testing.T) { } func Test_UpdateCounters(t *testing.T) { - cat := newTestManager().TrackerForUser("user3") + cat := newTestManager().Tracker("user3") lbls1 := labels.FromStrings("department", "foo", "service", "bar") lbls2 := labels.FromStrings("department", "bar", "service", "baz") lbls3 := labels.FromStrings("department", "baz", "service", "foo") @@ -103,7 +103,7 @@ func Test_UpdateCounters(t *testing.T) { func Test_GetInactiveObservations(t *testing.T) { // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. - cat := newTestManager().TrackerForUser("user1") + cat := newTestManager().Tracker("user1") // Create two observations with different last update timestamps. 
observations := []labels.Labels{ @@ -136,14 +136,14 @@ func Test_GetInactiveObservations(t *testing.T) { func Test_UpdateMaxCardinality(t *testing.T) { // user1 original max cardinality is 5 - cat := newTestManager().TrackerForUser("user1") + cat := newTestManager().Tracker("user1") cat.UpdateMaxCardinality(2) assert.Equal(t, 2, cat.MaxCardinality(), "Expected max cardinality update to 2") } func Test_Concurrency(t *testing.T) { m := newTestManager() - cat := m.TrackerForUser("user1") + cat := m.Tracker("user1") var wg sync.WaitGroup for i := 0; i < 100; i++ { diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index a14bf4b52f4..c6aea2a7dd7 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -751,7 +751,7 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese return nil } - cat := d.costAttributionMgr.TrackerForUser(userID) + cat := d.costAttributionMgr.Tracker(userID) if len(ts.Samples) == 1 { return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } @@ -792,7 +792,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim return nil } - cat := d.costAttributionMgr.TrackerForUser(userID) + cat := d.costAttributionMgr.Tracker(userID) if len(ts.Histograms) == 1 { updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { @@ -879,7 +879,7 @@ func (d *Distributor) validateExemplars(ts *mimirpb.PreallocTimeseries, userID s // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. 
func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) (bool, error) { - cat := d.costAttributionMgr.TrackerForUser(userID) + cat := d.costAttributionMgr.Tracker(userID) if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } @@ -988,7 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) + d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1247,7 +1247,7 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { if len(req.Timeseries) > 0 { - d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) @@ -1832,7 +1832,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st for _, 
ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - d.costAttributionMgr.TrackerForUser(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) + d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 763ce527c5c..de6f02a53af 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -787,7 +787,7 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - newCostAttributionTracker := i.costAttributionMgr.TrackerForUser(userID) + newCostAttributionTracker := i.costAttributionMgr.Tracker(userID) if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } @@ -1202,56 +1202,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, 
reasonSampleOutOfOrder, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return 
newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1259,35 +1259,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { 
stats.invalidNativeHistogramCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - 
i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1437,7 +1437,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1457,7 +1457,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries 
[]mimirpb.Pre len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -2673,7 +2673,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD userDB := &userTSDB{ userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.TrackerForUser(userID)), + activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.Tracker(userID)), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), From 1a523e17ab478102992f5b68deea52fad81a28c9 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 14:51:28 +0100 Subject: [PATCH 007/105] use fine locking --- pkg/costattribution/manager.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index a6345b7dd0c..ac1bdfd2542 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -71,14 +71,20 @@ func (m *Manager) Tracker(userID string) *Tracker { return nil } + // Check if the tracker already exists, if exists return it. 
Otherwise lock and create a new tracker. + m.mtx.RLock() + tracker, exists := m.trackersByUserID[userID] + m.mtx.RUnlock() + if exists { + return tracker + } + m.mtx.Lock() defer m.mtx.Unlock() - - if tracker, exists := m.trackersByUserID[userID]; exists { + if tracker, exists = m.trackersByUserID[userID]; exists { return tracker } - - tracker := newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + tracker = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = tracker return tracker } From f10f7872610c468ffcf2130a9a7f7c4834e6f529 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 14:56:30 +0100 Subject: [PATCH 008/105] add comments explain why we use unchecked collector --- pkg/costattribution/manager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index ac1bdfd2542..d50e9b5feb9 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -98,6 +98,8 @@ func (m *Manager) Collect(out chan<- prometheus.Metric) { } func (m *Manager) Describe(chan<- *prometheus.Desc) { + // Describe is not implemented because the metrics include dynamic labels. The Manager functions as an unchecked exporter. 
+ // For more details, refer to the documentation: https://pkg.go.dev/github.com/prometheus/client_golang/prometheus#hdr-Custom_Collectors_and_constant_Metrics } func (m *Manager) deleteUserTracker(userID string) { From cc0e93985a6bac4e3924febaf185c54b17f41b2b Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 15:06:56 +0100 Subject: [PATCH 009/105] rename deleteUserTracker to deleteTracker --- pkg/costattribution/manager.go | 6 +++--- pkg/costattribution/tracker_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index d50e9b5feb9..54c151da7e5 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -102,7 +102,7 @@ func (m *Manager) Describe(chan<- *prometheus.Desc) { // For more details, refer to the documentation: https://pkg.go.dev/github.com/prometheus/client_golang/prometheus#hdr-Custom_Collectors_and_constant_Metrics } -func (m *Manager) deleteUserTracker(userID string) { +func (m *Manager) deleteTracker(userID string) { m.mtx.Lock() defer m.mtx.Unlock() delete(m.trackersByUserID, userID) @@ -118,7 +118,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { for _, userID := range userIDs { if !m.EnabledForUser(userID) { - m.deleteUserTracker(userID) + m.deleteTracker(userID) continue } @@ -131,7 +131,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { if cat != nil && cat.cooldownUntil != nil && cat.cooldownUntil.Load() < deadline { if len(cat.observed) <= cat.MaxCardinality() { cat.state = OverflowComplete - m.deleteUserTracker(userID) + m.deleteTracker(userID) } else { cat.cooldownUntil.Store(deadline + cat.cooldownDuration) } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 9947a917fe6..be051dccbf0 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -76,7 +76,7 @@ func 
Test_CreateCleanupTracker(t *testing.T) { cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 ` assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) - tManager.deleteUserTracker("user4") + tManager.deleteTracker("user4") assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) } From c020be0eaab34cb5e966c29dd7d376623153ea45 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 15:10:36 +0100 Subject: [PATCH 010/105] rename cat in cost attribution package to t or tracker --- pkg/costattribution/manager.go | 30 +++++------ pkg/costattribution/tracker_test.go | 80 ++++++++++++++--------------- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 54c151da7e5..bc4073fed0f 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -123,17 +123,17 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { } invalidKeys := m.inactiveObservationsForUser(userID, deadline) - cat := m.Tracker(userID) + t := m.Tracker(userID) for _, key := range invalidKeys { - cat.cleanupTrackerAttribution(key) + t.cleanupTrackerAttribution(key) } - if cat != nil && cat.cooldownUntil != nil && cat.cooldownUntil.Load() < deadline { - if len(cat.observed) <= cat.MaxCardinality() { - cat.state = OverflowComplete + if t != nil && t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { + if len(t.observed) <= t.MaxCardinality() { + t.state = OverflowComplete m.deleteTracker(userID) } else { - cat.cooldownUntil.Store(deadline + cat.cooldownDuration) + t.cooldownUntil.Store(deadline + t.cooldownDuration) } } } @@ -141,28 +141,28 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { } func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []string { - cat := m.Tracker(userID) + t := m.Tracker(userID) 
newTrackedLabels := m.limits.CostAttributionLabels(userID) sort.Slice(newTrackedLabels, func(i, j int) bool { return newTrackedLabels[i] < newTrackedLabels[j] }) - if !cat.CompareCALabels(newTrackedLabels) { + if !t.CompareCALabels(newTrackedLabels) { m.mtx.Lock() - cat = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) - m.trackersByUserID[userID] = cat + t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.trackersByUserID[userID] = t m.mtx.Unlock() return nil } maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - if cat.MaxCardinality() != maxCardinality { - cat.UpdateMaxCardinality(maxCardinality) + if t.MaxCardinality() != maxCardinality { + t.UpdateMaxCardinality(maxCardinality) } cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) - if cooldown != cat.CooldownDuration() { - cat.UpdateCooldownDuration(cooldown) + if cooldown != t.CooldownDuration() { + t.UpdateCooldownDuration(cooldown) } - return cat.InactiveObservations(deadline) + return t.InactiveObservations(deadline) } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index be051dccbf0..237e5c33d91 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -16,30 +16,30 @@ import ( ) func Test_GetCALabels(t *testing.T) { - cat := newTestManager().Tracker("user1") - assert.True(t, cat.CompareCALabels([]string{"team"}), "Expected cost attribution labels mismatch") + tracker := newTestManager().Tracker("user1") + assert.True(t, tracker.CompareCALabels([]string{"team"}), "Expected cost attribution labels mismatch") } func Test_GetMaxCardinality(t *testing.T) { - cat := newTestManager().Tracker("user1") - assert.Equal(t, 5, cat.MaxCardinality(), "Expected max cardinality mismatch") + tracker := 
newTestManager().Tracker("user1") + assert.Equal(t, 5, tracker.MaxCardinality(), "Expected max cardinality mismatch") } func Test_CreateCleanupTracker(t *testing.T) { tManager := newTestManager() - cat := tManager.Tracker("user4") + tracker := tManager.Tracker("user4") reg := prometheus.NewRegistry() err := reg.Register(tManager) require.NoError(t, err) - cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) - cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) - cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) - cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) - cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) - cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) - cat.IncrementActiveSeriesFailure(1) + tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) + tracker.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) + tracker.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) + tracker.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) + tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + tracker.IncrementActiveSeriesFailure(1) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were 
discarded per attribution. @@ -64,7 +64,7 @@ func Test_CreateCleanupTracker(t *testing.T) { "cortex_ingester_attributed_active_series_failure", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) - assert.Equal(t, []string{"foo"}, cat.InactiveObservations(5)) + assert.Equal(t, []string{"foo"}, tracker.InactiveObservations(5)) assert.NoError(t, tManager.purgeInactiveAttributionsUntil(5)) expectedMetrics = ` @@ -81,29 +81,29 @@ func Test_CreateCleanupTracker(t *testing.T) { } func Test_UpdateCounters(t *testing.T) { - cat := newTestManager().Tracker("user3") + tracker := newTestManager().Tracker("user3") lbls1 := labels.FromStrings("department", "foo", "service", "bar") lbls2 := labels.FromStrings("department", "bar", "service", "baz") lbls3 := labels.FromStrings("department", "baz", "service", "foo") - cat.updateCounters(lbls1, 1, 1, 0, 0, nil) - assert.Equal(t, Normal, cat.state, "First observation, should not overflow") + tracker.updateCounters(lbls1, 1, 1, 0, 0, nil) + assert.Equal(t, Normal, tracker.state, "First observation, should not overflow") - cat.updateCounters(lbls2, 2, 1, 0, 0, nil) - assert.Equal(t, Normal, cat.state, "Second observation, should not overflow") + tracker.updateCounters(lbls2, 2, 1, 0, 0, nil) + assert.Equal(t, Normal, tracker.state, "Second observation, should not overflow") - cat.updateCounters(lbls3, 3, 1, 0, 0, nil) - assert.Equal(t, Overflow, cat.state, "Third observation, should overflow") + tracker.updateCounters(lbls3, 3, 1, 0, 0, nil) + assert.Equal(t, Overflow, tracker.state, "Third observation, should overflow") - cat.updateCounters(lbls3, 4, 1, 0, 0, nil) - assert.Equal(t, Overflow, cat.state, "Fourth observation, should stay overflow") + tracker.updateCounters(lbls3, 4, 1, 0, 0, nil) + assert.Equal(t, Overflow, tracker.state, "Fourth observation, should stay overflow") - assert.Equal(t, int64(3+cat.cooldownDuration), cat.cooldownUntil.Load(), "CooldownUntil should be 
updated correctly") + assert.Equal(t, int64(3+tracker.cooldownDuration), tracker.cooldownUntil.Load(), "CooldownUntil should be updated correctly") } func Test_GetInactiveObservations(t *testing.T) { // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. - cat := newTestManager().Tracker("user1") + tracker := newTestManager().Tracker("user1") // Create two observations with different last update timestamps. observations := []labels.Labels{ @@ -112,38 +112,38 @@ func Test_GetInactiveObservations(t *testing.T) { labels.FromStrings("team", "baz"), } // Simulate samples discarded with different timestamps. - cat.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) - cat.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) - cat.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0)) + tracker.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + tracker.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) + tracker.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0)) // Ensure that two observations were successfully added to the tracker. - require.Len(t, cat.observed, 3) + require.Len(t, tracker.observed, 3) // Purge observations that haven't been updated in the last 10 seconds. - purged := cat.InactiveObservations(0) + purged := tracker.InactiveObservations(0) require.Len(t, purged, 0) - purged = cat.InactiveObservations(10) + purged = tracker.InactiveObservations(10) assert.ElementsMatch(t, []string{"foo"}, purged) - purged = cat.InactiveObservations(15) + purged = tracker.InactiveObservations(15) assert.ElementsMatch(t, []string{"foo", "bar"}, purged) // Check that the purged observation matches the expected details. 
- purged = cat.InactiveObservations(25) + purged = tracker.InactiveObservations(25) assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } func Test_UpdateMaxCardinality(t *testing.T) { // user1 original max cardinality is 5 - cat := newTestManager().Tracker("user1") - cat.UpdateMaxCardinality(2) - assert.Equal(t, 2, cat.MaxCardinality(), "Expected max cardinality update to 2") + tracker := newTestManager().Tracker("user1") + tracker.UpdateMaxCardinality(2) + assert.Equal(t, 2, tracker.MaxCardinality(), "Expected max cardinality update to 2") } func Test_Concurrency(t *testing.T) { m := newTestManager() - cat := m.Tracker("user1") + tracker := m.Tracker("user1") var wg sync.WaitGroup for i := 0; i < 100; i++ { @@ -151,15 +151,15 @@ func Test_Concurrency(t *testing.T) { go func(i int) { defer wg.Done() lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) - cat.updateCounters(lbls, int64(i), 1, 0, 0, nil) + tracker.updateCounters(lbls, int64(i), 1, 0, 0, nil) }(i) } wg.Wait() // Verify no data races or inconsistencies - assert.True(t, len(cat.observed) > 0, "Observed set should not be empty after concurrent updates") - assert.LessOrEqual(t, len(cat.observed), 2*cat.MaxCardinality(), "Observed count should not exceed 2 times of max cardinality") - assert.Equal(t, Overflow, cat.state, "Expected state to be Overflow") + assert.True(t, len(tracker.observed) > 0, "Observed set should not be empty after concurrent updates") + assert.LessOrEqual(t, len(tracker.observed), 2*tracker.MaxCardinality(), "Observed count should not exceed 2 times of max cardinality") + assert.Equal(t, Overflow, tracker.state, "Expected state to be Overflow") expectedMetrics := ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
From 71e46669757b40c6c5e9281e4abb97d8a9d4da4b Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 15:20:18 +0100 Subject: [PATCH 011/105] avoid get tracker twice --- pkg/costattribution/manager.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index bc4073fed0f..c7d4a18d88f 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -122,8 +122,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { continue } - invalidKeys := m.inactiveObservationsForUser(userID, deadline) - t := m.Tracker(userID) + t, invalidKeys := m.inactiveObservationsForUser(userID, deadline) for _, key := range invalidKeys { t.cleanupTrackerAttribution(key) } @@ -140,7 +139,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { return nil } -func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []string { +func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) (*Tracker, []string) { t := m.Tracker(userID) newTrackedLabels := m.limits.CostAttributionLabels(userID) sort.Slice(newTrackedLabels, func(i, j int) bool { @@ -152,7 +151,7 @@ func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []s t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t m.mtx.Unlock() - return nil + return t, nil } maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) if t.MaxCardinality() != maxCardinality { @@ -164,5 +163,5 @@ func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []s t.UpdateCooldownDuration(cooldown) } - return t.InactiveObservations(deadline) + return t, t.InactiveObservations(deadline) } From 9dd101ba552d981cd57a7b4d0909194c2bd4a66e Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 
15:36:35 +0100 Subject: [PATCH 012/105] refactor inactiveObservationsForUser --- pkg/costattribution/manager.go | 67 ++++++++++++++++------------- pkg/costattribution/tracker.go | 6 +-- pkg/costattribution/tracker_test.go | 10 ++--- 3 files changed, 42 insertions(+), 41 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index c7d4a18d88f..eca92d6e48d 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -108,6 +108,38 @@ func (m *Manager) deleteTracker(userID string) { delete(m.trackersByUserID, userID) } +func (m *Manager) updateTracker(userID string) *Tracker { + if !m.EnabledForUser(userID) { + m.deleteTracker(userID) + return nil + } + + t := m.Tracker(userID) + newTrackedLabels := m.limits.CostAttributionLabels(userID) + sort.Slice(newTrackedLabels, func(i, j int) bool { + return newTrackedLabels[i] < newTrackedLabels[j] + }) + + if !t.CompareCALabels(newTrackedLabels) { + m.mtx.Lock() + t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.trackersByUserID[userID] = t + m.mtx.Unlock() + return t + } + + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + if t.MaxCardinality() != maxCardinality { + t.UpdateMaxCardinality(maxCardinality) + } + + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) + if cooldown != t.CooldownDuration() { + t.UpdateCooldownDuration(cooldown) + } + return t +} + func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { m.mtx.RLock() userIDs := make([]string, 0, len(m.trackersByUserID)) @@ -117,17 +149,17 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { m.mtx.RUnlock() for _, userID := range userIDs { - if !m.EnabledForUser(userID) { - m.deleteTracker(userID) + t := m.updateTracker(userID) + if t == nil { continue } - t, invalidKeys := m.inactiveObservationsForUser(userID, deadline) + 
invalidKeys := t.inactiveObservations(deadline) for _, key := range invalidKeys { t.cleanupTrackerAttribution(key) } - if t != nil && t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { + if t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { if len(t.observed) <= t.MaxCardinality() { t.state = OverflowComplete m.deleteTracker(userID) @@ -138,30 +170,3 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { } return nil } - -func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) (*Tracker, []string) { - t := m.Tracker(userID) - newTrackedLabels := m.limits.CostAttributionLabels(userID) - sort.Slice(newTrackedLabels, func(i, j int) bool { - return newTrackedLabels[i] < newTrackedLabels[j] - }) - - if !t.CompareCALabels(newTrackedLabels) { - m.mtx.Lock() - t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) - m.trackersByUserID[userID] = t - m.mtx.Unlock() - return t, nil - } - maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - if t.MaxCardinality() != maxCardinality { - t.UpdateMaxCardinality(maxCardinality) - } - - cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) - if cooldown != t.CooldownDuration() { - t.UpdateCooldownDuration(cooldown) - } - - return t, t.InactiveObservations(deadline) -} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index b17389b6df8..7e837215de0 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -330,11 +330,7 @@ func (t *Tracker) createNewObservation(stream string, ts int64, activeSeriesIncr } } -func (t *Tracker) InactiveObservations(deadline int64) []string { - if t == nil { - return nil - } - +func (t *Tracker) inactiveObservations(deadline int64) []string { // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []string 
t.obseveredMtx.RLock() diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 237e5c33d91..f0b6680ff78 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -64,7 +64,7 @@ func Test_CreateCleanupTracker(t *testing.T) { "cortex_ingester_attributed_active_series_failure", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) - assert.Equal(t, []string{"foo"}, tracker.InactiveObservations(5)) + assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(5)) assert.NoError(t, tManager.purgeInactiveAttributionsUntil(5)) expectedMetrics = ` @@ -120,17 +120,17 @@ func Test_GetInactiveObservations(t *testing.T) { require.Len(t, tracker.observed, 3) // Purge observations that haven't been updated in the last 10 seconds. - purged := tracker.InactiveObservations(0) + purged := tracker.inactiveObservations(0) require.Len(t, purged, 0) - purged = tracker.InactiveObservations(10) + purged = tracker.inactiveObservations(10) assert.ElementsMatch(t, []string{"foo"}, purged) - purged = tracker.InactiveObservations(15) + purged = tracker.inactiveObservations(15) assert.ElementsMatch(t, []string{"foo", "bar"}, purged) // Check that the purged observation matches the expected details. 
- purged = tracker.InactiveObservations(25) + purged = tracker.inactiveObservations(25) assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } From 7d4ea9acbb03ffd7e5372493508903adb3e65614 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 15:52:22 +0100 Subject: [PATCH 013/105] refactor shouldDelete function --- pkg/costattribution/manager.go | 9 ++------- pkg/costattribution/tracker.go | 12 +++++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index eca92d6e48d..6fd2f5b06e3 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -159,13 +159,8 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { t.cleanupTrackerAttribution(key) } - if t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { - if len(t.observed) <= t.MaxCardinality() { - t.state = OverflowComplete - m.deleteTracker(userID) - } else { - t.cooldownUntil.Store(deadline + t.cooldownDuration) - } + if t.shouldDelete(deadline) { + m.deleteTracker(userID) } } return nil diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 7e837215de0..05a186c151d 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -20,7 +20,6 @@ type TrackerState int const ( Normal TrackerState = iota Overflow - OverflowComplete ) const sep = rune(0x80) @@ -330,6 +329,17 @@ func (t *Tracker) createNewObservation(stream string, ts int64, activeSeriesIncr } } +func (t *Tracker) shouldDelete(deadline int64) bool { + if t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { + if len(t.observed) <= t.maxCardinality { + return true + } else { + t.cooldownUntil.Store(deadline + t.cooldownDuration) + } + } + return false +} + func (t *Tracker) inactiveObservations(deadline int64) []string { // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []string From 
67546662291c7bdff3db6f51b6dd91ec78b25bf7 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 15:56:17 +0100 Subject: [PATCH 014/105] rename calabels and calabelmap to labels and index --- pkg/costattribution/manager.go | 2 +- pkg/costattribution/manager_test.go | 6 +++--- pkg/costattribution/tracker.go | 22 +++++++++++----------- pkg/costattribution/tracker_test.go | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 6fd2f5b06e3..ecc6d3ae8f0 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -120,7 +120,7 @@ func (m *Manager) updateTracker(userID string) *Tracker { return newTrackedLabels[i] < newTrackedLabels[j] }) - if !t.CompareCALabels(newTrackedLabels) { + if !t.CompareLabels(newTrackedLabels) { m.mtx.Lock() t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index c07887bcdc2..e2da3bfdc59 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -64,14 +64,14 @@ func Test_CreateDeleteTracker(t *testing.T) { t.Run("Tracker existence and attributes", func(t *testing.T) { user1Tracker := manager.Tracker("user1") assert.NotNil(t, user1Tracker) - assert.True(t, user1Tracker.CompareCALabels([]string{"team"})) + assert.True(t, user1Tracker.CompareLabels([]string{"team"})) assert.Equal(t, 5, user1Tracker.MaxCardinality()) assert.Nil(t, manager.Tracker("user2")) user3Tracker := manager.Tracker("user3") assert.NotNil(t, user3Tracker) - assert.True(t, user3Tracker.CompareCALabels([]string{"department", "service"})) + assert.True(t, user3Tracker.CompareLabels([]string{"department", "service"})) assert.Equal(t, 2, user3Tracker.MaxCardinality()) }) @@ -124,7 +124,7 @@ func 
Test_CreateDeleteTracker(t *testing.T) { assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) - assert.True(t, manager.Tracker("user3").CompareCALabels([]string{"feature", "team"})) + assert.True(t, manager.Tracker("user3").CompareLabels([]string{"feature", "team"})) manager.Tracker("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) expectedMetrics := ` diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 05a186c151d..89755a4809a 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -35,8 +35,8 @@ type Observation struct { type Tracker struct { userID string - caLabels []string - caLabelMap map[string]int + labels []string + index map[string]int maxCardinality int activeSeriesPerUserAttribution *prometheus.Desc receivedSamplesAttribution *prometheus.Desc @@ -60,10 +60,10 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. }) // Create a map for fast lookup, and overflow labels to export when overflow happens - caLabelMap := make(map[string]int, len(trackedLabels)) + index := make(map[string]int, len(trackedLabels)) overflowLabels := make([]string, len(trackedLabels)+2) for i, label := range trackedLabels { - caLabelMap[label] = i + index[label] = i overflowLabels[i] = overflowValue } @@ -72,8 +72,8 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. tracker := &Tracker{ userID: userID, - caLabels: trackedLabels, - caLabelMap: caLabelMap, + labels: trackedLabels, + index: index, maxCardinality: limit, observed: make(map[string]*Observation), hashBuffer: make([]byte, 0, 1024), @@ -102,15 +102,15 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
return tracker } -func (t *Tracker) CompareCALabels(currentLabels []string) bool { +func (t *Tracker) CompareLabels(currentLabels []string) bool { if t == nil { return len(currentLabels) == 0 } - if len(t.caLabels) != len(currentLabels) { + if len(t.labels) != len(currentLabels) { return false } for _, v := range currentLabels { - if _, exists := t.caLabelMap[v]; !exists { + if _, exists := t.index[v]; !exists { return false } } @@ -213,9 +213,9 @@ func (t *Tracker) IncrementActiveSeriesFailure(value float64) { } func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - labelValues := make([]string, len(t.caLabels)) + labelValues := make([]string, len(t.labels)) lbls.Range(func(l labels.Label) { - if idx, ok := t.caLabelMap[l.Name]; ok { + if idx, ok := t.index[l.Name]; ok { labelValues[idx] = l.Value } }) diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index f0b6680ff78..96e1fad96c6 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -17,7 +17,7 @@ import ( func Test_GetCALabels(t *testing.T) { tracker := newTestManager().Tracker("user1") - assert.True(t, tracker.CompareCALabels([]string{"team"}), "Expected cost attribution labels mismatch") + assert.True(t, tracker.CompareLabels([]string{"team"}), "Expected cost attribution labels mismatch") } func Test_GetMaxCardinality(t *testing.T) { From fffc5b3544710c9510829bee0087e499990047f1 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:02:16 +0100 Subject: [PATCH 015/105] remove getter and setter of max cardinality and cooldown duration --- pkg/costattribution/manager.go | 8 ++++---- pkg/costattribution/manager_test.go | 4 ++-- pkg/costattribution/tracker.go | 28 ---------------------------- pkg/costattribution/tracker_test.go | 14 +------------- 4 files changed, 7 insertions(+), 47 deletions(-) diff --git 
a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index ecc6d3ae8f0..7ab8e27d9c8 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -129,13 +129,13 @@ func (m *Manager) updateTracker(userID string) *Tracker { } maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - if t.MaxCardinality() != maxCardinality { - t.UpdateMaxCardinality(maxCardinality) + if t.maxCardinality != maxCardinality { + t.maxCardinality = maxCardinality } cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) - if cooldown != t.CooldownDuration() { - t.UpdateCooldownDuration(cooldown) + if cooldown != t.cooldownDuration { + t.cooldownDuration = cooldown } return t } diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index e2da3bfdc59..35c1ae5e95f 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -65,14 +65,14 @@ func Test_CreateDeleteTracker(t *testing.T) { user1Tracker := manager.Tracker("user1") assert.NotNil(t, user1Tracker) assert.True(t, user1Tracker.CompareLabels([]string{"team"})) - assert.Equal(t, 5, user1Tracker.MaxCardinality()) + assert.Equal(t, 5, user1Tracker.maxCardinality) assert.Nil(t, manager.Tracker("user2")) user3Tracker := manager.Tracker("user3") assert.NotNil(t, user3Tracker) assert.True(t, user3Tracker.CompareLabels([]string{"department", "service"})) - assert.Equal(t, 2, user3Tracker.MaxCardinality()) + assert.Equal(t, 2, user3Tracker.maxCardinality) }) t.Run("Metrics tracking", func(t *testing.T) { diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 89755a4809a..fa8ce25e4f4 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -117,20 +117,6 @@ func (t *Tracker) CompareLabels(currentLabels []string) bool { return true } -func (t *Tracker) MaxCardinality() int { - if t == nil { - return 0 - } - return t.maxCardinality -} - -func (t *Tracker) 
CooldownDuration() int64 { - if t == nil { - return 0 - } - return t.cooldownDuration -} - var bufferPool = sync.Pool{ New: func() interface{} { return new(bytes.Buffer) @@ -353,17 +339,3 @@ func (t *Tracker) inactiveObservations(deadline int64) []string { return invalidKeys } - -func (t *Tracker) UpdateMaxCardinality(limit int) { - if t == nil { - return - } - t.maxCardinality = limit -} - -func (t *Tracker) UpdateCooldownDuration(cooldownDuration int64) { - if t == nil { - return - } - t.cooldownDuration = cooldownDuration -} diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 96e1fad96c6..2ff4a0c1e51 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -20,11 +20,6 @@ func Test_GetCALabels(t *testing.T) { assert.True(t, tracker.CompareLabels([]string{"team"}), "Expected cost attribution labels mismatch") } -func Test_GetMaxCardinality(t *testing.T) { - tracker := newTestManager().Tracker("user1") - assert.Equal(t, 5, tracker.MaxCardinality(), "Expected max cardinality mismatch") -} - func Test_CreateCleanupTracker(t *testing.T) { tManager := newTestManager() tracker := tManager.Tracker("user4") @@ -134,13 +129,6 @@ func Test_GetInactiveObservations(t *testing.T) { assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } -func Test_UpdateMaxCardinality(t *testing.T) { - // user1 original max cardinality is 5 - tracker := newTestManager().Tracker("user1") - tracker.UpdateMaxCardinality(2) - assert.Equal(t, 2, tracker.MaxCardinality(), "Expected max cardinality update to 2") -} - func Test_Concurrency(t *testing.T) { m := newTestManager() tracker := m.Tracker("user1") @@ -158,7 +146,7 @@ func Test_Concurrency(t *testing.T) { // Verify no data races or inconsistencies assert.True(t, len(tracker.observed) > 0, "Observed set should not be empty after concurrent updates") - assert.LessOrEqual(t, len(tracker.observed), 2*tracker.MaxCardinality(), "Observed count should not exceed 2 
times of max cardinality") + assert.LessOrEqual(t, len(tracker.observed), 2*tracker.maxCardinality, "Observed count should not exceed 2 times of max cardinality") assert.Equal(t, Overflow, tracker.state, "Expected state to be Overflow") expectedMetrics := ` From 2cf8c3e488ef2414031387fc3525dedeecb054ae Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:06:13 +0100 Subject: [PATCH 016/105] rename CompareLabels to hasSameLabels --- pkg/costattribution/manager.go | 2 +- pkg/costattribution/manager_test.go | 6 +++--- pkg/costattribution/tracker.go | 8 ++++---- pkg/costattribution/tracker_test.go | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 7ab8e27d9c8..3a185ae44ae 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -120,7 +120,7 @@ func (m *Manager) updateTracker(userID string) *Tracker { return newTrackedLabels[i] < newTrackedLabels[j] }) - if !t.CompareLabels(newTrackedLabels) { + if !t.hasSameLabels(newTrackedLabels) { m.mtx.Lock() t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 35c1ae5e95f..b9f4f7b7eba 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -64,14 +64,14 @@ func Test_CreateDeleteTracker(t *testing.T) { t.Run("Tracker existence and attributes", func(t *testing.T) { user1Tracker := manager.Tracker("user1") assert.NotNil(t, user1Tracker) - assert.True(t, user1Tracker.CompareLabels([]string{"team"})) + assert.True(t, user1Tracker.hasSameLabels([]string{"team"})) assert.Equal(t, 5, user1Tracker.maxCardinality) assert.Nil(t, manager.Tracker("user2")) user3Tracker := manager.Tracker("user3") assert.NotNil(t, user3Tracker) - assert.True(t, 
user3Tracker.CompareLabels([]string{"department", "service"})) + assert.True(t, user3Tracker.hasSameLabels([]string{"department", "service"})) assert.Equal(t, 2, user3Tracker.maxCardinality) }) @@ -124,7 +124,7 @@ func Test_CreateDeleteTracker(t *testing.T) { assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) - assert.True(t, manager.Tracker("user3").CompareLabels([]string{"feature", "team"})) + assert.True(t, manager.Tracker("user3").hasSameLabels([]string{"feature", "team"})) manager.Tracker("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) expectedMetrics := ` diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index fa8ce25e4f4..3eb753719ae 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -102,14 +102,14 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
return tracker } -func (t *Tracker) CompareLabels(currentLabels []string) bool { +func (t *Tracker) hasSameLabels(labels []string) bool { if t == nil { - return len(currentLabels) == 0 + return len(labels) == 0 } - if len(t.labels) != len(currentLabels) { + if len(t.labels) != len(labels) { return false } - for _, v := range currentLabels { + for _, v := range labels { if _, exists := t.index[v]; !exists { return false } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 2ff4a0c1e51..425694fb67f 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -17,7 +17,7 @@ import ( func Test_GetCALabels(t *testing.T) { tracker := newTestManager().Tracker("user1") - assert.True(t, tracker.CompareLabels([]string{"team"}), "Expected cost attribution labels mismatch") + assert.True(t, tracker.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") } func Test_CreateCleanupTracker(t *testing.T) { From f994034d4d41b67476b9add02d7e86df18c4ad16 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:16:25 +0100 Subject: [PATCH 017/105] remove the mapping logic since the slices are ordered --- pkg/costattribution/manager.go | 2 ++ pkg/costattribution/tracker.go | 11 ++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 3a185ae44ae..5c40018f538 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -116,6 +116,8 @@ func (m *Manager) updateTracker(userID string) *Tracker { t := m.Tracker(userID) newTrackedLabels := m.limits.CostAttributionLabels(userID) + + // sort the labels to ensure the order is consistent sort.Slice(newTrackedLabels, func(i, j int) bool { return newTrackedLabels[i] < newTrackedLabels[j] }) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 3eb753719ae..e07e9565b0d 100644 --- 
a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -4,6 +4,7 @@ package costattribution import ( "bytes" + "slices" "sort" "strings" "sync" @@ -106,15 +107,7 @@ func (t *Tracker) hasSameLabels(labels []string) bool { if t == nil { return len(labels) == 0 } - if len(t.labels) != len(labels) { - return false - } - for _, v := range labels { - if _, exists := t.index[v]; !exists { - return false - } - } - return true + return slices.Equal(t.labels, labels) } var bufferPool = sync.Pool{ From b060c093e6befcbc4302cd75438cbcbd5202a331 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:23:36 +0100 Subject: [PATCH 018/105] remove unnecessary tracker nil checking --- pkg/costattribution/manager.go | 5 +++-- pkg/costattribution/tracker.go | 6 ------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 5c40018f538..5127e991df6 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -109,12 +109,13 @@ func (m *Manager) deleteTracker(userID string) { } func (m *Manager) updateTracker(userID string) *Tracker { - if !m.EnabledForUser(userID) { + t := m.Tracker(userID) + + if t == nil { m.deleteTracker(userID) return nil } - t := m.Tracker(userID) newTrackedLabels := m.limits.CostAttributionLabels(userID) // sort the labels to ensure the order is consistent diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index e07e9565b0d..d808ad654c2 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -104,9 +104,6 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
} func (t *Tracker) hasSameLabels(labels []string) bool { - if t == nil { - return len(labels) == 0 - } return slices.Equal(t.labels, labels) } @@ -117,9 +114,6 @@ var bufferPool = sync.Pool{ } func (t *Tracker) cleanupTrackerAttribution(key string) { - if t == nil { - return - } t.obseveredMtx.Lock() defer t.obseveredMtx.Unlock() delete(t.observed, key) From e35a8d94f19b86cc801092f736698e97ffd4c356 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:26:14 +0100 Subject: [PATCH 019/105] fix linting --- pkg/costattribution/tracker.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index d808ad654c2..25b95bcc13f 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -306,9 +306,8 @@ func (t *Tracker) shouldDelete(deadline int64) bool { if t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { if len(t.observed) <= t.maxCardinality { return true - } else { - t.cooldownUntil.Store(deadline + t.cooldownDuration) } + t.cooldownUntil.Store(deadline + t.cooldownDuration) } return false } From 5cc0b5df279d8065a722384aad9f917a7002e9fe Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:42:20 +0100 Subject: [PATCH 020/105] refactor updateOverflow method --- pkg/costattribution/tracker.go | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 25b95bcc13f..5c107cf6979 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -201,7 +201,6 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - // Build the stream key for i, value := range labelValues { if i > 0 { @@ -213,12 +212,13 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre t.obseveredMtx.Lock() 
defer t.obseveredMtx.Unlock() - t.updateOverflow(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.updateObservations(buf.Bytes(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } -// handleObservation updates or creates a new stream observation in the 'observed' map. -func (t *Tracker) handleObservation(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - if o, known := t.observed[stream]; known && o.lastUpdate != nil { +// updateObservations updates or creates a new stream observation in the 'observed' map. +func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + if o, known := t.observed[string(key)]; known && o.lastUpdate != nil { // Update the timestamp if needed if o.lastUpdate.Load() < ts { o.lastUpdate.Store(ts) @@ -238,19 +238,13 @@ func (t *Tracker) handleObservation(stream string, ts int64, activeSeriesIncreme // If the ts is negative, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call // Otherwise create a new observation for the stream if ts >= 0 { - t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) } } } -func (t *Tracker) updateOverflow(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - // Update the stream in the observed map - t.handleObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) - t.handleOverflow(ts, 
activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) -} - -// handleOverflow checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. -func (t *Tracker) handleOverflow(ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { +// updateState checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. +func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { // Transition to overflow mode if maximum cardinality is exceeded. previousState := t.state if t.state == Normal && len(t.observed) > t.maxCardinality { @@ -287,8 +281,8 @@ func (t *Tracker) handleOverflow(ts int64, activeSeriesIncrement, receivedSample } // createNewObservation creates a new observation in the 'observed' map. -func (t *Tracker) createNewObservation(stream string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - t.observed[stream] = &Observation{ +func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + t.observed[string(key)] = &Observation{ lastUpdate: atomic.NewInt64(ts), activeSerie: atomic.NewFloat64(activeSeriesIncrement), receivedSample: atomic.NewFloat64(receivedSampleIncrement), @@ -296,9 +290,9 @@ func (t *Tracker) createNewObservation(stream string, ts int64, activeSeriesIncr discardSamplemtx: sync.Mutex{}, } if discardedSampleIncrement > 0 && reason != nil { - t.observed[stream].discardSamplemtx.Lock() - t.observed[stream].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) - t.observed[stream].discardSamplemtx.Unlock() + t.observed[string(key)].discardSamplemtx.Lock() + t.observed[string(key)].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + 
t.observed[string(key)].discardSamplemtx.Unlock() } } From 389dff038baa6ecf435669e1e8f1999d821968f6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:44:44 +0100 Subject: [PATCH 021/105] remove stream in comments --- pkg/costattribution/tracker.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 5c107cf6979..665da729ae4 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -140,7 +140,7 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) case Normal: - // Collect metrics for all observed streams + // Collect metrics for all observed keys t.obseveredMtx.RLock() defer t.obseveredMtx.RUnlock() for key, o := range t.observed { @@ -201,7 +201,7 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - // Build the stream key + // Build the observation key for i, value := range labelValues { if i > 0 { buf.WriteRune(sep) @@ -216,7 +216,7 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } -// updateObservations updates or creates a new stream observation in the 'observed' map. +// updateObservations updates or creates a new observation in the 'observed' map. 
func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { if o, known := t.observed[string(key)]; known && o.lastUpdate != nil { // Update the timestamp if needed @@ -236,7 +236,7 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } } else if len(t.observed) < t.maxCardinality*2 { // If the ts is negative, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call - // Otherwise create a new observation for the stream + // Otherwise create a new observation for the key if ts >= 0 { t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) } @@ -257,7 +257,7 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc totalDiscarded: atomic.NewFloat64(0), } - // Aggregate active series from all streams into the overflow counter. + // Aggregate active series from all keys into the overflow counter. 
for _, o := range t.observed { if o != nil { t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) From 116a69edebea6bc1ed2156ab49c06201c6bfa82b Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:48:58 +0100 Subject: [PATCH 022/105] make observation struct private --- pkg/costattribution/tracker.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 665da729ae4..e295bb45f44 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -25,7 +25,7 @@ const ( const sep = rune(0x80) -type Observation struct { +type observation struct { lastUpdate *atomic.Int64 activeSerie *atomic.Float64 receivedSample *atomic.Float64 @@ -45,10 +45,10 @@ type Tracker struct { failedActiveSeriesDecrement *prometheus.Desc overflowLabels []string obseveredMtx sync.RWMutex - observed map[string]*Observation + observed map[string]*observation hashBuffer []byte state TrackerState - overflowCounter *Observation + overflowCounter *observation cooldownUntil *atomic.Int64 totalFailedActiveSeries *atomic.Float64 cooldownDuration int64 @@ -76,7 +76,7 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. labels: trackedLabels, index: index, maxCardinality: limit, - observed: make(map[string]*Observation), + observed: make(map[string]*observation), hashBuffer: make([]byte, 0, 1024), cooldownDuration: int64(cooldown.Seconds()), logger: logger, @@ -250,7 +250,7 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc if t.state == Normal && len(t.observed) > t.maxCardinality { t.state = Overflow // Initialize the overflow counter. 
- t.overflowCounter = &Observation{ + t.overflowCounter = &observation{ lastUpdate: atomic.NewInt64(ts), activeSerie: atomic.NewFloat64(0), receivedSample: atomic.NewFloat64(0), @@ -282,7 +282,7 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc // createNewObservation creates a new observation in the 'observed' map. func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - t.observed[string(key)] = &Observation{ + t.observed[string(key)] = &observation{ lastUpdate: atomic.NewInt64(ts), activeSerie: atomic.NewFloat64(activeSeriesIncrement), receivedSample: atomic.NewFloat64(receivedSampleIncrement), From 9c30445a730191b1c38d8cd11ad93b3573d775ea Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 16:57:53 +0100 Subject: [PATCH 023/105] remove unnecessary pointers --- pkg/costattribution/tracker.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index e295bb45f44..3c28014c441 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -27,11 +27,11 @@ const sep = rune(0x80) type observation struct { lastUpdate *atomic.Int64 - activeSerie *atomic.Float64 - receivedSample *atomic.Float64 + activeSerie atomic.Float64 + receivedSample atomic.Float64 discardSamplemtx sync.Mutex - discardedSample map[string]*atomic.Float64 - totalDiscarded *atomic.Float64 + discardedSample map[string]atomic.Float64 + totalDiscarded atomic.Float64 } type Tracker struct { @@ -231,7 +231,7 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } if discardedSampleIncrement > 0 && reason != nil { o.discardSamplemtx.Lock() - o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + o.discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) 
o.discardSamplemtx.Unlock() } } else if len(t.observed) < t.maxCardinality*2 { @@ -252,9 +252,9 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc // Initialize the overflow counter. t.overflowCounter = &observation{ lastUpdate: atomic.NewInt64(ts), - activeSerie: atomic.NewFloat64(0), - receivedSample: atomic.NewFloat64(0), - totalDiscarded: atomic.NewFloat64(0), + activeSerie: *atomic.NewFloat64(0), + receivedSample: *atomic.NewFloat64(0), + totalDiscarded: *atomic.NewFloat64(0), } // Aggregate active series from all keys into the overflow counter. @@ -284,14 +284,14 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { t.observed[string(key)] = &observation{ lastUpdate: atomic.NewInt64(ts), - activeSerie: atomic.NewFloat64(activeSeriesIncrement), - receivedSample: atomic.NewFloat64(receivedSampleIncrement), - discardedSample: map[string]*atomic.Float64{}, + activeSerie: *atomic.NewFloat64(activeSeriesIncrement), + receivedSample: *atomic.NewFloat64(receivedSampleIncrement), + discardedSample: map[string]atomic.Float64{}, discardSamplemtx: sync.Mutex{}, } if discardedSampleIncrement > 0 && reason != nil { t.observed[string(key)].discardSamplemtx.Lock() - t.observed[string(key)].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + t.observed[string(key)].discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) t.observed[string(key)].discardSamplemtx.Unlock() } } From 88ef49e2c3af96eb4a775e3fa58128289b29c32e Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 17:03:54 +0100 Subject: [PATCH 024/105] rename discardSampleMtx to discardedSampleMtx --- pkg/costattribution/tracker.go | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git 
a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 3c28014c441..54f4b03d8bd 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -26,12 +26,12 @@ const ( const sep = rune(0x80) type observation struct { - lastUpdate *atomic.Int64 - activeSerie atomic.Float64 - receivedSample atomic.Float64 - discardSamplemtx sync.Mutex - discardedSample map[string]atomic.Float64 - totalDiscarded atomic.Float64 + lastUpdate *atomic.Int64 + activeSerie atomic.Float64 + receivedSample atomic.Float64 + discardedSampleMtx sync.Mutex + discardedSample map[string]atomic.Float64 + totalDiscarded atomic.Float64 } type Tracker struct { @@ -152,11 +152,11 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { if o.receivedSample.Load() > 0 { out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...) } - o.discardSamplemtx.Lock() + o.discardedSampleMtx.Lock() for reason, discarded := range o.discardedSample { out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...) 
} - o.discardSamplemtx.Unlock() + o.discardedSampleMtx.Unlock() } } if t.totalFailedActiveSeries.Load() > 0 { @@ -230,9 +230,9 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement o.receivedSample.Add(receivedSampleIncrement) } if discardedSampleIncrement > 0 && reason != nil { - o.discardSamplemtx.Lock() + o.discardedSampleMtx.Lock() o.discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) - o.discardSamplemtx.Unlock() + o.discardedSampleMtx.Unlock() } } else if len(t.observed) < t.maxCardinality*2 { // If the ts is negative, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call @@ -283,16 +283,16 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc // createNewObservation creates a new observation in the 'observed' map. func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { t.observed[string(key)] = &observation{ - lastUpdate: atomic.NewInt64(ts), - activeSerie: *atomic.NewFloat64(activeSeriesIncrement), - receivedSample: *atomic.NewFloat64(receivedSampleIncrement), - discardedSample: map[string]atomic.Float64{}, - discardSamplemtx: sync.Mutex{}, + lastUpdate: atomic.NewInt64(ts), + activeSerie: *atomic.NewFloat64(activeSeriesIncrement), + receivedSample: *atomic.NewFloat64(receivedSampleIncrement), + discardedSample: map[string]atomic.Float64{}, + discardedSampleMtx: sync.Mutex{}, } if discardedSampleIncrement > 0 && reason != nil { - t.observed[string(key)].discardSamplemtx.Lock() + t.observed[string(key)].discardedSampleMtx.Lock() t.observed[string(key)].discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) - t.observed[string(key)].discardSamplemtx.Unlock() + t.observed[string(key)].discardedSampleMtx.Unlock() } } From 130636a1feb5e425cd4993479e6e591f95a8ba72 Mon Sep 17 00:00:00 2001 From: Ying WANG 
Date: Thu, 19 Dec 2024 17:11:09 +0100 Subject: [PATCH 025/105] rename variable observedMtx because I write with feet --- pkg/costattribution/tracker.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 54f4b03d8bd..b6c0d6d4420 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -44,7 +44,7 @@ type Tracker struct { discardedSampleAttribution *prometheus.Desc failedActiveSeriesDecrement *prometheus.Desc overflowLabels []string - obseveredMtx sync.RWMutex + observedMtx sync.RWMutex observed map[string]*observation hashBuffer []byte state TrackerState @@ -114,8 +114,8 @@ var bufferPool = sync.Pool{ } func (t *Tracker) cleanupTrackerAttribution(key string) { - t.obseveredMtx.Lock() - defer t.obseveredMtx.Unlock() + t.observedMtx.Lock() + defer t.observedMtx.Unlock() delete(t.observed, key) } @@ -141,8 +141,8 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) 
case Normal: // Collect metrics for all observed keys - t.obseveredMtx.RLock() - defer t.obseveredMtx.RUnlock() + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() for key, o := range t.observed { keys := strings.Split(key, string(sep)) keys = append(keys, t.userID) @@ -209,8 +209,8 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre buf.WriteString(value) } - t.obseveredMtx.Lock() - defer t.obseveredMtx.Unlock() + t.observedMtx.Lock() + defer t.observedMtx.Unlock() t.updateObservations(buf.Bytes(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) @@ -309,8 +309,8 @@ func (t *Tracker) shouldDelete(deadline int64) bool { func (t *Tracker) inactiveObservations(deadline int64) []string { // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []string - t.obseveredMtx.RLock() - defer t.obseveredMtx.RUnlock() + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() for labkey, ob := range t.observed { if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { invalidKeys = append(invalidKeys, labkey) From b701ba7885fd2c843fe6fea7175526e2f70a0e2d Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 17:16:05 +0100 Subject: [PATCH 026/105] update test name dum dum --- pkg/costattribution/manager_test.go | 6 +++--- pkg/costattribution/tracker_test.go | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index b9f4f7b7eba..f4f085b0c4b 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -51,14 +51,14 @@ func newTestManager() *Manager { return manager } -func Test_NewManager(t *testing.T) { +func TestManager_New(t *testing.T) { manager := newTestManager() assert.NotNil(t, manager) assert.NotNil(t, 
manager.trackersByUserID) assert.Equal(t, 10*time.Second, manager.inactiveTimeout) } -func Test_CreateDeleteTracker(t *testing.T) { +func TestManager_CreateDeleteTracker(t *testing.T) { manager := newTestManager() t.Run("Tracker existence and attributes", func(t *testing.T) { @@ -148,7 +148,7 @@ func Test_CreateDeleteTracker(t *testing.T) { }) } -func Test_PurgeInactiveAttributionsUntil(t *testing.T) { +func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { manager := newTestManager() manager.Tracker("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 425694fb67f..2a52e7b285f 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -15,12 +15,12 @@ import ( "github.com/stretchr/testify/require" ) -func Test_GetCALabels(t *testing.T) { +func TestTracker_hasSameLabels(t *testing.T) { tracker := newTestManager().Tracker("user1") assert.True(t, tracker.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") } -func Test_CreateCleanupTracker(t *testing.T) { +func TestTracker_CreateDelete(t *testing.T) { tManager := newTestManager() tracker := tManager.Tracker("user4") @@ -75,7 +75,7 @@ func Test_CreateCleanupTracker(t *testing.T) { assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) } -func Test_UpdateCounters(t *testing.T) { +func TestTracker_updateCounters(t *testing.T) { tracker := newTestManager().Tracker("user3") lbls1 := labels.FromStrings("department", "foo", "service", "bar") lbls2 := labels.FromStrings("department", "bar", "service", "baz") @@ -96,7 +96,7 @@ func Test_UpdateCounters(t *testing.T) { assert.Equal(t, int64(3+tracker.cooldownDuration), tracker.cooldownUntil.Load(), "CooldownUntil should be updated correctly") } -func Test_GetInactiveObservations(t *testing.T) { +func TestTracker_inactiveObservations(t *testing.T) { // 
Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. tracker := newTestManager().Tracker("user1") @@ -129,7 +129,7 @@ func Test_GetInactiveObservations(t *testing.T) { assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } -func Test_Concurrency(t *testing.T) { +func TestTracker_Concurrency(t *testing.T) { m := newTestManager() tracker := m.Tracker("user1") From dccd9c8669aa690c0400bc48cea8950511084c4f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 17:18:12 +0100 Subject: [PATCH 027/105] remove test result --- pkg/distributor/allcase.txt | 90 ------------------------------------- 1 file changed, 90 deletions(-) delete mode 100644 pkg/distributor/allcase.txt diff --git a/pkg/distributor/allcase.txt b/pkg/distributor/allcase.txt deleted file mode 100644 index 5efb38bff35..00000000000 --- a/pkg/distributor/allcase.txt +++ /dev/null @@ -1,90 +0,0 @@ -goos: darwin -goarch: amd64 -pkg: github.com/grafana/mimir/pkg/distributor -cpu: Intel(R) Core(TM) i5-1038NG7 CPU @ 2.00GHz -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 292 4093113 ns/op 1137807 B/op 5058 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 295 4286668 ns/op 1136742 B/op 5057 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 258 4621600 ns/op 1137652 B/op 5058 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 300 4381770 ns/op 1137330 B/op 5058 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 306 3978604 ns/op 1138153 B/op 5058 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 303 3889851 ns/op 1136827 B/op 5058 allocs/op 
-BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 217 5309972 ns/op 1218313 B/op 6059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 223 5308695 ns/op 1218015 B/op 6059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 225 5686183 ns/op 1220126 B/op 6060 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 222 5320854 ns/op 1219277 B/op 6059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 224 5362158 ns/op 1218447 B/op 6059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 222 5352613 ns/op 1218641 B/op 6060 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 720 1637728 ns/op 324601 B/op 4054 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 668 1699484 ns/op 324867 B/op 4054 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 704 1650014 ns/op 324865 B/op 4054 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 697 1678209 ns/op 324811 B/op 4054 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 712 1679228 ns/op 324811 B/op 4054 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 720 1650075 ns/op 325052 B/op 4054 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 312 3780976 ns/op 1571034 B/op 7090 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 308 3830179 ns/op 1572930 B/op 7104 allocs/op 
-BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 308 3778948 ns/op 1567952 B/op 7089 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 288 4163770 ns/op 1559790 B/op 7088 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 310 3775677 ns/op 1565793 B/op 7093 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 309 4826310 ns/op 1566713 B/op 7091 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 652 1911060 ns/op 165520 B/op 79 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 657 1825805 ns/op 167283 B/op 79 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 631 1823762 ns/op 166046 B/op 81 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 639 1800926 ns/op 167361 B/op 84 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 645 1801281 ns/op 165645 B/op 79 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 646 1813022 ns/op 166700 B/op 79 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1327 906046 ns/op 2407 B/op 43 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1261 894881 ns/op 2523 B/op 43 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1237 905868 ns/op 2527 B/op 43 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1347 883890 ns/op 2510 B/op 43 allocs/op 
-BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1269 880076 ns/op 2520 B/op 43 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1333 884934 ns/op 2484 B/op 43 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 6823420 ns/op 1201064 B/op 5059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 5941364 ns/op 1201755 B/op 5059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 202 6066547 ns/op 1200638 B/op 5058 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 193 5998870 ns/op 1201690 B/op 5059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 201 5828347 ns/op 1201056 B/op 5059 allocs/op -BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 193 5906302 ns/op 1200750 B/op 5059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 291 4090687 ns/op 1590964 B/op 8098 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 291 4113064 ns/op 1589749 B/op 8091 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 265 4166235 ns/op 1583910 B/op 8096 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 283 4157170 ns/op 1583275 B/op 8099 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 237 4237111 ns/op 1586094 B/op 8093 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 285 4207373 ns/op 1585480 B/op 8095 
allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 550 2176540 ns/op 183504 B/op 1081 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 502 2186461 ns/op 183481 B/op 1080 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 526 2187088 ns/op 181204 B/op 1080 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 504 2205968 ns/op 182120 B/op 1079 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 531 2192123 ns/op 182981 B/op 1079 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 525 2195721 ns/op 182929 B/op 1080 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1226 986827 ns/op 2559 B/op 45 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1179 980126 ns/op 2446 B/op 45 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1222 971585 ns/op 2496 B/op 45 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1154 983680 ns/op 2541 B/op 45 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1215 959667 ns/op 2529 B/op 45 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1222 983919 ns/op 2558 B/op 45 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 181 10726471 ns/op 1226302 B/op 7062 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 170 7175109 ns/op 1224269 B/op 7060 allocs/op 
-BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 184 6481711 ns/op 1225092 B/op 7060 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 182 6501399 ns/op 1224896 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 181 7033662 ns/op 1225391 B/op 7060 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 177 6617141 ns/op 1224477 B/op 7060 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4385703 ns/op 1162346 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 272 4401598 ns/op 1161965 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4378841 ns/op 1161221 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 266 4438176 ns/op 1161650 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 268 4528658 ns/op 1161541 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 264 4430113 ns/op 1161600 B/op 7059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 6302555 ns/op 1243108 B/op 8060 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 5960008 ns/op 1241662 B/op 8059 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 199 6671300 ns/op 1243085 B/op 8061 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 202 5823528 ns/op 1241662 B/op 8060 
allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 208 5834922 ns/op 1241914 B/op 8060 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 206 5758215 ns/op 1242172 B/op 8060 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 554 2115840 ns/op 348972 B/op 6055 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 564 2145631 ns/op 348762 B/op 6055 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 566 2088044 ns/op 349132 B/op 6055 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 562 2152042 ns/op 349683 B/op 6055 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 544 2103713 ns/op 348848 B/op 6055 allocs/op -BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 531 2125180 ns/op 349253 B/op 6055 allocs/op -PASS -ok github.com/grafana/mimir/pkg/distributor 176.572s From eebd0288658ce4e2c9447d89f5efc8f96e0878ee Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 19 Dec 2024 17:31:01 +0100 Subject: [PATCH 028/105] address doc change --- cmd/mimir/config-descriptor.json | 6 +++--- cmd/mimir/help-all.txt.tmpl | 6 +++--- .../configuration-parameters/index.md | 20 +++++++++---------- pkg/mimir/mimir.go | 2 +- pkg/util/validation/limits.go | 4 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 1334a1b047e..4a6ec05c420 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4372,7 +4372,7 @@ "kind": "field", "name": "cost_attribution_labels", "required": false, - "desc": "Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. 
Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.", + "desc": "Defines labels for cost attribution. Applies to metrics like cortex_distributor_attributed_received_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.", "fieldValue": null, "fieldDefaultValue": "", "fieldFlag": "validation.cost-attribution-labels", @@ -4405,7 +4405,7 @@ "kind": "field", "name": "cost_attribution_cooldown", "required": false, - "desc": "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.", + "desc": "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.", "fieldValue": null, "fieldDefaultValue": 0, "fieldFlag": "validation.cost-attribution-cooldown", @@ -19699,7 +19699,7 @@ "kind": "field", "name": "cost_attribution_registry_path", "required": false, - "desc": "Defines a custom path for the registry. 
When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", + "desc": "Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed.", "fieldValue": null, "fieldDefaultValue": "", "fieldFlag": "cost-attribution.registry-path", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 0324a354ceb..410b0ead6f1 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1286,7 +1286,7 @@ Usage of ./cmd/mimir/mimir: -cost-attribution.eviction-interval duration [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. (default 20m0s) -cost-attribution.registry-path string - [experimental] Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. + [experimental] Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -3322,9 +3322,9 @@ Usage of ./cmd/mimir/mimir: -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") -validation.cost-attribution-cooldown duration - [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. 
If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit. + [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit. -validation.cost-attribution-labels comma-separated-list-of-strings - [experimental] Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}. + [experimental] Defines labels for cost attribution. Applies to metrics like cortex_distributor_attributed_received_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. 
(default 10m) -validation.enforce-metadata-metric-name diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 137e88d3fc1..0b6903162e7 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -463,8 +463,8 @@ overrides_exporter: [cost_attribution_eviction_interval: | default = 20m] # (experimental) Defines a custom path for the registry. When specified, Mimir -# will expose cost attribution metrics through this custom path, if not -# specified, cost attribution metrics won't be exposed. +# exposes cost attribution metrics through this custom path. If not specified, +# cost attribution metrics aren't exposed. # CLI flag: -cost-attribution.registry-path [cost_attribution_registry_path: | default = ""] ``` @@ -3581,9 +3581,9 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -querier.active-series-results-max-size-bytes [active_series_results_max_size_bytes: | default = 419430400] -# (experimental) Defines labels for cost attribution, applied to metrics like -# cortex_distributor_attributed_received_samples_total. Set to an empty string -# to disable. Example: 'team,service' will produce metrics such as +# (experimental) Defines labels for cost attribution. Applies to metrics like +# cortex_distributor_attributed_received_samples_total. To disable, set to an +# empty string. For example, 'team,service' produces metrics such as # cortex_distributor_attributed_received_samples_total{team='frontend', # service='api'}. # CLI flag: -validation.cost-attribution-labels @@ -3600,11 +3600,11 @@ The `limits` block configures default and per-tenant limits imposed by component # (experimental) Cooldown period for cost attribution labels. Specifies the # duration the cost attribution remains in overflow before attempting a reset. 
-# If the cardinality remains above the limit after this period, the system will -# stay in overflow mode and extend the cooldown. Setting this value to 0 -# disables the cooldown, causing the system to continuously check whether the -# cardinality has dropped below the limit. A reset will occur once the -# cardinality falls below the limit. +# If the cardinality remains above the limit after this period, the system stays +# in overflow mode and extends the cooldown. Setting this value to 0 disables +# the cooldown, causing the system to continuously check whether the cardinality +# has dropped below the limit. A reset occurs when the cardinality falls below +# the limit. # CLI flag: -validation.cost-attribution-cooldown [cost_attribution_cooldown: | default = 0s] diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 31baea29e7e..d5eddc4bf35 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -177,7 +177,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") - f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") + f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. 
When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed.") f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") c.API.RegisterFlags(f) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 9fc26f99b71..e54f0ccd2af 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -309,10 +309,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") - f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution, applied to metrics like cortex_distributor_attributed_received_samples_total. Set to an empty string to disable. Example: 'team,service' will produce metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_distributor_attributed_received_samples_total. To disable, set to an empty string. 
For example, 'team,service' produces metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.") f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") - f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.") + f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") From 83865039abbf656ee3f1ba9b487cb8b96090336f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 12:02:33 +0100 Subject: [PATCH 029/105] remove time checking --- pkg/costattribution/tracker.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index b6c0d6d4420..33f06d2165b 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -219,10 +219,7 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre // updateObservations updates or creates a new observation in the 'observed' map. 
func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { if o, known := t.observed[string(key)]; known && o.lastUpdate != nil { - // Update the timestamp if needed - if o.lastUpdate.Load() < ts { - o.lastUpdate.Store(ts) - } + o.lastUpdate.Store(ts) if activeSeriesIncrement != 0 { o.activeSerie.Add(activeSeriesIncrement) } From d8f1e9b01e4a38a1a667d6679657521418a6c600 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 12:09:56 +0100 Subject: [PATCH 030/105] add createIfDoesNotExist parameter --- pkg/costattribution/tracker.go | 22 ++++++++++------------ pkg/costattribution/tracker_test.go | 10 +++++----- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 33f06d2165b..84cd17ad5b3 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -123,14 +123,14 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil) + t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil, true) } func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { if t == nil { return } - t.updateCounters(lbs, -1, -1, 0, 0, nil) + t.updateCounters(lbs, -1, -1, 0, 0, nil, false) } func (t *Tracker) Collect(out chan<- prometheus.Metric) { @@ -168,14 +168,14 @@ func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, re if t == nil { return } - t.updateCounters(lbs, now.Unix(), 0, 0, value, &reason) + t.updateCounters(lbs, now.Unix(), 0, 0, value, &reason, true) } func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), 0, value, 0, nil) + t.updateCounters(lbs, now.Unix(), 0, value, 0, nil, true) } func (t *Tracker) IncrementActiveSeriesFailure(value float64) { @@ -185,7 +185,7 @@ 
func (t *Tracker) IncrementActiveSeriesFailure(value float64) { t.totalFailedActiveSeries.Add(value) } -func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { +func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { labelValues := make([]string, len(t.labels)) lbls.Range(func(l labels.Label) { if idx, ok := t.index[l.Name]; ok { @@ -212,12 +212,12 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre t.observedMtx.Lock() defer t.observedMtx.Unlock() - t.updateObservations(buf.Bytes(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.updateObservations(buf.Bytes(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } // updateObservations updates or creates a new observation in the 'observed' map. 
-func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { +func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { if o, known := t.observed[string(key)]; known && o.lastUpdate != nil { o.lastUpdate.Store(ts) if activeSeriesIncrement != 0 { @@ -231,12 +231,10 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement o.discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) o.discardedSampleMtx.Unlock() } - } else if len(t.observed) < t.maxCardinality*2 { - // If the ts is negative, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call + } else if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { + // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call // Otherwise create a new observation for the key - if ts >= 0 { - t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) - } + t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) } } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 2a52e7b285f..1172863ecd4 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -81,16 +81,16 @@ func TestTracker_updateCounters(t *testing.T) { lbls2 := labels.FromStrings("department", "bar", "service", "baz") lbls3 := labels.FromStrings("department", "baz", "service", "foo") - tracker.updateCounters(lbls1, 1, 1, 0, 0, nil) + tracker.updateCounters(lbls1, 1, 1, 0, 0, nil, true) assert.Equal(t, Normal, tracker.state, "First observation, should not 
overflow") - tracker.updateCounters(lbls2, 2, 1, 0, 0, nil) + tracker.updateCounters(lbls2, 2, 1, 0, 0, nil, true) assert.Equal(t, Normal, tracker.state, "Second observation, should not overflow") - tracker.updateCounters(lbls3, 3, 1, 0, 0, nil) + tracker.updateCounters(lbls3, 3, 1, 0, 0, nil, true) assert.Equal(t, Overflow, tracker.state, "Third observation, should overflow") - tracker.updateCounters(lbls3, 4, 1, 0, 0, nil) + tracker.updateCounters(lbls3, 4, 1, 0, 0, nil, true) assert.Equal(t, Overflow, tracker.state, "Fourth observation, should stay overflow") assert.Equal(t, int64(3+tracker.cooldownDuration), tracker.cooldownUntil.Load(), "CooldownUntil should be updated correctly") @@ -139,7 +139,7 @@ func TestTracker_Concurrency(t *testing.T) { go func(i int) { defer wg.Done() lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) - tracker.updateCounters(lbls, int64(i), 1, 0, 0, nil) + tracker.updateCounters(lbls, int64(i), 1, 0, 0, nil, true) }(i) } wg.Wait() From b9efb943d9b16e4806d7e794ddae4b425668dc3c Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 12:23:53 +0100 Subject: [PATCH 031/105] add more condition for trigger newTracker --- pkg/costattribution/manager.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 5127e991df6..2c74e8f8e94 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -123,7 +123,8 @@ func (m *Manager) updateTracker(userID string) *Tracker { return newTrackedLabels[i] < newTrackedLabels[j] }) - if !t.hasSameLabels(newTrackedLabels) { + // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker + if !t.hasSameLabels(newTrackedLabels) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != int64(m.limits.CostAttributionCooldown(userID).Seconds()) { m.mtx.Lock() t = newTracker(userID, 
newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t @@ -131,15 +132,6 @@ func (m *Manager) updateTracker(userID string) *Tracker { return t } - maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - if t.maxCardinality != maxCardinality { - t.maxCardinality = maxCardinality - } - - cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) - if cooldown != t.cooldownDuration { - t.cooldownDuration = cooldown - } return t } From a37e6deb52de09f5cdddba3573bbbe9bf279a39c Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 13:34:57 +0100 Subject: [PATCH 032/105] remove the label adapter to labels call --- pkg/costattribution/manager_test.go | 23 ++++++------ pkg/costattribution/tracker.go | 57 +++++++++++++++++++++++------ pkg/costattribution/tracker_test.go | 13 ++++--- pkg/distributor/distributor.go | 6 +-- pkg/distributor/validate.go | 32 ++++++++-------- pkg/ingester/ingester.go | 30 +++++++-------- 6 files changed, 99 insertions(+), 62 deletions(-) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index f4f085b0c4b..bf111790e9e 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -10,9 +10,9 @@ import ( "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" + "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/validation" ) @@ -76,9 +76,9 @@ func TestManager_CreateDeleteTracker(t *testing.T) { }) t.Run("Metrics tracking", func(t *testing.T) { - manager.Tracker("user1").IncrementDiscardedSamples(labels.FromStrings("team", "bar"), 1, "invalid-metrics-name", time.Unix(6, 0)) - manager.Tracker("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, 
"invalid-metrics-name", time.Unix(12, 0)) - manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) + manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}}, 1, "invalid-metrics-name", time.Unix(6, 0)) + manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "dodo"}}, 1, time.Unix(20, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. @@ -126,7 +126,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { assert.Equal(t, 1, len(manager.trackersByUserID)) assert.True(t, manager.Tracker("user3").hasSameLabels([]string{"feature", "team"})) - manager.Tracker("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) + manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(13, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
# TYPE cortex_discarded_attributed_samples_total counter @@ -136,9 +136,10 @@ func TestManager_CreateDeleteTracker(t *testing.T) { }) t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { - manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) - manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) - manager.Tracker("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) + + manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}, {Name: "feature", Value: "bar"}}, 1, time.Unix(15, 0)) + manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "baz"}, {Name: "feature", Value: "baz"}}, 1, time.Unix(16, 0)) + manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}, {Name: "feature", Value: "foo"}}, 1, time.Unix(17, 0)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter @@ -151,9 +152,9 @@ func TestManager_CreateDeleteTracker(t *testing.T) { func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { manager := newTestManager() - manager.Tracker("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) - manager.Tracker("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) - manager.Tracker("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) + manager.Tracker("user1").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, time.Unix(1, 0)) + manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}}, 1, "out-of-window", time.Unix(10, 0)) t.Run("Purge before inactive timeout", func(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix())) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 84cd17ad5b3..13ccf4a3d50 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -11,6 +11,7 @@ import ( "time" "github.com/go-kit/log" + "github.com/grafana/mimir/pkg/mimirpb" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/model/labels" "go.uber.org/atomic" @@ -164,18 +165,18 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { } } -func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { +func (t *Tracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), 0, 0, 
value, &reason, true) + t.updateCountersWithLabelAdapter(lbs, now.Unix(), 0, 0, value, &reason, true) } -func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { +func (t *Tracker) IncrementReceivedSamples(lbs []mimirpb.LabelAdapter, value float64, now time.Time) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), 0, value, 0, nil, true) + t.updateCountersWithLabelAdapter(lbs, now.Unix(), 0, value, 0, nil, true) } func (t *Tracker) IncrementActiveSeriesFailure(value float64) { @@ -185,23 +186,55 @@ func (t *Tracker) IncrementActiveSeriesFailure(value float64) { t.totalFailedActiveSeries.Add(value) } -func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - labelValues := make([]string, len(t.labels)) - lbls.Range(func(l labels.Label) { - if idx, ok := t.index[l.Name]; ok { - labelValues[idx] = l.Value +func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + extractValues := func() []string { + labelValues := make([]string, len(t.labels)) + for _, l := range lbls { + if idx, ok := t.index[l.Name]; ok { + labelValues[idx] = l.Value + } } - }) + return labelValues + } + t.updateCountersCommon(extractValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) +} + +func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + extractValues := func() []string { + labelValues := make([]string, len(t.labels)) + lbls.Range(func(l labels.Label) { + if idx, ok := t.index[l.Name]; ok { + labelValues[idx] = l.Value + } + }) + return labelValues + } + 
t.updateCountersCommon(extractValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) +} + +func (t *Tracker) updateCountersCommon( + extractValues func() []string, + ts int64, + activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, + reason *string, + createIfDoesNotExist bool, +) { + // Extract label values + labelValues := extractValues() + + // Fill missing label values for i := 0; i < len(labelValues); i++ { if labelValues[i] == "" { labelValues[i] = missingValue } } + // Reuse buffer from pool for building the observation key buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - // Build the observation key + + // Construct the observation key by joining label values for i, value := range labelValues { if i > 0 { buf.WriteRune(sep) @@ -209,9 +242,11 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre buf.WriteString(value) } + // Lock access to the observation map t.observedMtx.Lock() defer t.observedMtx.Unlock() + // Update observations and state t.updateObservations(buf.Bytes(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 1172863ecd4..025d0ad96a2 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -8,6 +8,7 @@ import ( "testing" "time" + "github.com/grafana/mimir/pkg/mimirpb" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" @@ -31,8 +32,8 @@ func TestTracker_CreateDelete(t *testing.T) { tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) 
tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) tracker.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) - tracker.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) - tracker.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) + tracker.IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 5, time.Unix(4, 0)) + tracker.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) tracker.IncrementActiveSeriesFailure(1) @@ -101,10 +102,10 @@ func TestTracker_inactiveObservations(t *testing.T) { tracker := newTestManager().Tracker("user1") // Create two observations with different last update timestamps. - observations := []labels.Labels{ - labels.FromStrings("team", "foo"), - labels.FromStrings("team", "bar"), - labels.FromStrings("team", "baz"), + observations := [][]mimirpb.LabelAdapter{ + {{Name: "team", Value: "foo"}}, + {{Name: "team", Value: "bar"}}, + {{Name: "team", Value: "baz"}}, } // Simulate samples discarded with different timestamps. 
tracker.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index c6aea2a7dd7..627aa6d1b51 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -988,7 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) + d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1247,7 +1247,7 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { if len(req.Timeseries) > 0 { - d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(validatedSamples), reasonRateLimited, now) } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) @@ -1832,7 +1832,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), 
float64(receivedSamples), mtime.Now()) + d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(ts.Labels, float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 5b6775cdf9f..d48f91ff8bc 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -242,14 +242,14 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -262,21 +262,21 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // It uses the passed 'now' time to measure the relative time of the sample. 
func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -290,7 +290,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) + 
cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -298,7 +298,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -403,13 +403,13 @@ func removeNonASCIIChars(in string) (out string) { func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } @@ -418,13 +418,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if 
strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerInfoSeries, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -436,22 +436,22 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabelValue, ts) + cat.IncrementDiscardedSamples(ls, 1, 
reasonInvalidLabelValue, ts) m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, strings.ToValidUTF8(l.Value, ""), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelValueTooLong, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { - cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index de6f02a53af..2291981bd26 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1202,56 +1202,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) + 
i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, 
reasonPerUserSeriesLimit, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1259,35 +1259,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() 
softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { 
stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1437,7 +1437,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1457,7 +1457,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { From 211b3a2ca5a0ecab5665b462647436adc95b97f9 Mon Sep 17 
00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 13:46:38 +0100 Subject: [PATCH 033/105] remove useless function dum dum --- pkg/ingester/activeseries/active_series.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 6c06a62e162..46d3fd197be 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -107,8 +107,9 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { } func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { - currentCTC, currentCAT := c.CurrentConfig() - return ctCfg.String() != currentCTC.String() || caCfg != currentCAT + c.configMutex.RLock() + defer c.configMutex.RUnlock() + return ctCfg.String() != c.matchers.Config().String() || caCfg != c.cat } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { @@ -122,12 +123,6 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { c.lastConfigUpdate = now } -func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattribution.Tracker) { - c.configMutex.RLock() - defer c.configMutex.RUnlock() - return c.matchers.Config(), c.cat -} - // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. 
func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { From f697e6f59d728da39f120bf2b464639264820f1f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 13:48:37 +0100 Subject: [PATCH 034/105] make hardcoded increment value --- pkg/costattribution/tracker.go | 4 ++-- pkg/costattribution/tracker_test.go | 2 +- pkg/ingester/activeseries/active_series.go | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 13ccf4a3d50..5057c26ad3f 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -179,11 +179,11 @@ func (t *Tracker) IncrementReceivedSamples(lbs []mimirpb.LabelAdapter, value flo t.updateCountersWithLabelAdapter(lbs, now.Unix(), 0, value, 0, nil, true) } -func (t *Tracker) IncrementActiveSeriesFailure(value float64) { +func (t *Tracker) IncrementActiveSeriesFailure() { if t == nil { return } - t.totalFailedActiveSeries.Add(value) + t.totalFailedActiveSeries.Add(1) } func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 025d0ad96a2..08ad7d66449 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -35,7 +35,7 @@ func TestTracker_CreateDelete(t *testing.T) { tracker.IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 5, time.Unix(4, 0)) tracker.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), 
time.Unix(6, 0)) - tracker.IncrementActiveSeriesFailure(1) + tracker.IncrementActiveSeriesFailure() expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 46d3fd197be..c2732cbbfd6 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -463,9 +463,9 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { // we need to increment the active series failure count. if s.cat != nil { if idx == nil { - s.cat.IncrementActiveSeriesFailure(1) + s.cat.IncrementActiveSeriesFailure() } else if err := idx.Series(ref, &buf, nil); err != nil { - s.cat.IncrementActiveSeriesFailure(1) + s.cat.IncrementActiveSeriesFailure() } else { s.cat.DecrementActiveSeries(buf.Labels()) } @@ -522,11 +522,11 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.active-- if s.cat != nil { if idx == nil { - s.cat.IncrementActiveSeriesFailure(1) + s.cat.IncrementActiveSeriesFailure() } else { buf := labels.NewScratchBuilder(128) if err := idx.Series(ref, &buf, nil); err != nil { - s.cat.IncrementActiveSeriesFailure(1) + s.cat.IncrementActiveSeriesFailure() } else { s.cat.DecrementActiveSeries(buf.Labels()) } From fe8a1e5565b1cc7b506fe31a64db5ec01b7a31e4 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 14:09:47 +0100 Subject: [PATCH 035/105] rename + make cooldownuntil a normal int64 and lock with observedMtx --- pkg/costattribution/manager.go | 2 +- pkg/costattribution/tracker.go | 25 +++++++++++++++++++------ pkg/costattribution/tracker_test.go | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 2c74e8f8e94..8ba8c264226 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -154,7 +154,7 
@@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { t.cleanupTrackerAttribution(key) } - if t.shouldDelete(deadline) { + if t.recoverFromOverflow(deadline) { m.deleteTracker(userID) } } diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 5057c26ad3f..0975779bba3 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -45,12 +45,12 @@ type Tracker struct { discardedSampleAttribution *prometheus.Desc failedActiveSeriesDecrement *prometheus.Desc overflowLabels []string - observedMtx sync.RWMutex observed map[string]*observation + observedMtx sync.RWMutex + cooldownUntil int64 hashBuffer []byte state TrackerState overflowCounter *observation - cooldownUntil *atomic.Int64 totalFailedActiveSeries *atomic.Float64 cooldownDuration int64 logger log.Logger @@ -293,7 +293,7 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) } } - t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) + t.cooldownUntil = ts + t.cooldownDuration } if t.state == Overflow { @@ -326,12 +326,25 @@ func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncreme } } -func (t *Tracker) shouldDelete(deadline int64) bool { - if t.cooldownUntil != nil && t.cooldownUntil.Load() < deadline { +func (t *Tracker) recoverFromOverflow(deadline int64) bool { + t.observedMtx.RLock() + if t.cooldownUntil != 0 && t.cooldownUntil < deadline { + if len(t.observed) <= t.maxCardinality { + t.observedMtx.RUnlock() + return true + } + t.observedMtx.RUnlock() + + // Increase the cooldown duration if the number of observations is still above the max cardinality + t.observedMtx.Lock() if len(t.observed) <= t.maxCardinality { + t.observedMtx.Unlock() return true } - t.cooldownUntil.Store(deadline + t.cooldownDuration) + t.cooldownUntil = deadline + t.cooldownDuration + t.observedMtx.Unlock() + } else { + t.observedMtx.RUnlock() } 
return false } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 08ad7d66449..bd5360f5552 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -94,7 +94,7 @@ func TestTracker_updateCounters(t *testing.T) { tracker.updateCounters(lbls3, 4, 1, 0, 0, nil, true) assert.Equal(t, Overflow, tracker.state, "Fourth observation, should stay overflow") - assert.Equal(t, int64(3+tracker.cooldownDuration), tracker.cooldownUntil.Load(), "CooldownUntil should be updated correctly") + assert.Equal(t, int64(3+tracker.cooldownDuration), tracker.cooldownUntil, "CooldownUntil should be updated correctly") } func TestTracker_inactiveObservations(t *testing.T) { From 8b5836f3ac546913dad152c0f70cb71fd6745518 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 14:18:37 +0100 Subject: [PATCH 036/105] use build-in functon dum dum --- pkg/costattribution/manager.go | 6 ++---- pkg/costattribution/tracker.go | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 8ba8c264226..4586b444dee 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -4,7 +4,7 @@ package costattribution import ( "context" - "sort" + "slices" "sync" "time" @@ -119,9 +119,7 @@ func (m *Manager) updateTracker(userID string) *Tracker { newTrackedLabels := m.limits.CostAttributionLabels(userID) // sort the labels to ensure the order is consistent - sort.Slice(newTrackedLabels, func(i, j int) bool { - return newTrackedLabels[i] < newTrackedLabels[j] - }) + slices.Sort(newTrackedLabels) // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker if !t.hasSameLabels(newTrackedLabels) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != int64(m.limits.CostAttributionCooldown(userID).Seconds()) { diff --git 
a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 0975779bba3..f83f7e66746 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -5,7 +5,6 @@ package costattribution import ( "bytes" "slices" - "sort" "strings" "sync" "time" @@ -57,9 +56,7 @@ type Tracker struct { } func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *Tracker { - sort.Slice(trackedLabels, func(i, j int) bool { - return trackedLabels[i] < trackedLabels[j] - }) + slices.Sort(trackedLabels) // Create a map for fast lookup, and overflow labels to export when overflow happens index := make(map[string]int, len(trackedLabels)) From 888d8b06231e3e67d1cb7ccf38bac0b4275a95c9 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 14:21:51 +0100 Subject: [PATCH 037/105] modify the copy of calabels instead of directly the slice --- pkg/costattribution/manager.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 4586b444dee..6dcf9bc8728 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -116,7 +116,10 @@ func (m *Manager) updateTracker(userID string) *Tracker { return nil } - newTrackedLabels := m.limits.CostAttributionLabels(userID) + lbls := m.limits.CostAttributionLabels(userID) + + newTrackedLabels := make([]string, 0, len(lbls)) + copy(newTrackedLabels, lbls) // sort the labels to ensure the order is consistent slices.Sort(newTrackedLabels) From b15b4871dece3d14b77e1856f1f53052f3cb2a88 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 15:55:35 +0100 Subject: [PATCH 038/105] update mimir-prometheus --- go.mod | 6 +- go.sum | 16 +- pkg/costattribution/tracker.go | 4 + pkg/ingester/activeseries/active_series.go | 16 +- pkg/ingester/ingester.go | 22 +- pkg/ingester/user_tsdb.go | 6 +- .../prometheus/common/config/http_config.go | 48 +-- 
.../prometheus/common/expfmt/encode.go | 4 +- .../prometheus/common/expfmt/expfmt.go | 4 +- .../common/expfmt/openmetrics_create.go | 4 +- .../prometheus/common/expfmt/text_parse.go | 2 +- .../prometheus/common/model/alert.go | 7 +- .../prometheus/common/model/metric.go | 31 +- .../prometheus/common/model/silence.go | 17 +- .../prometheus/common/model/value_float.go | 3 +- .../common/model/value_histogram.go | 7 +- .../prometheus/common/promslog/slog.go | 19 +- .../prometheus/common/version/info.go | 8 + .../prometheus/prometheus/config/config.go | 44 ++- .../prometheus/model/exemplar/exemplar.go | 2 +- .../prometheus/model/labels/labels.go | 6 + .../prometheus/model/labels/labels_common.go | 6 +- .../model/labels/labels_dedupelabels.go | 5 + .../model/labels/labels_stringlabels.go | 5 + .../model/textparse/openmetricsparse.go | 2 +- .../prometheus/prometheus/promql/functions.go | 13 +- .../promql/parser/generated_parser.y | 4 +- .../promql/parser/generated_parser.y.go | 333 +++++++++--------- .../prometheus/promql/parser/parse.go | 3 +- .../prometheus/promql/parser/printer.go | 17 +- .../prometheus/promql/promqltest/test.go | 38 +- .../promql/promqltest/testdata/functions.test | 29 ++ .../prometheus/prometheus/rules/group.go | 62 +++- .../prometheus/prometheus/rules/manager.go | 6 + .../prometheus/prometheus/scrape/scrape.go | 1 + .../prometheus/normalize_name.go | 31 +- .../prometheusremotewrite/helper.go | 2 +- .../storage/remote/queue_manager.go | 4 +- .../prometheus/prometheus/tsdb/head.go | 2 +- .../prometheus/prometheus/tsdb/head_read.go | 4 + .../prometheus/tsdb/index/postings.go | 7 +- .../prometheus/prometheus/tsdb/querier.go | 2 +- .../prometheus/util/logging/dedupe.go | 5 +- vendor/modules.txt | 8 +- 44 files changed, 502 insertions(+), 363 deletions(-) diff --git a/go.mod b/go.mod index 555bbc18a00..b63868994e1 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ require ( github.com/prometheus/alertmanager v0.27.0 github.com/prometheus/client_golang 
v1.20.5 github.com/prometheus/client_model v0.6.1 - github.com/prometheus/common v0.60.1 + github.com/prometheus/common v0.61.0 github.com/prometheus/prometheus v1.99.0 github.com/segmentio/fasthash v1.0.3 github.com/sirupsen/logrus v1.9.3 @@ -49,7 +49,7 @@ require ( golang.org/x/net v0.32.0 golang.org/x/sync v0.10.0 golang.org/x/time v0.8.0 - google.golang.org/grpc v1.67.1 + google.golang.org/grpc v1.68.1 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 ) @@ -285,7 +285,7 @@ require ( ) // Using a fork of Prometheus with Mimir-specific changes. -replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520 +replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 // Replace memberlist with our fork which includes some fixes that haven't been // merged upstream yet: diff --git a/go.sum b/go.sum index 44c23833928..1b0058e0664 100644 --- a/go.sum +++ b/go.sum @@ -931,8 +931,8 @@ github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20230310173818-32f1caf87195/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= -github.com/cncf/xds/go v0.0.0-20240723142845-024c85f92f20 h1:N+3sFI5GUjRKBi+i0TxYVST9h4Ie192jJWpHvthBBgg= -github.com/cncf/xds/go v0.0.0-20240723142845-024c85f92f20/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI= +github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 h1:FpZSn61BWXbtyH68+uSv416veEswX1M2HRyQfdHnOyQ= 
github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM= @@ -1279,8 +1279,8 @@ github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40 h1:1TeKhyS+pvzO github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40/go.mod h1:IGRj8oOoxwJbHBYl1+OhS9UjQR0dv6SQOep7HqmtyFU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe h1:yIXAAbLswn7VNWBIvM71O2QsgfgW9fRXZNR0DXe6pDU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520 h1:FADazl5oVYBARbfVMtLkPQ9IfIwhiE9lrPrKNPOHBV4= -github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520/go.mod h1:NpYc1U0eC7m6xUh3t3Pq565KxaIc08Oaquiu71dEMi8= +github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 h1:/TwjdoLAxL7URxKJGJUeI539w6LUqcwIcj0WCUxDY/c= +github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8/go.mod h1:a5LEa2Vy87wOp0Vu6sLmEIR1V59fqH3QosOSiErAr30= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= github.com/grafana/prometheus-alertmanager v0.25.1-0.20240930132144-b5e64e81e8d3 h1:6D2gGAwyQBElSrp3E+9lSr7k8gLuP3Aiy20rweLWeBw= @@ -1390,8 +1390,8 @@ github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/ionos-cloud/sdk-go/v6 v6.2.1 
h1:mxxN+frNVmbFrmmFfXnBC3g2USYJrl6mc1LW2iNYbFY= -github.com/ionos-cloud/sdk-go/v6 v6.2.1/go.mod h1:SXrO9OGyWjd2rZhAhEpdYN6VUAODzzqRdqA9BCviQtI= +github.com/ionos-cloud/sdk-go/v6 v6.3.0 h1:/lTieTH9Mo/CWm3cTlFLnK10jgxjUGkAqRffGqvPteY= +github.com/ionos-cloud/sdk-go/v6 v6.3.0/go.mod h1:SXrO9OGyWjd2rZhAhEpdYN6VUAODzzqRdqA9BCviQtI= github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc= github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= @@ -1611,8 +1611,8 @@ github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8b github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= -github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPAaSc= -github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw= +github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ= +github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s= github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4= github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI= github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04= diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index f83f7e66746..74233e3a686 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -142,7 +142,11 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { t.observedMtx.RLock() defer t.observedMtx.RUnlock() for key, o := range t.observed { + if key == "" { + continue + } keys 
:= strings.Split(key, string(sep)) + keys = append(keys, t.userID) if o.activeSerie.Load() > 0 { out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index c2732cbbfd6..79fdc8988b5 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -459,12 +459,8 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { - // cost attribution is enabled, if it's not nil, we need to decrement the active series count, otherwise means received error when get idx, - // we need to increment the active series failure count. if s.cat != nil { - if idx == nil { - s.cat.IncrementActiveSeriesFailure() - } else if err := idx.Series(ref, &buf, nil); err != nil { + if err := idx.Series(ref, &buf, nil); err != nil { s.cat.IncrementActiveSeriesFailure() } else { s.cat.DecrementActiveSeries(buf.Labels()) @@ -521,15 +517,11 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.active-- if s.cat != nil { - if idx == nil { + buf := labels.NewScratchBuilder(128) + if err := idx.Series(ref, &buf, nil); err != nil { s.cat.IncrementActiveSeriesFailure() } else { - buf := labels.NewScratchBuilder(128) - if err := idx.Series(ref, &buf, nil); err != nil { - s.cat.IncrementActiveSeriesFailure() - } else { - s.cat.DecrementActiveSeries(buf.Labels()) - } + s.cat.DecrementActiveSeries(buf.Labels()) } } if entry.numNativeHistogramBuckets >= 0 { diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 2291981bd26..00cec46160c 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -792,13 +792,10 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - // 
If the userDB idx is unavailable, pass nil pointer to Purge methode, and record it as a failure in metrics when decrementing active series. - idx, err := userDB.Head().Index() - if err != nil { - level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) - idx = nil - } + idx := userDB.Head().MustIndex() valid := userDB.activeSeries.Purge(now, idx) + idx.Close() + if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. i.metrics.activeSeriesLoading.WithLabelValues(userID).Set(1) @@ -1416,10 +1413,8 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var nonCopiedLabels labels.Labels // idx is used to decrease active series count in case of error for cost attribution. - idx, err := i.getTSDB(userID).Head().Index() - if err != nil { - idx = nil - } + idx := i.getTSDB(userID).Head().MustIndex() + defer idx.Close() for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request @@ -3273,12 +3268,9 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now } // Purge the active series so that the next call to Active() will return the up-to-date count. - idx, err := db.Head().Index() - if err != nil { - level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) - idx = nil - } + idx := db.Head().MustIndex() db.activeSeries.Purge(now, idx) + idx.Close() // Estimate the number of series that would be dropped from the TSDB Head if we would // compact the head up until "now - active series idle timeout". 
diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 61c1aa244ee..2e3d40e0d3d 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,10 +619,8 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 - idx, err := u.Head().Index() - if err != nil { - idx = nil - } + idx := u.Head().MustIndex() + defer idx.Close() u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { diff --git a/vendor/github.com/prometheus/common/config/http_config.go b/vendor/github.com/prometheus/common/config/http_config.go index e6bdd4c035d..57ec252adff 100644 --- a/vendor/github.com/prometheus/common/config/http_config.go +++ b/vendor/github.com/prometheus/common/config/http_config.go @@ -357,33 +357,33 @@ func nonZeroCount[T comparable](values ...T) int { func (c *HTTPClientConfig) Validate() error { // Backwards compatibility with the bearer_token field. if len(c.BearerToken) > 0 && len(c.BearerTokenFile) > 0 { - return fmt.Errorf("at most one of bearer_token & bearer_token_file must be configured") + return errors.New("at most one of bearer_token & bearer_token_file must be configured") } if (c.BasicAuth != nil || c.OAuth2 != nil) && (len(c.BearerToken) > 0 || len(c.BearerTokenFile) > 0) { - return fmt.Errorf("at most one of basic_auth, oauth2, bearer_token & bearer_token_file must be configured") + return errors.New("at most one of basic_auth, oauth2, bearer_token & bearer_token_file must be configured") } if c.BasicAuth != nil && nonZeroCount(string(c.BasicAuth.Username) != "", c.BasicAuth.UsernameFile != "", c.BasicAuth.UsernameRef != "") > 1 { - return fmt.Errorf("at most one of basic_auth username, username_file & username_ref must be configured") + return errors.New("at most one of basic_auth username, username_file & username_ref must be configured") } if c.BasicAuth != nil && nonZeroCount(string(c.BasicAuth.Password) != "", c.BasicAuth.PasswordFile != "", 
c.BasicAuth.PasswordRef != "") > 1 { - return fmt.Errorf("at most one of basic_auth password, password_file & password_ref must be configured") + return errors.New("at most one of basic_auth password, password_file & password_ref must be configured") } if c.Authorization != nil { if len(c.BearerToken) > 0 || len(c.BearerTokenFile) > 0 { - return fmt.Errorf("authorization is not compatible with bearer_token & bearer_token_file") + return errors.New("authorization is not compatible with bearer_token & bearer_token_file") } if nonZeroCount(string(c.Authorization.Credentials) != "", c.Authorization.CredentialsFile != "", c.Authorization.CredentialsRef != "") > 1 { - return fmt.Errorf("at most one of authorization credentials & credentials_file must be configured") + return errors.New("at most one of authorization credentials & credentials_file must be configured") } c.Authorization.Type = strings.TrimSpace(c.Authorization.Type) if len(c.Authorization.Type) == 0 { c.Authorization.Type = "Bearer" } if strings.ToLower(c.Authorization.Type) == "basic" { - return fmt.Errorf(`authorization type cannot be set to "basic", use "basic_auth" instead`) + return errors.New(`authorization type cannot be set to "basic", use "basic_auth" instead`) } if c.BasicAuth != nil || c.OAuth2 != nil { - return fmt.Errorf("at most one of basic_auth, oauth2 & authorization must be configured") + return errors.New("at most one of basic_auth, oauth2 & authorization must be configured") } } else { if len(c.BearerToken) > 0 { @@ -399,16 +399,16 @@ func (c *HTTPClientConfig) Validate() error { } if c.OAuth2 != nil { if c.BasicAuth != nil { - return fmt.Errorf("at most one of basic_auth, oauth2 & authorization must be configured") + return errors.New("at most one of basic_auth, oauth2 & authorization must be configured") } if len(c.OAuth2.ClientID) == 0 { - return fmt.Errorf("oauth2 client_id must be configured") + return errors.New("oauth2 client_id must be configured") } if len(c.OAuth2.TokenURL) == 
0 { - return fmt.Errorf("oauth2 token_url must be configured") + return errors.New("oauth2 token_url must be configured") } if nonZeroCount(len(c.OAuth2.ClientSecret) > 0, len(c.OAuth2.ClientSecretFile) > 0, len(c.OAuth2.ClientSecretRef) > 0) > 1 { - return fmt.Errorf("at most one of oauth2 client_secret, client_secret_file & client_secret_ref must be configured") + return errors.New("at most one of oauth2 client_secret, client_secret_file & client_secret_ref must be configured") } } if err := c.ProxyConfig.Validate(); err != nil { @@ -735,7 +735,7 @@ func (s *FileSecret) Fetch(ctx context.Context) (string, error) { } func (s *FileSecret) Description() string { - return fmt.Sprintf("file %s", s.file) + return "file " + s.file } func (s *FileSecret) Immutable() bool { @@ -753,7 +753,7 @@ func (s *refSecret) Fetch(ctx context.Context) (string, error) { } func (s *refSecret) Description() string { - return fmt.Sprintf("ref %s", s.ref) + return "ref " + s.ref } func (s *refSecret) Immutable() bool { @@ -1045,7 +1045,7 @@ func NewTLSConfigWithContext(ctx context.Context, cfg *TLSConfig, optFuncs ...TL if cfg.MaxVersion != 0 && cfg.MinVersion != 0 { if cfg.MaxVersion < cfg.MinVersion { - return nil, fmt.Errorf("tls_config.max_version must be greater than or equal to tls_config.min_version if both are specified") + return nil, errors.New("tls_config.max_version must be greater than or equal to tls_config.min_version if both are specified") } } @@ -1144,19 +1144,19 @@ func (c *TLSConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { // used. 
func (c *TLSConfig) Validate() error { if nonZeroCount(len(c.CA) > 0, len(c.CAFile) > 0, len(c.CARef) > 0) > 1 { - return fmt.Errorf("at most one of ca, ca_file & ca_ref must be configured") + return errors.New("at most one of ca, ca_file & ca_ref must be configured") } if nonZeroCount(len(c.Cert) > 0, len(c.CertFile) > 0, len(c.CertRef) > 0) > 1 { - return fmt.Errorf("at most one of cert, cert_file & cert_ref must be configured") + return errors.New("at most one of cert, cert_file & cert_ref must be configured") } if nonZeroCount(len(c.Key) > 0, len(c.KeyFile) > 0, len(c.KeyRef) > 0) > 1 { - return fmt.Errorf("at most one of key and key_file must be configured") + return errors.New("at most one of key and key_file must be configured") } if c.usingClientCert() && !c.usingClientKey() { - return fmt.Errorf("exactly one of key or key_file must be configured when a client certificate is configured") + return errors.New("exactly one of key or key_file must be configured when a client certificate is configured") } else if c.usingClientKey() && !c.usingClientCert() { - return fmt.Errorf("exactly one of cert or cert_file must be configured when a client key is configured") + return errors.New("exactly one of cert or cert_file must be configured when a client key is configured") } return nil @@ -1460,16 +1460,16 @@ type ProxyConfig struct { // UnmarshalYAML implements the yaml.Unmarshaler interface. 
func (c *ProxyConfig) Validate() error { if len(c.ProxyConnectHeader) > 0 && (!c.ProxyFromEnvironment && (c.ProxyURL.URL == nil || c.ProxyURL.String() == "")) { - return fmt.Errorf("if proxy_connect_header is configured, proxy_url or proxy_from_environment must also be configured") + return errors.New("if proxy_connect_header is configured, proxy_url or proxy_from_environment must also be configured") } if c.ProxyFromEnvironment && c.ProxyURL.URL != nil && c.ProxyURL.String() != "" { - return fmt.Errorf("if proxy_from_environment is configured, proxy_url must not be configured") + return errors.New("if proxy_from_environment is configured, proxy_url must not be configured") } if c.ProxyFromEnvironment && c.NoProxy != "" { - return fmt.Errorf("if proxy_from_environment is configured, no_proxy must not be configured") + return errors.New("if proxy_from_environment is configured, no_proxy must not be configured") } if c.ProxyURL.URL == nil && c.NoProxy != "" { - return fmt.Errorf("if no_proxy is configured, proxy_url must also be configured") + return errors.New("if no_proxy is configured, proxy_url must also be configured") } return nil } diff --git a/vendor/github.com/prometheus/common/expfmt/encode.go b/vendor/github.com/prometheus/common/expfmt/encode.go index cf0c150c2e1..d7f3d76f55d 100644 --- a/vendor/github.com/prometheus/common/expfmt/encode.go +++ b/vendor/github.com/prometheus/common/expfmt/encode.go @@ -68,7 +68,7 @@ func Negotiate(h http.Header) Format { if escapeParam := ac.Params[model.EscapingKey]; escapeParam != "" { switch Format(escapeParam) { case model.AllowUTF8, model.EscapeUnderscores, model.EscapeDots, model.EscapeValues: - escapingScheme = Format(fmt.Sprintf("; escaping=%s", escapeParam)) + escapingScheme = Format("; escaping=" + escapeParam) default: // If the escaping parameter is unknown, ignore it. 
} @@ -101,7 +101,7 @@ func NegotiateIncludingOpenMetrics(h http.Header) Format { if escapeParam := ac.Params[model.EscapingKey]; escapeParam != "" { switch Format(escapeParam) { case model.AllowUTF8, model.EscapeUnderscores, model.EscapeDots, model.EscapeValues: - escapingScheme = Format(fmt.Sprintf("; escaping=%s", escapeParam)) + escapingScheme = Format("; escaping=" + escapeParam) default: // If the escaping parameter is unknown, ignore it. } diff --git a/vendor/github.com/prometheus/common/expfmt/expfmt.go b/vendor/github.com/prometheus/common/expfmt/expfmt.go index d942af8edd4..b26886560d7 100644 --- a/vendor/github.com/prometheus/common/expfmt/expfmt.go +++ b/vendor/github.com/prometheus/common/expfmt/expfmt.go @@ -15,7 +15,7 @@ package expfmt import ( - "fmt" + "errors" "strings" "github.com/prometheus/common/model" @@ -109,7 +109,7 @@ func NewOpenMetricsFormat(version string) (Format, error) { if version == OpenMetricsVersion_1_0_0 { return FmtOpenMetrics_1_0_0, nil } - return FmtUnknown, fmt.Errorf("unknown open metrics version string") + return FmtUnknown, errors.New("unknown open metrics version string") } // WithEscapingScheme returns a copy of Format with the specified escaping diff --git a/vendor/github.com/prometheus/common/expfmt/openmetrics_create.go b/vendor/github.com/prometheus/common/expfmt/openmetrics_create.go index 11c8ff4b9db..f1c495dd606 100644 --- a/vendor/github.com/prometheus/common/expfmt/openmetrics_create.go +++ b/vendor/github.com/prometheus/common/expfmt/openmetrics_create.go @@ -152,8 +152,8 @@ func MetricFamilyToOpenMetrics(out io.Writer, in *dto.MetricFamily, options ...E if metricType == dto.MetricType_COUNTER && strings.HasSuffix(compliantName, "_total") { compliantName = name[:len(name)-6] } - if toOM.withUnit && in.Unit != nil && !strings.HasSuffix(compliantName, fmt.Sprintf("_%s", *in.Unit)) { - compliantName = compliantName + fmt.Sprintf("_%s", *in.Unit) + if toOM.withUnit && in.Unit != nil && 
!strings.HasSuffix(compliantName, "_"+*in.Unit) { + compliantName = compliantName + "_" + *in.Unit } // Comments, first HELP, then TYPE. diff --git a/vendor/github.com/prometheus/common/expfmt/text_parse.go b/vendor/github.com/prometheus/common/expfmt/text_parse.go index f085a923f6c..b4607fe4d27 100644 --- a/vendor/github.com/prometheus/common/expfmt/text_parse.go +++ b/vendor/github.com/prometheus/common/expfmt/text_parse.go @@ -895,7 +895,7 @@ func histogramMetricName(name string) string { func parseFloat(s string) (float64, error) { if strings.ContainsAny(s, "pP_") { - return 0, fmt.Errorf("unsupported character in float") + return 0, errors.New("unsupported character in float") } return strconv.ParseFloat(s, 64) } diff --git a/vendor/github.com/prometheus/common/model/alert.go b/vendor/github.com/prometheus/common/model/alert.go index 80d1fe944ea..bd3a39e3e14 100644 --- a/vendor/github.com/prometheus/common/model/alert.go +++ b/vendor/github.com/prometheus/common/model/alert.go @@ -14,6 +14,7 @@ package model import ( + "errors" "fmt" "time" ) @@ -89,16 +90,16 @@ func (a *Alert) StatusAt(ts time.Time) AlertStatus { // Validate checks whether the alert data is inconsistent. 
func (a *Alert) Validate() error { if a.StartsAt.IsZero() { - return fmt.Errorf("start time missing") + return errors.New("start time missing") } if !a.EndsAt.IsZero() && a.EndsAt.Before(a.StartsAt) { - return fmt.Errorf("start time must be before end time") + return errors.New("start time must be before end time") } if err := a.Labels.Validate(); err != nil { return fmt.Errorf("invalid label set: %w", err) } if len(a.Labels) == 0 { - return fmt.Errorf("at least one label pair required") + return errors.New("at least one label pair required") } if err := a.Annotations.Validate(); err != nil { return fmt.Errorf("invalid annotations: %w", err) diff --git a/vendor/github.com/prometheus/common/model/metric.go b/vendor/github.com/prometheus/common/model/metric.go index f50966bc494..0daca836afa 100644 --- a/vendor/github.com/prometheus/common/model/metric.go +++ b/vendor/github.com/prometheus/common/model/metric.go @@ -14,9 +14,11 @@ package model import ( + "errors" "fmt" "regexp" "sort" + "strconv" "strings" "unicode/utf8" @@ -269,10 +271,6 @@ func metricNeedsEscaping(m *dto.Metric) bool { return false } -const ( - lowerhex = "0123456789abcdef" -) - // EscapeName escapes the incoming name according to the provided escaping // scheme. Depending on the rules of escaping, this may cause no change in the // string that is returned. 
(Especially NoEscaping, which by definition is a @@ -307,7 +305,7 @@ func EscapeName(name string, scheme EscapingScheme) string { } else if isValidLegacyRune(b, i) { escaped.WriteRune(b) } else { - escaped.WriteRune('_') + escaped.WriteString("__") } } return escaped.String() @@ -317,21 +315,15 @@ func EscapeName(name string, scheme EscapingScheme) string { } escaped.WriteString("U__") for i, b := range name { - if isValidLegacyRune(b, i) { + if b == '_' { + escaped.WriteString("__") + } else if isValidLegacyRune(b, i) { escaped.WriteRune(b) } else if !utf8.ValidRune(b) { escaped.WriteString("_FFFD_") - } else if b < 0x100 { - escaped.WriteRune('_') - for s := 4; s >= 0; s -= 4 { - escaped.WriteByte(lowerhex[b>>uint(s)&0xF]) - } - escaped.WriteRune('_') - } else if b < 0x10000 { + } else { escaped.WriteRune('_') - for s := 12; s >= 0; s -= 4 { - escaped.WriteByte(lowerhex[b>>uint(s)&0xF]) - } + escaped.WriteString(strconv.FormatInt(int64(b), 16)) escaped.WriteRune('_') } } @@ -389,8 +381,9 @@ func UnescapeName(name string, scheme EscapingScheme) string { // We think we are in a UTF-8 code, process it. var utf8Val uint for j := 0; i < len(escapedName); j++ { - // This is too many characters for a utf8 value. - if j > 4 { + // This is too many characters for a utf8 value based on the MaxRune + // value of '\U0010FFFF'. + if j >= 6 { return name } // Found a closing underscore, convert to a rune, check validity, and append. 
@@ -443,7 +436,7 @@ func (e EscapingScheme) String() string { func ToEscapingScheme(s string) (EscapingScheme, error) { if s == "" { - return NoEscaping, fmt.Errorf("got empty string instead of escaping scheme") + return NoEscaping, errors.New("got empty string instead of escaping scheme") } switch s { case AllowUTF8: diff --git a/vendor/github.com/prometheus/common/model/silence.go b/vendor/github.com/prometheus/common/model/silence.go index 910b0b71fcc..8f91a9702e0 100644 --- a/vendor/github.com/prometheus/common/model/silence.go +++ b/vendor/github.com/prometheus/common/model/silence.go @@ -15,6 +15,7 @@ package model import ( "encoding/json" + "errors" "fmt" "regexp" "time" @@ -34,7 +35,7 @@ func (m *Matcher) UnmarshalJSON(b []byte) error { } if len(m.Name) == 0 { - return fmt.Errorf("label name in matcher must not be empty") + return errors.New("label name in matcher must not be empty") } if m.IsRegex { if _, err := regexp.Compile(m.Value); err != nil { @@ -77,7 +78,7 @@ type Silence struct { // Validate returns true iff all fields of the silence have valid values. 
func (s *Silence) Validate() error { if len(s.Matchers) == 0 { - return fmt.Errorf("at least one matcher required") + return errors.New("at least one matcher required") } for _, m := range s.Matchers { if err := m.Validate(); err != nil { @@ -85,22 +86,22 @@ func (s *Silence) Validate() error { } } if s.StartsAt.IsZero() { - return fmt.Errorf("start time missing") + return errors.New("start time missing") } if s.EndsAt.IsZero() { - return fmt.Errorf("end time missing") + return errors.New("end time missing") } if s.EndsAt.Before(s.StartsAt) { - return fmt.Errorf("start time must be before end time") + return errors.New("start time must be before end time") } if s.CreatedBy == "" { - return fmt.Errorf("creator information missing") + return errors.New("creator information missing") } if s.Comment == "" { - return fmt.Errorf("comment missing") + return errors.New("comment missing") } if s.CreatedAt.IsZero() { - return fmt.Errorf("creation timestamp missing") + return errors.New("creation timestamp missing") } return nil } diff --git a/vendor/github.com/prometheus/common/model/value_float.go b/vendor/github.com/prometheus/common/model/value_float.go index ae35cc2ab4b..6bfc757d18b 100644 --- a/vendor/github.com/prometheus/common/model/value_float.go +++ b/vendor/github.com/prometheus/common/model/value_float.go @@ -15,6 +15,7 @@ package model import ( "encoding/json" + "errors" "fmt" "math" "strconv" @@ -39,7 +40,7 @@ func (v SampleValue) MarshalJSON() ([]byte, error) { // UnmarshalJSON implements json.Unmarshaler. 
func (v *SampleValue) UnmarshalJSON(b []byte) error { if len(b) < 2 || b[0] != '"' || b[len(b)-1] != '"' { - return fmt.Errorf("sample value must be a quoted string") + return errors.New("sample value must be a quoted string") } f, err := strconv.ParseFloat(string(b[1:len(b)-1]), 64) if err != nil { diff --git a/vendor/github.com/prometheus/common/model/value_histogram.go b/vendor/github.com/prometheus/common/model/value_histogram.go index 54bb038cfff..895e6a3e839 100644 --- a/vendor/github.com/prometheus/common/model/value_histogram.go +++ b/vendor/github.com/prometheus/common/model/value_histogram.go @@ -15,6 +15,7 @@ package model import ( "encoding/json" + "errors" "fmt" "strconv" "strings" @@ -32,7 +33,7 @@ func (v FloatString) MarshalJSON() ([]byte, error) { func (v *FloatString) UnmarshalJSON(b []byte) error { if len(b) < 2 || b[0] != '"' || b[len(b)-1] != '"' { - return fmt.Errorf("float value must be a quoted string") + return errors.New("float value must be a quoted string") } f, err := strconv.ParseFloat(string(b[1:len(b)-1]), 64) if err != nil { @@ -141,7 +142,7 @@ type SampleHistogramPair struct { func (s SampleHistogramPair) MarshalJSON() ([]byte, error) { if s.Histogram == nil { - return nil, fmt.Errorf("histogram is nil") + return nil, errors.New("histogram is nil") } t, err := json.Marshal(s.Timestamp) if err != nil { @@ -164,7 +165,7 @@ func (s *SampleHistogramPair) UnmarshalJSON(buf []byte) error { return fmt.Errorf("wrong number of fields: %d != %d", gotLen, wantLen) } if s.Histogram == nil { - return fmt.Errorf("histogram is null") + return errors.New("histogram is null") } return nil } diff --git a/vendor/github.com/prometheus/common/promslog/slog.go b/vendor/github.com/prometheus/common/promslog/slog.go index 1677605af1e..6e8fbabce5d 100644 --- a/vendor/github.com/prometheus/common/promslog/slog.go +++ b/vendor/github.com/prometheus/common/promslog/slog.go @@ -68,13 +68,16 @@ var ( return a } - truncateSourceAttrFunc = func(groups []string, a 
slog.Attr) slog.Attr { - if a.Key != slog.SourceKey { - return a - } - - if src, ok := a.Value.Any().(*slog.Source); ok { + defaultReplaceAttrFunc = func(groups []string, a slog.Attr) slog.Attr { + key := a.Key + switch key { + case slog.TimeKey: + t := a.Value.Time() + a.Value = slog.TimeValue(t.UTC()) + case slog.SourceKey: + src, _ := a.Value.Any().(*slog.Source) a.Value = slog.StringValue(filepath.Base(src.File) + ":" + strconv.Itoa(src.Line)) + default: } return a @@ -115,7 +118,7 @@ func (l *AllowedLevel) Set(s string) error { l.lvl = &slog.LevelVar{} } - switch s { + switch strings.ToLower(s) { case "debug": l.lvl.Set(slog.LevelDebug) callerAddFunc = true @@ -178,7 +181,7 @@ func New(config *Config) *slog.Logger { logHandlerOpts := &slog.HandlerOptions{ Level: config.Level.lvl, AddSource: true, - ReplaceAttr: truncateSourceAttrFunc, + ReplaceAttr: defaultReplaceAttrFunc, } if config.Style == GoKitStyle { diff --git a/vendor/github.com/prometheus/common/version/info.go b/vendor/github.com/prometheus/common/version/info.go index 197d95e5c8b..61ed1ba314b 100644 --- a/vendor/github.com/prometheus/common/version/info.go +++ b/vendor/github.com/prometheus/common/version/info.go @@ -90,6 +90,14 @@ func GetTags() string { return computedTags } +func PrometheusUserAgent() string { + return ComponentUserAgent("Prometheus") +} + +func ComponentUserAgent(component string) string { + return component + "/" + Version +} + func init() { computedRevision, computedTags = computeRevision() } diff --git a/vendor/github.com/prometheus/prometheus/config/config.go b/vendor/github.com/prometheus/prometheus/config/config.go index 86d8563536a..73282ac4295 100644 --- a/vendor/github.com/prometheus/prometheus/config/config.go +++ b/vendor/github.com/prometheus/prometheus/config/config.go @@ -117,11 +117,12 @@ func Load(s string, logger *slog.Logger) (*Config, error) { default: return nil, fmt.Errorf("unsupported OTLP translation strategy %q", cfg.OTLPConfig.TranslationStrategy) } - + 
cfg.loaded = true return cfg, nil } -// LoadFile parses the given YAML file into a Config. +// LoadFile parses and validates the given YAML file into a read-only Config. +// Callers should never write to or shallow copy the returned Config. func LoadFile(filename string, agentMode bool, logger *slog.Logger) (*Config, error) { content, err := os.ReadFile(filename) if err != nil { @@ -270,9 +271,12 @@ type Config struct { RemoteWriteConfigs []*RemoteWriteConfig `yaml:"remote_write,omitempty"` RemoteReadConfigs []*RemoteReadConfig `yaml:"remote_read,omitempty"` OTLPConfig OTLPConfig `yaml:"otlp,omitempty"` + + loaded bool // Certain methods require configuration to use Load validation. } // SetDirectory joins any relative file paths with dir. +// This method writes to config, and it's not concurrency safe. func (c *Config) SetDirectory(dir string) { c.GlobalConfig.SetDirectory(dir) c.AlertingConfig.SetDirectory(dir) @@ -302,24 +306,26 @@ func (c Config) String() string { return string(b) } -// GetScrapeConfigs returns the scrape configurations. +// GetScrapeConfigs returns the read-only, validated scrape configurations including +// the ones from the scrape_config_files. +// This method does not write to config, and it's concurrency safe (the pointer receiver is for efficiency). +// This method also assumes the Config was created by Load or LoadFile function, it returns error +// if it was not. We can't re-validate or apply globals here due to races, +// read more https://github.com/prometheus/prometheus/issues/15538. func (c *Config) GetScrapeConfigs() ([]*ScrapeConfig, error) { - scfgs := make([]*ScrapeConfig, len(c.ScrapeConfigs)) + if !c.loaded { + // Programmatic error, we warn before more confusing errors would happen due to lack of the globalization. 
+ return nil, errors.New("scrape config cannot be fetched, main config was not validated and loaded correctly; should not happen") + } + scfgs := make([]*ScrapeConfig, len(c.ScrapeConfigs)) jobNames := map[string]string{} for i, scfg := range c.ScrapeConfigs { - // We do these checks for library users that would not call validate in - // Unmarshal. - if err := scfg.Validate(c.GlobalConfig); err != nil { - return nil, err - } - - if _, ok := jobNames[scfg.JobName]; ok { - return nil, fmt.Errorf("found multiple scrape configs with job name %q", scfg.JobName) - } jobNames[scfg.JobName] = "main config file" scfgs[i] = scfg } + + // Re-read and validate the dynamic scrape config rules. for _, pat := range c.ScrapeConfigFiles { fs, err := filepath.Glob(pat) if err != nil { @@ -355,6 +361,7 @@ func (c *Config) GetScrapeConfigs() ([]*ScrapeConfig, error) { } // UnmarshalYAML implements the yaml.Unmarshaler interface. +// NOTE: This method should not be used outside of this package. Use Load or LoadFile instead. func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error { *c = DefaultConfig // We want to set c to the defaults and then overwrite it with the input. @@ -391,18 +398,18 @@ func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error { } } - // Do global overrides and validate unique names. + // Do global overrides and validation. jobNames := map[string]struct{}{} for _, scfg := range c.ScrapeConfigs { if err := scfg.Validate(c.GlobalConfig); err != nil { return err } - if _, ok := jobNames[scfg.JobName]; ok { return fmt.Errorf("found multiple scrape configs with job name %q", scfg.JobName) } jobNames[scfg.JobName] = struct{}{} } + rwNames := map[string]struct{}{} for _, rwcfg := range c.RemoteWriteConfigs { if rwcfg == nil { @@ -1420,10 +1427,13 @@ func getGoGCEnv() int { type translationStrategyOption string var ( - // NoUTF8EscapingWithSuffixes will keep UTF-8 characters as they are, units and type suffixes will still be added. 
+ // NoUTF8EscapingWithSuffixes will accept metric/label names as they are. + // Unit and type suffixes may be added to metric names, according to certain rules. NoUTF8EscapingWithSuffixes translationStrategyOption = "NoUTF8EscapingWithSuffixes" // UnderscoreEscapingWithSuffixes is the default option for translating OTLP to Prometheus. - // This option will translate all UTF-8 characters to underscores, while adding units and type suffixes. + // This option will translate metric name characters that are not alphanumerics/underscores/colons to underscores, + // and label name characters that are not alphanumerics/underscores to underscores. + // Unit and type suffixes may be appended to metric names, according to certain rules. UnderscoreEscapingWithSuffixes translationStrategyOption = "UnderscoreEscapingWithSuffixes" ) diff --git a/vendor/github.com/prometheus/prometheus/model/exemplar/exemplar.go b/vendor/github.com/prometheus/prometheus/model/exemplar/exemplar.go index 2c28b172571..d03940f1b29 100644 --- a/vendor/github.com/prometheus/prometheus/model/exemplar/exemplar.go +++ b/vendor/github.com/prometheus/prometheus/model/exemplar/exemplar.go @@ -18,7 +18,7 @@ import "github.com/prometheus/prometheus/model/labels" // ExemplarMaxLabelSetLength is defined by OpenMetrics: "The combined length of // the label names and values of an Exemplar's LabelSet MUST NOT exceed 128 // UTF-8 characters." -// https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars +// https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#exemplars const ExemplarMaxLabelSetLength = 128 // Exemplar is additional information associated with a time series. 
diff --git a/vendor/github.com/prometheus/prometheus/model/labels/labels.go b/vendor/github.com/prometheus/prometheus/model/labels/labels.go index f4de7496ce7..0747ab90d92 100644 --- a/vendor/github.com/prometheus/prometheus/model/labels/labels.go +++ b/vendor/github.com/prometheus/prometheus/model/labels/labels.go @@ -19,6 +19,7 @@ import ( "bytes" "slices" "strings" + "unsafe" "github.com/cespare/xxhash/v2" ) @@ -488,3 +489,8 @@ func (b *ScratchBuilder) Labels() Labels { func (b *ScratchBuilder) Overwrite(ls *Labels) { *ls = append((*ls)[:0], b.add...) } + +// SizeOfLabels returns the approximate space required for n copies of a label. +func SizeOfLabels(name, value string, n uint64) uint64 { + return (uint64(len(name)) + uint64(unsafe.Sizeof(name)) + uint64(len(value)) + uint64(unsafe.Sizeof(value))) * n +} diff --git a/vendor/github.com/prometheus/prometheus/model/labels/labels_common.go b/vendor/github.com/prometheus/prometheus/model/labels/labels_common.go index 99529a38367..a232eeea5d3 100644 --- a/vendor/github.com/prometheus/prometheus/model/labels/labels_common.go +++ b/vendor/github.com/prometheus/prometheus/model/labels/labels_common.go @@ -51,7 +51,11 @@ func (ls Labels) String() string { b.WriteByte(',') b.WriteByte(' ') } - b.WriteString(l.Name) + if !model.LabelName(l.Name).IsValidLegacy() { + b.Write(strconv.AppendQuote(b.AvailableBuffer(), l.Name)) + } else { + b.WriteString(l.Name) + } b.WriteByte('=') b.Write(strconv.AppendQuote(b.AvailableBuffer(), l.Value)) i++ diff --git a/vendor/github.com/prometheus/prometheus/model/labels/labels_dedupelabels.go b/vendor/github.com/prometheus/prometheus/model/labels/labels_dedupelabels.go index da8a88cc158..a0d83e00447 100644 --- a/vendor/github.com/prometheus/prometheus/model/labels/labels_dedupelabels.go +++ b/vendor/github.com/prometheus/prometheus/model/labels/labels_dedupelabels.go @@ -815,3 +815,8 @@ func (b *ScratchBuilder) Overwrite(ls *Labels) { ls.syms = b.syms.nameTable ls.data = 
yoloString(b.overwriteBuffer) } + +// SizeOfLabels returns the approximate space required for n copies of a label. +func SizeOfLabels(name, value string, n uint64) uint64 { + return uint64(len(name)+len(value)) + n*4 // Assuming most symbol-table entries are 2 bytes long. +} diff --git a/vendor/github.com/prometheus/prometheus/model/labels/labels_stringlabels.go b/vendor/github.com/prometheus/prometheus/model/labels/labels_stringlabels.go index c64bb990e02..f49ed96f650 100644 --- a/vendor/github.com/prometheus/prometheus/model/labels/labels_stringlabels.go +++ b/vendor/github.com/prometheus/prometheus/model/labels/labels_stringlabels.go @@ -691,3 +691,8 @@ func NewScratchBuilderWithSymbolTable(_ *SymbolTable, n int) ScratchBuilder { func (b *ScratchBuilder) SetSymbolTable(_ *SymbolTable) { // no-op } + +// SizeOfLabels returns the approximate space required for n copies of a label. +func SizeOfLabels(name, value string, n uint64) uint64 { + return uint64(labelSize(&Label{Name: name, Value: value})) * n +} diff --git a/vendor/github.com/prometheus/prometheus/model/textparse/openmetricsparse.go b/vendor/github.com/prometheus/prometheus/model/textparse/openmetricsparse.go index 16e805f3a93..f0dd51afeed 100644 --- a/vendor/github.com/prometheus/prometheus/model/textparse/openmetricsparse.go +++ b/vendor/github.com/prometheus/prometheus/model/textparse/openmetricsparse.go @@ -337,7 +337,7 @@ func (p *OpenMetricsParser) CreatedTimestamp() *int64 { } // All timestamps in OpenMetrics are Unix Epoch in seconds. Convert to milliseconds. 
- // https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#timestamps + // https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#timestamps ct := int64(p.val * 1000.0) p.setCTParseValues(ct, currHash, currName, true) return &ct diff --git a/vendor/github.com/prometheus/prometheus/promql/functions.go b/vendor/github.com/prometheus/prometheus/promql/functions.go index 016e676d316..da1821fd18a 100644 --- a/vendor/github.com/prometheus/prometheus/promql/functions.go +++ b/vendor/github.com/prometheus/prometheus/promql/functions.go @@ -345,11 +345,14 @@ func calcTrendValue(i int, tf, s0, s1, b float64) float64 { return x + y } -// Holt-Winters is similar to a weighted moving average, where historical data has exponentially less influence on the current data. -// Holt-Winter also accounts for trends in data. The smoothing factor (0 < sf < 1) affects how historical data will affect the current -// data. A lower smoothing factor increases the influence of historical data. The trend factor (0 < tf < 1) affects -// how trends in historical data will affect the current data. A higher trend factor increases the influence. -// of trends. Algorithm taken from https://en.wikipedia.org/wiki/Exponential_smoothing titled: "Double exponential smoothing". +// Double exponential smoothing is similar to a weighted moving average, where +// historical data has exponentially less influence on the current data. It also +// accounts for trends in data. The smoothing factor (0 < sf < 1) affects how +// historical data will affect the current data. A lower smoothing factor +// increases the influence of historical data. The trend factor (0 < tf < 1) +// affects how trends in historical data will affect the current data. A higher +// trend factor increases the influence. of trends. Algorithm taken from +// https://en.wikipedia.org/wiki/Exponential_smoothing . 
func funcDoubleExponentialSmoothing(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper) (Vector, annotations.Annotations) { samples := vals[0].(Matrix)[0] diff --git a/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y b/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y index c321a1e9735..3865dc6548d 100644 --- a/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y +++ b/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y @@ -669,14 +669,14 @@ label_set_item : IDENTIFIER EQL STRING { $$ = labels.Label{Name: $1.Val, Value: yylex.(*parser).unquoteString($3.Val) } } | string_identifier EQL STRING { $$ = labels.Label{Name: $1.Val, Value: yylex.(*parser).unquoteString($3.Val) } } + | string_identifier + { $$ = labels.Label{Name: labels.MetricName, Value: $1.Val} } | IDENTIFIER EQL error { yylex.(*parser).unexpected("label set", "string"); $$ = labels.Label{}} | string_identifier EQL error { yylex.(*parser).unexpected("label set", "string"); $$ = labels.Label{}} | IDENTIFIER error { yylex.(*parser).unexpected("label set", "\"=\""); $$ = labels.Label{}} - | string_identifier error - { yylex.(*parser).unexpected("label set", "\"=\""); $$ = labels.Label{}} | error { yylex.(*parser).unexpected("label set", "identifier or \"}\""); $$ = labels.Label{} } ; diff --git a/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y.go b/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y.go index 8979410ceb4..7ff8591169b 100644 --- a/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y.go +++ b/vendor/github.com/prometheus/prometheus/promql/parser/generated_parser.y.go @@ -339,7 +339,7 @@ var yyExca = [...]int16{ 79, 197, 85, 197, -2, 125, - -1, 205, + -1, 204, 9, 246, 12, 246, 13, 246, @@ -371,7 +371,7 @@ var yyExca = [...]int16{ 88, 246, 89, 246, -2, 0, - -1, 206, + -1, 205, 9, 246, 12, 246, 13, 246, @@ -407,139 +407,139 @@ var 
yyExca = [...]int16{ const yyPrivate = 57344 -const yyLast = 804 +const yyLast = 803 var yyAct = [...]int16{ - 155, 339, 337, 158, 344, 231, 39, 197, 281, 44, - 296, 295, 84, 120, 82, 181, 109, 108, 351, 352, - 353, 354, 107, 111, 203, 136, 204, 159, 154, 112, - 205, 206, 234, 6, 271, 55, 163, 163, 107, 334, - 333, 307, 244, 275, 309, 54, 162, 162, 250, 363, - 91, 272, 330, 131, 362, 233, 60, 270, 276, 110, - 100, 101, 298, 115, 103, 116, 106, 90, 164, 164, - 114, 265, 113, 361, 277, 307, 360, 246, 247, 338, - 103, 248, 106, 153, 165, 165, 264, 316, 201, 261, - 122, 105, 235, 237, 239, 240, 241, 249, 251, 254, - 255, 256, 257, 258, 262, 263, 273, 105, 236, 238, - 242, 243, 245, 252, 253, 152, 117, 166, 259, 260, - 176, 164, 170, 173, 163, 168, 223, 169, 172, 2, - 3, 4, 5, 107, 162, 199, 111, 165, 187, 202, - 189, 171, 112, 269, 207, 208, 209, 210, 211, 212, - 213, 214, 215, 216, 217, 218, 219, 220, 221, 200, - 89, 91, 113, 222, 123, 193, 268, 329, 224, 225, - 183, 100, 101, 191, 121, 103, 104, 106, 90, 7, - 85, 234, 266, 182, 55, 183, 328, 86, 192, 123, - 83, 244, 122, 267, 54, 132, 190, 250, 188, 121, - 345, 230, 105, 86, 233, 77, 35, 119, 304, 10, - 185, 327, 86, 303, 293, 294, 157, 315, 297, 79, - 184, 186, 326, 163, 274, 185, 246, 247, 302, 325, - 248, 324, 314, 162, 323, 184, 186, 299, 261, 313, - 322, 235, 237, 239, 240, 241, 249, 251, 254, 255, - 256, 257, 258, 262, 263, 164, 321, 236, 238, 242, - 243, 245, 252, 253, 180, 126, 320, 259, 260, 179, - 125, 165, 305, 319, 306, 308, 318, 310, 317, 130, - 88, 129, 178, 124, 311, 312, 137, 138, 139, 140, + 154, 338, 336, 157, 343, 230, 39, 196, 280, 44, + 295, 294, 84, 120, 82, 233, 180, 109, 108, 350, + 351, 352, 353, 110, 111, 243, 202, 158, 203, 135, + 112, 249, 361, 6, 333, 329, 113, 332, 232, 204, + 205, 308, 271, 60, 130, 270, 297, 268, 162, 315, + 156, 360, 153, 306, 359, 344, 200, 162, 161, 55, + 245, 246, 222, 115, 247, 116, 107, 161, 269, 54, + 267, 114, 260, 306, 182, 234, 236, 238, 239, 240, + 248, 250, 
253, 254, 255, 256, 257, 261, 262, 163, + 122, 235, 237, 241, 242, 244, 251, 252, 192, 328, + 111, 258, 259, 117, 190, 164, 112, 152, 103, 55, + 106, 337, 77, 113, 184, 151, 35, 165, 327, 54, + 175, 191, 169, 172, 183, 185, 167, 189, 168, 2, + 3, 4, 5, 107, 198, 105, 159, 160, 201, 186, + 188, 7, 326, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 218, 219, 220, 199, 194, + 89, 91, 221, 162, 264, 325, 197, 223, 224, 171, + 200, 100, 101, 161, 162, 103, 104, 106, 90, 263, + 233, 324, 170, 162, 161, 323, 362, 322, 321, 274, + 243, 122, 266, 161, 131, 163, 249, 272, 123, 320, + 229, 319, 105, 232, 275, 318, 163, 317, 121, 85, + 316, 164, 163, 292, 293, 163, 265, 296, 129, 83, + 276, 86, 164, 273, 10, 245, 246, 187, 164, 247, + 88, 164, 86, 50, 79, 36, 298, 260, 1, 78, + 234, 236, 238, 239, 240, 248, 250, 253, 254, 255, + 256, 257, 261, 262, 123, 49, 235, 237, 241, 242, + 244, 251, 252, 181, 121, 182, 258, 259, 128, 48, + 127, 304, 119, 305, 307, 59, 309, 86, 9, 9, + 47, 46, 134, 310, 311, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, - 151, 195, 160, 161, 50, 163, 36, 167, 198, 331, - 78, 332, 201, 228, 55, 162, 85, 227, 1, 340, - 341, 342, 336, 49, 54, 343, 83, 347, 346, 349, - 348, 48, 226, 47, 81, 355, 356, 164, 55, 86, - 357, 53, 77, 301, 56, 8, 359, 22, 54, 37, - 55, 175, 46, 165, 57, 128, 135, 127, 45, 43, - 54, 364, 300, 59, 133, 174, 9, 9, 42, 134, - 75, 41, 40, 51, 196, 358, 18, 19, 278, 87, - 20, 194, 229, 80, 350, 156, 76, 58, 232, 52, - 118, 61, 62, 63, 64, 65, 66, 67, 68, 69, - 70, 71, 72, 73, 74, 0, 0, 0, 13, 0, - 0, 0, 24, 0, 30, 0, 0, 31, 32, 55, - 38, 0, 53, 77, 0, 56, 280, 0, 22, 54, - 0, 0, 0, 279, 0, 57, 0, 283, 284, 282, - 289, 291, 288, 290, 285, 286, 287, 292, 0, 0, - 0, 75, 0, 0, 0, 0, 0, 18, 19, 0, - 0, 20, 0, 0, 0, 0, 0, 76, 0, 0, - 0, 0, 61, 62, 63, 64, 65, 66, 67, 68, - 69, 70, 71, 72, 73, 74, 0, 0, 0, 13, - 0, 0, 0, 24, 0, 30, 0, 55, 31, 32, - 53, 77, 0, 56, 335, 0, 22, 54, 0, 0, - 0, 0, 0, 57, 
0, 283, 284, 282, 289, 291, - 288, 290, 285, 286, 287, 292, 0, 0, 0, 75, - 0, 0, 0, 0, 0, 18, 19, 0, 0, 20, - 0, 0, 0, 17, 77, 76, 0, 0, 0, 22, + 45, 43, 132, 173, 179, 184, 166, 85, 330, 178, + 331, 42, 133, 55, 41, 183, 185, 83, 339, 340, + 341, 335, 177, 54, 342, 81, 346, 345, 348, 347, + 86, 303, 40, 314, 354, 355, 302, 55, 51, 356, + 53, 77, 300, 56, 195, 358, 22, 54, 313, 55, + 174, 301, 227, 57, 8, 312, 226, 357, 37, 54, + 363, 299, 126, 277, 87, 193, 228, 125, 80, 75, + 349, 225, 155, 58, 231, 18, 19, 52, 118, 20, + 124, 0, 0, 0, 0, 76, 0, 0, 0, 0, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 0, 0, 0, 13, 0, 0, - 0, 24, 0, 30, 0, 0, 31, 32, 18, 19, - 0, 0, 20, 0, 0, 0, 17, 35, 0, 0, - 0, 0, 22, 11, 12, 14, 15, 16, 21, 23, - 25, 26, 27, 28, 29, 33, 34, 0, 0, 0, - 13, 0, 0, 0, 24, 0, 30, 0, 0, 31, - 32, 18, 19, 0, 0, 20, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 11, 12, 14, 15, - 16, 21, 23, 25, 26, 27, 28, 29, 33, 34, - 107, 0, 0, 13, 0, 0, 0, 24, 177, 30, - 0, 0, 31, 32, 0, 0, 0, 0, 0, 107, - 0, 0, 0, 0, 0, 0, 0, 89, 91, 92, - 0, 93, 94, 95, 96, 97, 98, 99, 100, 101, - 102, 0, 103, 104, 106, 90, 89, 91, 92, 0, + 0, 24, 0, 30, 0, 0, 31, 32, 55, 38, + 107, 53, 77, 0, 56, 279, 0, 22, 54, 0, + 0, 0, 278, 0, 57, 0, 282, 283, 281, 288, + 290, 287, 289, 284, 285, 286, 291, 0, 91, 0, + 75, 0, 0, 0, 0, 0, 18, 19, 100, 101, + 20, 0, 103, 0, 106, 90, 76, 0, 0, 0, + 0, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 0, 0, 0, 13, 105, + 0, 0, 24, 0, 30, 0, 55, 31, 32, 53, + 77, 0, 56, 334, 0, 22, 54, 0, 0, 0, + 0, 0, 57, 0, 282, 283, 281, 288, 290, 287, + 289, 284, 285, 286, 291, 0, 0, 0, 75, 0, + 0, 0, 0, 0, 18, 19, 0, 0, 20, 0, + 0, 0, 17, 77, 76, 0, 0, 0, 22, 61, + 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 0, 0, 0, 13, 0, 0, 0, + 24, 0, 30, 0, 0, 31, 32, 18, 19, 0, + 0, 20, 0, 0, 0, 17, 35, 0, 0, 0, + 0, 22, 11, 12, 14, 15, 16, 21, 23, 25, + 26, 27, 28, 29, 33, 34, 0, 0, 0, 13, + 0, 0, 0, 24, 0, 30, 0, 0, 31, 32, + 18, 19, 0, 0, 20, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 11, 12, 14, 15, 16, + 21, 23, 25, 26, 27, 28, 29, 33, 34, 107, + 0, 0, 13, 0, 0, 0, 24, 176, 30, 0, + 0, 31, 32, 0, 0, 0, 0, 0, 107, 0, + 0, 0, 0, 0, 0, 0, 89, 91, 92, 0, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, - 0, 103, 104, 106, 90, 107, 0, 0, 0, 105, + 0, 103, 104, 106, 90, 89, 91, 92, 0, 93, + 94, 95, 96, 97, 98, 99, 100, 101, 102, 0, + 103, 104, 106, 90, 107, 0, 0, 0, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 107, 0, 0, 0, 105, 0, - 0, 0, 89, 91, 92, 0, 93, 94, 95, 0, - 97, 98, 99, 100, 101, 102, 0, 103, 104, 106, - 90, 89, 91, 92, 0, 93, 94, 0, 0, 97, - 98, 0, 100, 101, 102, 0, 103, 104, 106, 90, - 0, 0, 0, 0, 105, 0, 0, 0, 0, 0, + 0, 0, 0, 107, 0, 0, 0, 105, 0, 0, + 0, 89, 91, 92, 0, 93, 94, 95, 0, 97, + 98, 99, 100, 101, 102, 0, 103, 104, 106, 90, + 89, 91, 92, 0, 93, 94, 0, 0, 97, 98, + 0, 100, 101, 102, 0, 103, 104, 106, 90, 0, + 0, 0, 0, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 105, + 0, 0, 105, } var yyPact = [...]int16{ - 31, 169, 574, 574, 410, 531, -1000, -1000, -1000, 193, + 31, 131, 573, 573, 409, 530, -1000, -1000, -1000, 103, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, - -1000, -1000, -1000, -1000, -1000, 314, -1000, 278, -1000, 655, + -1000, -1000, -1000, -1000, -1000, 305, -1000, 228, -1000, 654, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, - -1000, -1000, 57, 147, -1000, -1000, 488, -1000, 488, 192, + -1000, -1000, 21, 98, -1000, -1000, 487, -1000, 487, 99, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, - -1000, -1000, -1000, -1000, -1000, -1000, -1000, 187, -1000, -1000, - 263, -1000, -1000, 353, 277, -1000, -1000, 29, -1000, -53, - -53, -53, -53, -53, -53, -53, -53, -53, -53, -53, - -53, -53, -53, -53, -53, 26, 214, 305, 147, -56, - -1000, 126, 126, 329, -1000, 636, 24, -1000, 262, -1000, - -1000, 181, 166, -1000, -1000, 178, -1000, 171, -1000, 
163, - -1000, 296, 488, -1000, -58, -50, -1000, 488, 488, 488, - 488, 488, 488, 488, 488, 488, 488, 488, 488, 488, - 488, 488, -1000, 175, -1000, -1000, 111, -1000, -1000, -1000, - -1000, -1000, -1000, -1000, 115, 115, 311, -1000, -1000, -1000, - -1000, 179, -1000, -1000, 64, -1000, 655, -1000, -1000, 162, - -1000, 141, -1000, -1000, -1000, -1000, -1000, 32, -1000, -1000, - -1000, -1000, -1000, -1000, -1000, 25, 80, 17, -1000, -1000, - -1000, 409, 8, 126, 126, 126, 126, 24, 24, 119, - 119, 119, 720, 701, 119, 119, 720, 24, 24, 119, - 24, 8, -1000, 40, -1000, -1000, -1000, 341, -1000, 206, + -1000, -1000, -1000, -1000, -1000, -1000, -1000, 252, -1000, -1000, + 360, -1000, -1000, 266, 214, -1000, -1000, 20, -1000, -49, + -49, -49, -49, -49, -49, -49, -49, -49, -49, -49, + -49, -49, -49, -49, -49, 50, 48, 304, 98, -55, + -1000, 167, 167, 328, -1000, 635, 52, -1000, 302, -1000, + -1000, 261, 70, -1000, -1000, 207, -1000, 102, -1000, 96, + 154, 487, -1000, -56, -41, -1000, 487, 487, 487, 487, + 487, 487, 487, 487, 487, 487, 487, 487, 487, 487, + 487, -1000, 100, -1000, -1000, 47, -1000, -1000, -1000, -1000, + -1000, -1000, -1000, 39, 39, 350, -1000, -1000, -1000, -1000, + 178, -1000, -1000, 157, -1000, 654, -1000, -1000, 196, -1000, + 45, -1000, -1000, -1000, -1000, -1000, 43, -1000, -1000, -1000, + -1000, -1000, -1000, -1000, 16, 171, 163, -1000, -1000, -1000, + 408, 406, 167, 167, 167, 167, 52, 52, 119, 119, + 119, 719, 700, 119, 119, 719, 52, 52, 119, 52, + 406, -1000, 24, -1000, -1000, -1000, 340, -1000, 329, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, - -1000, -1000, -1000, -1000, -1000, 488, -1000, -1000, -1000, -1000, - -1000, -1000, 56, 56, 18, 56, 72, 72, 215, 70, - -1000, -1000, 272, 270, 267, 260, 250, 234, 228, 225, - 223, 216, 205, -1000, -1000, -1000, -1000, -1000, -1000, 165, - -1000, 
-1000, -1000, 30, -1000, 655, -1000, -1000, -1000, 56, - -1000, 14, 13, 487, -1000, -1000, -1000, 22, 27, 27, - 27, 115, 186, 186, 22, 186, 22, -74, -1000, -1000, - -1000, -1000, -1000, 56, 56, -1000, -1000, -1000, 56, -1000, - -1000, -1000, -1000, -1000, -1000, 27, -1000, -1000, -1000, -1000, - -1000, -1000, -1000, -1000, -1000, -1000, -1000, -1000, 52, -1000, - 28, -1000, -1000, -1000, -1000, + -1000, -1000, -1000, -1000, 487, -1000, -1000, -1000, -1000, -1000, + -1000, 34, 34, 15, 34, 40, 40, 331, 32, -1000, + -1000, 204, 201, 199, 195, 193, 182, 181, 179, 175, + 159, 136, -1000, -1000, -1000, -1000, -1000, -1000, 97, -1000, + -1000, -1000, 13, -1000, 654, -1000, -1000, -1000, 34, -1000, + 11, 8, 486, -1000, -1000, -1000, 54, 174, 174, 174, + 39, 41, 41, 54, 41, 54, -73, -1000, -1000, -1000, + -1000, -1000, 34, 34, -1000, -1000, -1000, 34, -1000, -1000, + -1000, -1000, -1000, -1000, 174, -1000, -1000, -1000, -1000, -1000, + -1000, -1000, -1000, -1000, -1000, -1000, -1000, 30, -1000, 165, + -1000, -1000, -1000, -1000, } var yyPgo = [...]int16{ - 0, 390, 13, 389, 5, 15, 388, 363, 387, 385, - 12, 384, 209, 345, 383, 14, 382, 10, 11, 381, - 379, 7, 378, 8, 4, 375, 2, 1, 3, 374, - 27, 0, 373, 372, 17, 195, 371, 369, 6, 368, - 365, 16, 364, 56, 359, 9, 358, 356, 352, 333, - 331, 323, 304, 318, 306, + 0, 378, 13, 377, 5, 16, 374, 275, 373, 372, + 12, 370, 224, 354, 368, 14, 366, 10, 11, 365, + 364, 7, 363, 8, 4, 357, 2, 1, 3, 344, + 27, 0, 338, 332, 18, 194, 314, 312, 6, 311, + 303, 17, 302, 43, 301, 9, 300, 282, 281, 280, + 269, 255, 233, 238, 235, } var yyR1 = [...]int8{ @@ -584,7 +584,7 @@ var yyR2 = [...]int8{ 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 2, 0, - 3, 1, 2, 3, 3, 3, 3, 2, 2, 1, + 3, 1, 2, 3, 3, 1, 3, 3, 2, 1, 2, 0, 3, 2, 1, 1, 3, 1, 3, 4, 1, 3, 5, 5, 1, 1, 1, 4, 3, 3, 2, 3, 1, 2, 3, 3, 3, 3, 3, 3, @@ -612,30 +612,30 @@ var yyChk = [...]int16{ 52, 53, 54, 56, 57, 83, 58, 14, -34, -41, 2, 79, 85, 15, -41, -38, -38, 
-43, -1, 20, -2, 12, -10, 2, 20, 7, 2, 4, 2, 4, - 2, 24, -35, -42, -37, -47, 78, -35, -35, -35, + 24, -35, -42, -37, -47, 78, -35, -35, -35, -35, -35, -35, -35, -35, -35, -35, -35, -35, -35, -35, - -35, -35, -45, 57, 2, -31, -9, 2, -28, -30, - 88, 89, 19, 9, 41, 57, -45, 2, -41, -34, - -17, 15, 2, -17, -40, 22, -38, 22, 20, 7, - 2, -5, 2, 4, 54, 44, 55, -5, 20, -15, - 25, 2, 25, 2, -19, 5, -29, -21, 12, -28, - -30, 16, -38, 82, 84, 80, 81, -38, -38, -38, + -35, -45, 57, 2, -31, -9, 2, -28, -30, 88, + 89, 19, 9, 41, 57, -45, 2, -41, -34, -17, + 15, 2, -17, -40, 22, -38, 22, 20, 7, 2, + -5, 2, 4, 54, 44, 55, -5, 20, -15, 25, + 2, 25, 2, -19, 5, -29, -21, 12, -28, -30, + 16, -38, 82, 84, 80, 81, -38, -38, -38, -38, -38, -38, -38, -38, -38, -38, -38, -38, -38, -38, - -38, -38, -45, 15, -28, -28, 21, 6, 2, -16, - 22, -4, -6, 25, 2, 62, 78, 63, 79, 64, - 65, 66, 80, 81, 12, 82, 47, 48, 51, 67, - 18, 68, 83, 84, 69, 70, 71, 72, 73, 88, - 89, 59, 74, 75, 22, 7, 20, -2, 25, 2, - 25, 2, 26, 26, -30, 26, 41, 57, -22, 24, - 17, -23, 30, 28, 29, 35, 36, 37, 33, 31, - 34, 32, 38, -17, -17, -18, -17, -18, 22, -45, - 21, 2, 22, 7, 2, -38, -27, 19, -27, 26, - -27, -21, -21, 24, 17, 2, 17, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 21, 2, - 22, -4, -27, 26, 26, 17, -23, -26, 57, -27, - -31, -31, -31, -28, -24, 14, -24, -26, -24, -26, - -11, 92, 93, 94, 95, -27, -27, -27, -25, -31, - 24, 21, 2, 21, -31, + -38, -45, 15, -28, -28, 21, 6, 2, -16, 22, + -4, -6, 25, 2, 62, 78, 63, 79, 64, 65, + 66, 80, 81, 12, 82, 47, 48, 51, 67, 18, + 68, 83, 84, 69, 70, 71, 72, 73, 88, 89, + 59, 74, 75, 22, 7, 20, -2, 25, 2, 25, + 2, 26, 26, -30, 26, 41, 57, -22, 24, 17, + -23, 30, 28, 29, 35, 36, 37, 33, 31, 34, + 32, 38, -17, -17, -18, -17, -18, 22, -45, 21, + 2, 22, 7, 2, -38, -27, 19, -27, 26, -27, + -21, -21, 24, 17, 2, 17, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 21, 2, 22, + -4, -27, 26, 26, 17, -23, -26, 57, -27, -31, + -31, -31, -28, -24, 14, -24, -26, -24, -26, -11, + 92, 93, 94, 95, -27, -27, -27, -25, 
-31, 24, + 21, 2, 21, -31, } var yyDef = [...]int16{ @@ -647,35 +647,35 @@ var yyDef = [...]int16{ 18, 19, 0, 108, 233, 234, 0, 244, 0, 85, 86, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 227, 228, 0, 5, 100, - 0, 128, 131, 0, 0, 139, 245, 140, 144, 43, + 0, 128, 131, 0, 135, 139, 245, 140, 144, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 0, 0, 0, 0, 22, 23, 0, 0, 0, 61, 0, 83, 84, 0, 89, - 91, 0, 95, 99, 126, 0, 132, 0, 137, 0, - 138, 143, 0, 42, 47, 48, 44, 0, 0, 0, + 91, 0, 95, 99, 126, 0, 132, 0, 138, 0, + 143, 0, 42, 47, 48, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 68, 0, 70, 71, 0, 73, 239, 240, - 74, 75, 235, 236, 0, 0, 0, 82, 20, 21, - 24, 0, 54, 25, 0, 63, 65, 67, 87, 0, - 92, 0, 98, 229, 230, 231, 232, 0, 127, 130, - 133, 135, 134, 136, 142, 145, 147, 150, 154, 155, - 156, 0, 26, 0, 0, -2, -2, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 69, 0, 237, 238, 76, 0, 81, 0, - 53, 56, 58, 59, 60, 198, 199, 200, 201, 202, - 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, - 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, - 223, 224, 225, 226, 62, 66, 88, 90, 93, 97, - 94, 96, 0, 0, 0, 0, 0, 0, 0, 0, - 160, 162, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 45, 46, 49, 247, 50, 72, 0, - 78, 80, 51, 0, 57, 64, 146, 241, 148, 0, - 151, 0, 0, 0, 158, 163, 159, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 77, 79, - 52, 55, 149, 0, 0, 157, 161, 164, 0, 243, - 165, 166, 167, 168, 169, 0, 170, 171, 172, 173, - 174, 180, 181, 182, 183, 152, 153, 242, 0, 178, - 0, 176, 179, 175, 177, + 0, 68, 0, 70, 71, 0, 73, 239, 240, 74, + 75, 235, 236, 0, 0, 0, 82, 20, 21, 24, + 0, 54, 25, 0, 63, 65, 67, 87, 0, 92, + 0, 98, 229, 230, 231, 232, 0, 127, 130, 133, + 136, 134, 137, 142, 145, 147, 150, 154, 155, 156, + 0, 26, 0, 0, -2, -2, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 69, 0, 237, 238, 76, 0, 81, 0, 53, + 56, 58, 59, 60, 198, 199, 200, 201, 202, 203, + 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, + 214, 
215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 62, 66, 88, 90, 93, 97, 94, + 96, 0, 0, 0, 0, 0, 0, 0, 0, 160, + 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 45, 46, 49, 247, 50, 72, 0, 78, + 80, 51, 0, 57, 64, 146, 241, 148, 0, 151, + 0, 0, 0, 158, 163, 159, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 77, 79, 52, + 55, 149, 0, 0, 157, 161, 164, 0, 243, 165, + 166, 167, 168, 169, 0, 170, 171, 172, 173, 174, + 180, 181, 182, 183, 152, 153, 242, 0, 178, 0, + 176, 179, 175, 177, } var yyTok1 = [...]int8{ @@ -1623,10 +1623,9 @@ yydefault: yyVAL.label = labels.Label{Name: yyDollar[1].item.Val, Value: yylex.(*parser).unquoteString(yyDollar[3].item.Val)} } case 135: - yyDollar = yyS[yypt-3 : yypt+1] + yyDollar = yyS[yypt-1 : yypt+1] { - yylex.(*parser).unexpected("label set", "string") - yyVAL.label = labels.Label{} + yyVAL.label = labels.Label{Name: labels.MetricName, Value: yyDollar[1].item.Val} } case 136: yyDollar = yyS[yypt-3 : yypt+1] @@ -1635,9 +1634,9 @@ yydefault: yyVAL.label = labels.Label{} } case 137: - yyDollar = yyS[yypt-2 : yypt+1] + yyDollar = yyS[yypt-3 : yypt+1] { - yylex.(*parser).unexpected("label set", "\"=\"") + yylex.(*parser).unexpected("label set", "string") yyVAL.label = labels.Label{} } case 138: diff --git a/vendor/github.com/prometheus/prometheus/promql/parser/parse.go b/vendor/github.com/prometheus/prometheus/promql/parser/parse.go index e8abe90194f..9d38fd2d6dc 100644 --- a/vendor/github.com/prometheus/prometheus/promql/parser/parse.go +++ b/vendor/github.com/prometheus/prometheus/promql/parser/parse.go @@ -244,7 +244,8 @@ type seriesDescription struct { values []SequenceValue } -// ParseSeriesDesc parses the description of a time series. +// ParseSeriesDesc parses the description of a time series. It is only used in +// the PromQL testing framework code. 
func ParseSeriesDesc(input string) (labels labels.Labels, values []SequenceValue, err error) { p := NewParser(input) p.lex.seriesDesc = true diff --git a/vendor/github.com/prometheus/prometheus/promql/parser/printer.go b/vendor/github.com/prometheus/prometheus/promql/parser/printer.go index 63b19508276..afe755e7dd4 100644 --- a/vendor/github.com/prometheus/prometheus/promql/parser/printer.go +++ b/vendor/github.com/prometheus/prometheus/promql/parser/printer.go @@ -14,8 +14,10 @@ package parser import ( + "bytes" "fmt" "sort" + "strconv" "strings" "time" @@ -91,13 +93,20 @@ func (node *AggregateExpr) getAggOpStr() string { } func joinLabels(ss []string) string { + var bytea [1024]byte // On stack to avoid memory allocation while building the output. + b := bytes.NewBuffer(bytea[:0]) + for i, s := range ss { - // If the label is already quoted, don't quote it again. - if s[0] != '"' && s[0] != '\'' && s[0] != '`' && !model.IsValidLegacyMetricName(string(model.LabelValue(s))) { - ss[i] = fmt.Sprintf("\"%s\"", s) + if i > 0 { + b.WriteString(", ") + } + if !model.IsValidLegacyMetricName(string(model.LabelValue(s))) { + b.Write(strconv.AppendQuote(b.AvailableBuffer(), s)) + } else { + b.WriteString(s) } } - return strings.Join(ss, ", ") + return b.String() } func (node *BinaryExpr) returnBool() string { diff --git a/vendor/github.com/prometheus/prometheus/promql/promqltest/test.go b/vendor/github.com/prometheus/prometheus/promql/promqltest/test.go index f208b4f3135..efa2136f10a 100644 --- a/vendor/github.com/prometheus/prometheus/promql/promqltest/test.go +++ b/vendor/github.com/prometheus/prometheus/promql/promqltest/test.go @@ -56,6 +56,10 @@ const ( DefaultMaxSamplesPerQuery = 10000 ) +func init() { + model.NameValidationScheme = model.UTF8Validation +} + type TBRun interface { testing.TB Run(string, func(*testing.T)) bool @@ -66,7 +70,7 @@ var testStartTime = time.Unix(0, 0).UTC() // LoadedStorage returns storage with generated data using the provided load 
statements. // Non-load statements will cause test errors. func LoadedStorage(t testutil.T, input string) *teststorage.TestStorage { - test, err := newTest(t, input, false) + test, err := newTest(t, input, false, newTestStorage) require.NoError(t, err) for _, cmd := range test.cmds { @@ -77,7 +81,7 @@ func LoadedStorage(t testutil.T, input string) *teststorage.TestStorage { t.Errorf("only 'load' commands accepted, got '%s'", cmd) } } - return test.storage + return test.storage.(*teststorage.TestStorage) } // NewTestEngine creates a promql.Engine with enablePerStepStats, lookbackDelta and maxSamples, and returns it. @@ -108,6 +112,11 @@ func NewTestEngineWithOpts(tb testing.TB, opts promql.EngineOpts) *promql.Engine // RunBuiltinTests runs an acceptance test suite against the provided engine. func RunBuiltinTests(t TBRun, engine promql.QueryEngine) { + RunBuiltinTestsWithStorage(t, engine, newTestStorage) +} + +// RunBuiltinTestsWithStorage runs an acceptance test suite against the provided engine and storage. +func RunBuiltinTestsWithStorage(t TBRun, engine promql.QueryEngine, newStorage func(testutil.T) storage.Storage) { t.Cleanup(func() { parser.EnableExperimentalFunctions = false }) parser.EnableExperimentalFunctions = true @@ -118,24 +127,29 @@ func RunBuiltinTests(t TBRun, engine promql.QueryEngine) { t.Run(fn, func(t *testing.T) { content, err := fs.ReadFile(testsFs, fn) require.NoError(t, err) - RunTest(t, string(content), engine) + RunTestWithStorage(t, string(content), engine, newStorage) }) } } // RunTest parses and runs the test against the provided engine. func RunTest(t testutil.T, input string, engine promql.QueryEngine) { - require.NoError(t, runTest(t, input, engine, false)) + RunTestWithStorage(t, input, engine, newTestStorage) +} + +// RunTestWithStorage parses and runs the test against the provided engine and storage. 
+func RunTestWithStorage(t testutil.T, input string, engine promql.QueryEngine, newStorage func(testutil.T) storage.Storage) { + require.NoError(t, runTest(t, input, engine, newStorage, false)) } // testTest allows tests to be run in "test-the-test" mode (true for // testingMode). This is a special mode for testing test code execution itself. func testTest(t testutil.T, input string, engine promql.QueryEngine) error { - return runTest(t, input, engine, true) + return runTest(t, input, engine, newTestStorage, true) } -func runTest(t testutil.T, input string, engine promql.QueryEngine, testingMode bool) error { - test, err := newTest(t, input, testingMode) +func runTest(t testutil.T, input string, engine promql.QueryEngine, newStorage func(testutil.T) storage.Storage, testingMode bool) error { + test, err := newTest(t, input, testingMode, newStorage) // Why do this before checking err? newTest() can create the test storage and then return an error, // and we want to make sure to clean that up to avoid leaking goroutines. @@ -175,18 +189,20 @@ type test struct { cmds []testCommand - storage *teststorage.TestStorage + open func(testutil.T) storage.Storage + storage storage.Storage context context.Context cancelCtx context.CancelFunc } // newTest returns an initialized empty Test. 
-func newTest(t testutil.T, input string, testingMode bool) (*test, error) { +func newTest(t testutil.T, input string, testingMode bool, newStorage func(testutil.T) storage.Storage) (*test, error) { test := &test{ T: t, cmds: []testCommand{}, testingMode: testingMode, + open: newStorage, } err := test.parse(input) test.clear() @@ -194,6 +210,8 @@ func newTest(t testutil.T, input string, testingMode bool) (*test, error) { return test, err } +func newTestStorage(t testutil.T) storage.Storage { return teststorage.New(t) } + //go:embed testdata var testsFs embed.FS @@ -1267,7 +1285,7 @@ func (t *test) clear() { if t.cancelCtx != nil { t.cancelCtx() } - t.storage = teststorage.New(t) + t.storage = t.open(t.T) t.context, t.cancelCtx = context.WithCancel(context.Background()) } diff --git a/vendor/github.com/prometheus/prometheus/promql/promqltest/testdata/functions.test b/vendor/github.com/prometheus/prometheus/promql/promqltest/testdata/functions.test index 2ed7ffb6a45..a00ed8a3ea6 100644 --- a/vendor/github.com/prometheus/prometheus/promql/promqltest/testdata/functions.test +++ b/vendor/github.com/prometheus/prometheus/promql/promqltest/testdata/functions.test @@ -256,6 +256,9 @@ clear load 5m testcounter_reset_middle_total 0+10x4 0+10x5 http_requests_total{job="app-server", instance="1", group="canary"} 0+80x10 + testcounter_reset_middle_mix 0+10x4 0+10x5 {{schema:0 sum:1 count:1}} {{schema:1 sum:2 count:2}} + http_requests_mix{job="app-server", instance="1", group="canary"} 0+80x10 {{schema:0 sum:1 count:1}} + http_requests_histogram{job="app-server", instance="1", group="canary"} {{schema:0 sum:1 count:2}}x10 # deriv should return the same as rate in simple cases. 
eval instant at 50m rate(http_requests_total{group="canary", instance="1", job="app-server"}[50m]) @@ -268,6 +271,16 @@ eval instant at 50m deriv(http_requests_total{group="canary", instance="1", job= eval instant at 50m deriv(testcounter_reset_middle_total[100m]) {} 0.010606060606060607 +# deriv should ignore histograms. +eval instant at 110m deriv(http_requests_mix{group="canary", instance="1", job="app-server"}[110m]) + {group="canary", instance="1", job="app-server"} 0.26666666666666666 + +eval instant at 100m deriv(testcounter_reset_middle_mix[110m]) + {} 0.010606060606060607 + +eval instant at 50m deriv(http_requests_histogram[60m]) + #empty + # predict_linear should return correct result. # X/s = [ 0, 300, 600, 900,1200,1500,1800,2100,2400,2700,3000] # Y = [ 0, 10, 20, 30, 40, 0, 10, 20, 30, 40, 50] @@ -1110,11 +1123,16 @@ clear # Don't return anything when there's something there. load 5m http_requests{job="api-server", instance="0", group="production"} 0+10x10 + http_requests_histogram{job="api-server", instance="0", group="production"} {{schema:0 sum:1 count:1}}x11 eval instant at 50m absent(http_requests) eval instant at 50m absent(sum(http_requests)) +eval instant at 50m absent(http_requests_histogram) + +eval instant at 50m absent(sum(http_requests_histogram)) + clear eval instant at 50m absent(sum(nonexistent{job="testjob", instance="testinstance"})) @@ -1162,6 +1180,7 @@ load 1m httpd_handshake_failures_total{instance="127.0.0.1",job="node"} 1+1x15 httpd_log_lines_total{instance="127.0.0.1",job="node"} 1 ssl_certificate_expiry_seconds{job="ingress"} NaN NaN NaN NaN NaN + http_requests_histogram{path="/foo",instance="127.0.0.1",job="httpd"} {{schema:0 sum:1 count:1}}x11 eval instant at 5m absent_over_time(http_requests_total[5m]) @@ -1205,6 +1224,16 @@ eval instant at 5m absent_over_time({job="ingress"}[4m]) eval instant at 10m absent_over_time({job="ingress"}[4m]) {job="ingress"} 1 +eval instant at 10m absent_over_time(http_requests_histogram[5m]) + 
+eval instant at 10m absent_over_time(rate(http_requests_histogram[5m])[5m:1m]) + +eval instant at 20m absent_over_time(http_requests_histogram[5m]) + {} 1 + +eval instant at 20m absent_over_time(rate(http_requests_histogram[5m])[5m:1m]) + {} 1 + clear # Testdata for present_over_time() diff --git a/vendor/github.com/prometheus/prometheus/rules/group.go b/vendor/github.com/prometheus/prometheus/rules/group.go index b6feb6f9625..8ad8958f8dd 100644 --- a/vendor/github.com/prometheus/prometheus/rules/group.go +++ b/vendor/github.com/prometheus/prometheus/rules/group.go @@ -44,20 +44,21 @@ import ( // Group is a set of rules that have a logical relation. type Group struct { - name string - file string - interval time.Duration - queryOffset *time.Duration - limit int - rules []Rule - sourceTenants []string - seriesInPreviousEval []map[string]labels.Labels // One per Rule. - staleSeries []labels.Labels - opts *ManagerOptions - mtx sync.Mutex - evaluationTime time.Duration - lastEvaluation time.Time // Wall-clock time of most recent evaluation. - lastEvalTimestamp time.Time // Time slot used for most recent evaluation. + name string + file string + interval time.Duration + queryOffset *time.Duration + limit int + rules []Rule + sourceTenants []string + seriesInPreviousEval []map[string]labels.Labels // One per Rule. + staleSeries []labels.Labels + opts *ManagerOptions + mtx sync.Mutex + evaluationTime time.Duration // Time it took to evaluate the group. + evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group. + lastEvaluation time.Time // Wall-clock time of most recent evaluation. + lastEvalTimestamp time.Time // Time slot used for most recent evaluation. 
shouldRestore bool @@ -119,6 +120,7 @@ func NewGroup(o GroupOptions) *Group { metrics.EvalFailures.WithLabelValues(key) metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) + metrics.GroupLastRuleDurationSum.WithLabelValues(key) metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) metrics.GroupSamples.WithLabelValues(key) metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) @@ -380,6 +382,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) { g.evaluationTime = dur } +// GetRuleEvaluationTimeSum returns the sum of the time it took to evaluate each rule in the group irrespective of concurrency. +func (g *Group) GetRuleEvaluationTimeSum() time.Duration { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.evaluationRuleTimeSum +} + +// updateRuleEvaluationTimeSum updates evaluationRuleTimeSum which is the sum of the time it took to evaluate each rule in the group irrespective of concurrency. +// It collects the times from the rules themselves. +func (g *Group) updateRuleEvaluationTimeSum() { + var sum time.Duration + for _, rule := range g.rules { + sum += rule.GetEvaluationDuration() + } + + g.metrics.GroupLastRuleDurationSum.WithLabelValues(GroupKey(g.file, g.name)).Set(sum.Seconds()) + + g.mtx.Lock() + defer g.mtx.Unlock() + g.evaluationRuleTimeSum = sum +} + // GetLastEvaluation returns the time the last evaluation of the rule group took place. 
func (g *Group) GetLastEvaluation() time.Time { g.mtx.Lock() @@ -916,6 +940,7 @@ type Metrics struct { GroupInterval *prometheus.GaugeVec GroupLastEvalTime *prometheus.GaugeVec GroupLastDuration *prometheus.GaugeVec + GroupLastRuleDurationSum *prometheus.GaugeVec GroupLastRestoreDuration *prometheus.GaugeVec GroupRules *prometheus.GaugeVec GroupSamples *prometheus.GaugeVec @@ -994,6 +1019,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { }, []string{"rule_group"}, ), + GroupLastRuleDurationSum: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_last_rule_duration_sum_seconds", + Help: "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.", + }, + []string{"rule_group"}, + ), GroupLastRestoreDuration: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, @@ -1031,6 +1064,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { m.GroupInterval, m.GroupLastEvalTime, m.GroupLastDuration, + m.GroupLastRuleDurationSum, m.GroupLastRestoreDuration, m.GroupRules, m.GroupSamples, diff --git a/vendor/github.com/prometheus/prometheus/rules/manager.go b/vendor/github.com/prometheus/prometheus/rules/manager.go index b5bb0151166..58020126e52 100644 --- a/vendor/github.com/prometheus/prometheus/rules/manager.go +++ b/vendor/github.com/prometheus/prometheus/rules/manager.go @@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time. timeSinceStart := time.Since(start) g.metrics.IterationDuration.Observe(timeSinceStart.Seconds()) + g.updateRuleEvaluationTimeSum() g.setEvaluationTime(timeSinceStart) g.setLastEvaluation(start) g.setLastEvalTimestamp(evalTimestamp) @@ -482,6 +483,11 @@ type ruleDependencyController struct{} // AnalyseRules implements RuleDependencyController. 
func (c ruleDependencyController) AnalyseRules(rules []Rule) { depMap := buildDependencyMap(rules) + + if depMap == nil { + return + } + for _, r := range rules { r.SetNoDependentRules(depMap.dependents(r) == 0) r.SetNoDependencyRules(depMap.dependencies(r) == 0) diff --git a/vendor/github.com/prometheus/prometheus/scrape/scrape.go b/vendor/github.com/prometheus/prometheus/scrape/scrape.go index 5c6063fa586..4803354cf6f 100644 --- a/vendor/github.com/prometheus/prometheus/scrape/scrape.go +++ b/vendor/github.com/prometheus/prometheus/scrape/scrape.go @@ -361,6 +361,7 @@ func (sp *scrapePool) restartLoops(reuseCache bool) { bodySizeLimit: bodySizeLimit, acceptHeader: acceptHeader(sp.config.ScrapeProtocols, validationScheme), acceptEncodingHeader: acceptEncodingHeader(enableCompression), + metrics: sp.metrics, } newLoop = sp.newLoop(scrapeLoopOptions{ target: t, diff --git a/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheus/normalize_name.go b/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheus/normalize_name.go index 335705aa8dd..6967ca013c2 100644 --- a/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheus/normalize_name.go +++ b/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheus/normalize_name.go @@ -30,7 +30,7 @@ import ( // OTLP metrics use the c/s notation as specified at https://ucum.org/ucum.html // (See also https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/semantic_conventions/README.md#instrument-units) // Prometheus best practices for units: https://prometheus.io/docs/practices/naming/#base-units -// OpenMetrics specification for units: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#units-and-base-units +// OpenMetrics specification for units: https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#units-and-base-units var unitMap 
= map[string]string{ // Time "d": "days", @@ -122,17 +122,22 @@ func BuildCompliantName(metric pmetric.Metric, namespace string, addMetricSuffix // Build a normalized name for the specified metric. func normalizeName(metric pmetric.Metric, namespace string, allowUTF8 bool) string { - var translationFunc func(rune) bool + var nameTokens []string + var separators []string if !allowUTF8 { nonTokenMetricCharRE := regexp.MustCompile(`[^a-zA-Z0-9:]`) - translationFunc = func(r rune) bool { return nonTokenMetricCharRE.MatchString(string(r)) } + // Split metric name into "tokens" (of supported metric name runes). + // Note that this has the side effect of replacing multiple consecutive underscores with a single underscore. + // This is part of the OTel to Prometheus specification: https://github.com/open-telemetry/opentelemetry-specification/blob/v1.38.0/specification/compatibility/prometheus_and_openmetrics.md#otlp-metric-points-to-prometheus. + nameTokens = strings.FieldsFunc( + metric.Name(), + func(r rune) bool { return nonTokenMetricCharRE.MatchString(string(r)) }, + ) } else { - translationFunc = func(r rune) bool { return !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != ':' } + translationFunc := func(r rune) bool { return !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != ':' } + // Split metric name into "tokens" (of supported metric name runes). + nameTokens, separators = fieldsFunc(metric.Name(), translationFunc) } - // Split metric name into "tokens" (of supported metric name runes). - // Note that this has the side effect of replacing multiple consecutive underscores with a single underscore. - // This is part of the OTel to Prometheus specification: https://github.com/open-telemetry/opentelemetry-specification/blob/v1.38.0/specification/compatibility/prometheus_and_openmetrics.md#otlp-metric-points-to-prometheus. 
- nameTokens, separators := fieldsFunc(metric.Name(), translationFunc) // Split unit at the '/' if any unitTokens := strings.SplitN(metric.Unit(), "/", 2) @@ -201,12 +206,14 @@ func normalizeName(metric pmetric.Metric, namespace string, allowUTF8 bool) stri nameTokens = append([]string{namespace}, nameTokens...) } - // Build the string from the tokens + separators. - // If UTF-8 isn't allowed, we'll use underscores as separators. + var normalizedName string if !allowUTF8 { - separators = []string{} + // Build the string from the tokens, separated with underscores + normalizedName = strings.Join(nameTokens, "_") + } else { + // Build the string from the tokens + separators. + normalizedName = join(nameTokens, separators, "_") } - normalizedName := join(nameTokens, separators, "_") // Metric name cannot start with a digit, so prefix it with "_" in this case if normalizedName != "" && unicode.IsDigit(rune(normalizedName[0])) { diff --git a/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheusremotewrite/helper.go b/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheusremotewrite/helper.go index 1f9c8b6570c..4f12d1f3470 100644 --- a/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheusremotewrite/helper.go +++ b/vendor/github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheusremotewrite/helper.go @@ -51,7 +51,7 @@ const ( createdSuffix = "_created" // maxExemplarRunes is the maximum number of UTF-8 exemplar characters // according to the prometheus specification - // https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars + // https://github.com/prometheus/OpenMetrics/blob/v1.0.0/specification/OpenMetrics.md#exemplars maxExemplarRunes = 128 // Trace and Span id keys are defined as part of the spec: // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification%2Fmetrics%2Fdatamodel.md#exemplars-2 diff --git 
a/vendor/github.com/prometheus/prometheus/storage/remote/queue_manager.go b/vendor/github.com/prometheus/prometheus/storage/remote/queue_manager.go index f6d6cbc7e91..475c126eff3 100644 --- a/vendor/github.com/prometheus/prometheus/storage/remote/queue_manager.go +++ b/vendor/github.com/prometheus/prometheus/storage/remote/queue_manager.go @@ -1688,7 +1688,7 @@ func (s *shards) updateMetrics(_ context.Context, err error, sampleCount, exempl s.enqueuedHistograms.Sub(int64(histogramCount)) } -// sendSamples to the remote storage with backoff for recoverable errors. +// sendSamplesWithBackoff to the remote storage with backoff for recoverable errors. func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.TimeSeries, sampleCount, exemplarCount, histogramCount, metadataCount int, pBuf *proto.Buffer, buf *[]byte, enc Compression) (WriteResponseStats, error) { // Build the WriteRequest with no metadata. req, highest, lowest, err := buildWriteRequest(s.qm.logger, samples, nil, pBuf, buf, nil, enc) @@ -1802,7 +1802,7 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti return accumulatedStats, err } -// sendV2Samples to the remote storage with backoff for recoverable errors. +// sendV2SamplesWithBackoff to the remote storage with backoff for recoverable errors. func (s *shards) sendV2SamplesWithBackoff(ctx context.Context, samples []writev2.TimeSeries, labels []string, sampleCount, exemplarCount, histogramCount, metadataCount int, pBuf, buf *[]byte, enc Compression) (WriteResponseStats, error) { // Build the WriteRequest with no metadata. 
req, highest, lowest, err := buildV2WriteRequest(s.qm.logger, samples, labels, pBuf, buf, nil, enc) diff --git a/vendor/github.com/prometheus/prometheus/tsdb/head.go b/vendor/github.com/prometheus/prometheus/tsdb/head.go index 324b0a60607..b4fa652a438 100644 --- a/vendor/github.com/prometheus/prometheus/tsdb/head.go +++ b/vendor/github.com/prometheus/prometheus/tsdb/head.go @@ -1079,7 +1079,7 @@ func (h *Head) PostingsCardinalityStats(statsByLabelName string, limit int) *ind return h.cardinalityCache } h.cardinalityCacheKey = cacheKey - h.cardinalityCache = h.postings.Stats(statsByLabelName, limit) + h.cardinalityCache = h.postings.Stats(statsByLabelName, limit, labels.SizeOfLabels) h.lastPostingsStatsCall = time.Duration(time.Now().Unix()) * time.Second return h.cardinalityCache diff --git a/vendor/github.com/prometheus/prometheus/tsdb/head_read.go b/vendor/github.com/prometheus/prometheus/tsdb/head_read.go index a3cd7b653d1..9ec0595dbdb 100644 --- a/vendor/github.com/prometheus/prometheus/tsdb/head_read.go +++ b/vendor/github.com/prometheus/prometheus/tsdb/head_read.go @@ -37,6 +37,10 @@ func (h *Head) Index() (IndexReader, error) { return h.indexRange(math.MinInt64, math.MaxInt64), nil } +func (h *Head) MustIndex() IndexReader { + return h.indexRange(math.MinInt64, math.MaxInt64) +} + func (h *Head) indexRange(mint, maxt int64) *headIndexReader { if hmin := h.MinTime(); hmin > mint { mint = hmin diff --git a/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go b/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go index b44b4089275..3e550ed5e65 100644 --- a/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go +++ b/vendor/github.com/prometheus/prometheus/tsdb/index/postings.go @@ -72,7 +72,7 @@ type MemPostings struct { // lvs holds the label values for each label name. // lvs[name] is essentially an unsorted append-only list of all keys in m[name] // mtx must be held when interacting with lvs. 
- // Since it's append-only, it is safe to the label values slice after releasing the lock. + // Since it's append-only, it is safe to read the label values slice after releasing the lock. lvs map[string][]string ordered bool @@ -190,7 +190,8 @@ type PostingsStats struct { } // Stats calculates the cardinality statistics from postings. -func (p *MemPostings) Stats(label string, limit int) *PostingsStats { +// Caller can pass in a function which computes the space required for n series with a given label. +func (p *MemPostings) Stats(label string, limit int, labelSizeFunc func(string, string, uint64) uint64) *PostingsStats { var size uint64 p.mtx.RLock() @@ -218,7 +219,7 @@ func (p *MemPostings) Stats(label string, limit int) *PostingsStats { } seriesCnt := uint64(len(values)) labelValuePairs.push(Stat{Name: n + "=" + name, Count: seriesCnt}) - size += uint64(len(name)) * seriesCnt + size += labelSizeFunc(n, name, seriesCnt) } labelValueLength.push(Stat{Name: n, Count: size}) } diff --git a/vendor/github.com/prometheus/prometheus/tsdb/querier.go b/vendor/github.com/prometheus/prometheus/tsdb/querier.go index 7f4c4317f23..0d2f1ddcd89 100644 --- a/vendor/github.com/prometheus/prometheus/tsdb/querier.go +++ b/vendor/github.com/prometheus/prometheus/tsdb/querier.go @@ -270,7 +270,7 @@ func PostingsForMatchers(ctx context.Context, ix IndexPostingsReader, ms ...*lab its = append(its, it) case m.Type == labels.MatchNotRegexp && m.Value == ".+": // .+ regexp matches any non-empty string: get postings for all label values and remove them. - its = append(notIts, ix.PostingsForAllLabelValues(ctx, m.Name)) + notIts = append(notIts, ix.PostingsForAllLabelValues(ctx, m.Name)) case labelMustBeSet[m.Name]: // If this matcher must be non-empty, we can be smarter. 
diff --git a/vendor/github.com/prometheus/prometheus/util/logging/dedupe.go b/vendor/github.com/prometheus/prometheus/util/logging/dedupe.go index e7dff20f786..8137f4f22b9 100644 --- a/vendor/github.com/prometheus/prometheus/util/logging/dedupe.go +++ b/vendor/github.com/prometheus/prometheus/util/logging/dedupe.go @@ -33,7 +33,7 @@ type Deduper struct { next *slog.Logger repeat time.Duration quit chan struct{} - mtx sync.RWMutex + mtx *sync.RWMutex seen map[string]time.Time } @@ -43,6 +43,7 @@ func Dedupe(next *slog.Logger, repeat time.Duration) *Deduper { next: next, repeat: repeat, quit: make(chan struct{}), + mtx: new(sync.RWMutex), seen: map[string]time.Time{}, } go d.run() @@ -88,6 +89,7 @@ func (d *Deduper) WithAttrs(attrs []slog.Attr) slog.Handler { repeat: d.repeat, quit: d.quit, seen: d.seen, + mtx: d.mtx, } } @@ -103,6 +105,7 @@ func (d *Deduper) WithGroup(name string) slog.Handler { repeat: d.repeat, quit: d.quit, seen: d.seen, + mtx: d.mtx, } } diff --git a/vendor/modules.txt b/vendor/modules.txt index 20a6a3279f2..b47625f48ec 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -997,7 +997,7 @@ github.com/prometheus/client_golang/prometheus/testutil/promlint/validations # github.com/prometheus/client_model v0.6.1 ## explicit; go 1.19 github.com/prometheus/client_model/go -# github.com/prometheus/common v0.60.1 +# github.com/prometheus/common v0.61.0 ## explicit; go 1.21 github.com/prometheus/common/config github.com/prometheus/common/expfmt @@ -1017,7 +1017,7 @@ github.com/prometheus/exporter-toolkit/web github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util -# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520 +# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 ## explicit; go 1.22.0 github.com/prometheus/prometheus/config 
github.com/prometheus/prometheus/discovery @@ -1494,7 +1494,7 @@ google.golang.org/genproto/googleapis/api/annotations google.golang.org/genproto/googleapis/rpc/code google.golang.org/genproto/googleapis/rpc/errdetails google.golang.org/genproto/googleapis/rpc/status -# google.golang.org/grpc v1.67.1 => google.golang.org/grpc v1.65.0 +# google.golang.org/grpc v1.68.1 => google.golang.org/grpc v1.65.0 ## explicit; go 1.21 google.golang.org/grpc google.golang.org/grpc/attributes @@ -1688,7 +1688,7 @@ sigs.k8s.io/kustomize/kyaml/yaml/walk sigs.k8s.io/yaml sigs.k8s.io/yaml/goyaml.v2 sigs.k8s.io/yaml/goyaml.v3 -# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520 +# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 # github.com/hashicorp/memberlist => github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe # gopkg.in/yaml.v3 => github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 # github.com/grafana/regexp => github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b From 4706bde40e735e38c882d52827fcf94e34f1bdb3 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 17:14:12 +0100 Subject: [PATCH 039/105] vendor new mimir-prometheus --- go.sum | 5 ----- pkg/costattribution/manager.go | 8 ++++---- pkg/costattribution/tracker.go | 3 ++- pkg/mimir/modules.go | 7 +++---- vendor/modules.txt | 4 ---- 5 files changed, 9 insertions(+), 18 deletions(-) diff --git a/go.sum b/go.sum index 0af76e267d7..c4ca1735f01 100644 --- a/go.sum +++ b/go.sum @@ -1279,13 +1279,8 @@ github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40 h1:1TeKhyS+pvzO github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40/go.mod h1:IGRj8oOoxwJbHBYl1+OhS9UjQR0dv6SQOep7HqmtyFU= github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe h1:yIXAAbLswn7VNWBIvM71O2QsgfgW9fRXZNR0DXe6pDU= github.com/grafana/memberlist 
v0.3.1-0.20220714140823-09ffed8adbbe/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -<<<<<<< HEAD github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 h1:/TwjdoLAxL7URxKJGJUeI539w6LUqcwIcj0WCUxDY/c= github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8/go.mod h1:a5LEa2Vy87wOp0Vu6sLmEIR1V59fqH3QosOSiErAr30= -======= -github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673 h1:z3nSCBMtEMtD/LAIkwrHsT03n7qgeU+0M6rEMZQbxVI= -github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673/go.mod h1:a5LEa2Vy87wOp0Vu6sLmEIR1V59fqH3QosOSiErAr30= ->>>>>>> origin/r322 github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= github.com/grafana/prometheus-alertmanager v0.25.1-0.20240930132144-b5e64e81e8d3 h1:6D2gGAwyQBElSrp3E+9lSr7k8gLuP3Aiy20rweLWeBw= diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 6dcf9bc8728..ba6f5608f4f 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -109,16 +109,16 @@ func (m *Manager) deleteTracker(userID string) { } func (m *Manager) updateTracker(userID string) *Tracker { - t := m.Tracker(userID) - - if t == nil { + if !m.EnabledForUser(userID) { m.deleteTracker(userID) return nil } + t := m.Tracker(userID) + lbls := m.limits.CostAttributionLabels(userID) - newTrackedLabels := make([]string, 0, len(lbls)) + newTrackedLabels := make([]string, len(lbls)) copy(newTrackedLabels, lbls) // sort the labels to ensure the order is consistent diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 74233e3a686..3caceb37a7f 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -80,6 +80,7 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown 
time. logger: logger, overflowLabels: overflowLabels, totalFailedActiveSeries: atomic.NewFloat64(0), + cooldownUntil: 0, } tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", @@ -329,7 +330,7 @@ func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncreme func (t *Tracker) recoverFromOverflow(deadline int64) bool { t.observedMtx.RLock() - if t.cooldownUntil != 0 && t.cooldownUntil < deadline { + if t.cooldownUntil > 0 && t.cooldownUntil < deadline { if len(t.observed) <= t.maxCardinality { t.observedMtx.RUnlock() return true diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index cf4999a01d9..d4a6f0bca4c 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -1203,10 +1203,10 @@ func (t *Mimir) setupModuleManager() error { Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, - DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault, CostAttributionService}, - CostAttributionService: {API, Overrides}, + DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault, CostAttributionService}, + CostAttributionService: {API, Overrides}, Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, - IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, + IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, Querier: {TenantFederation, Vault}, @@ -1228,7 +1228,6 @@ func (t *Mimir) setupModuleManager() error { Read: {QueryFrontend, Querier}, Backend: {QueryScheduler, Ruler, StoreGateway, Compactor, AlertManager, OverridesExporter}, All: {QueryFrontend, Querier, Ingester, 
Distributor, StoreGateway, Ruler, Compactor}, ->>>>>>> origin/r322 } for mod, targets := range deps { if err := mm.AddDependency(mod, targets...); err != nil { diff --git a/vendor/modules.txt b/vendor/modules.txt index 1f67df92c28..cedb067c5ca 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1688,11 +1688,7 @@ sigs.k8s.io/kustomize/kyaml/yaml/walk sigs.k8s.io/yaml sigs.k8s.io/yaml/goyaml.v2 sigs.k8s.io/yaml/goyaml.v3 -<<<<<<< HEAD # github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241224134504-460b7be5bce8 -======= -# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241219104229-b50052711673 ->>>>>>> origin/r322 # github.com/hashicorp/memberlist => github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe # gopkg.in/yaml.v3 => github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 # github.com/grafana/regexp => github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b From 1ab1f004491008554952703204449eeb28c1740d Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 24 Dec 2024 17:15:51 +0100 Subject: [PATCH 040/105] rename function --- pkg/costattribution/manager.go | 2 +- pkg/costattribution/tracker.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index ba6f5608f4f..ffc4746c1f9 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -155,7 +155,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { t.cleanupTrackerAttribution(key) } - if t.recoverFromOverflow(deadline) { + if t.recoveredFromOverflow(deadline) { m.deleteTracker(userID) } } diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 3caceb37a7f..2b3e302715f 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -328,7 +328,7 @@ func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncreme } } -func 
(t *Tracker) recoverFromOverflow(deadline int64) bool { +func (t *Tracker) recoveredFromOverflow(deadline int64) bool { t.observedMtx.RLock() if t.cooldownUntil > 0 && t.cooldownUntil < deadline { if len(t.observed) <= t.maxCardinality { From 8111b6c1c1d8e34db9442f972d8beab5ae062867 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 25 Dec 2024 00:33:27 +0100 Subject: [PATCH 041/105] fix lint --- pkg/costattribution/tracker.go | 3 ++- pkg/costattribution/tracker_test.go | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 2b3e302715f..8dd61a864ec 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -10,10 +10,11 @@ import ( "time" "github.com/go-kit/log" - "github.com/grafana/mimir/pkg/mimirpb" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/model/labels" "go.uber.org/atomic" + + "github.com/grafana/mimir/pkg/mimirpb" ) type TrackerState int diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index bd5360f5552..3ad8b0a4dea 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -8,12 +8,13 @@ import ( "testing" "time" - "github.com/grafana/mimir/pkg/mimirpb" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/grafana/mimir/pkg/mimirpb" ) func TestTracker_hasSameLabels(t *testing.T) { From 17b64a911cdccb0a1d5c1b2fd6eb6837caf1321a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 26 Dec 2024 16:02:44 +0100 Subject: [PATCH 042/105] add unittest in active series --- pkg/costattribution/manager_test.go | 34 +--- pkg/costattribution/testutils/test_utils.go | 30 ++++ .../activeseries/active_series_test.go | 156 ++++++++++++++++++ 3 files changed, 191 
insertions(+), 29 deletions(-) create mode 100644 pkg/costattribution/testutils/test_utils.go diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index bf111790e9e..2acda2ffaeb 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -12,37 +12,13 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" + "github.com/grafana/mimir/pkg/costattribution/testutils" "github.com/grafana/mimir/pkg/mimirpb" - "github.com/grafana/mimir/pkg/util/validation" ) -func getMockLimits(idx int) (*validation.Overrides, error) { - baseLimits := map[string]*validation.Limits{ - "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, - "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, - "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, - "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, - } - - switch idx { - case 1: - baseLimits["user1"].CostAttributionLabels = []string{} - case 2: - baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} - case 3: - baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 - case 4: - baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 - case 5: - baseLimits["user1"].CostAttributionLabels = []string{"department"} - } - - return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) -} - func newTestManager() *Manager { logger := log.NewNopLogger() - limits, _ := getMockLimits(0) + limits, _ := testutils.GetMockCostAttributionLimits(0) reg := prometheus.NewRegistry() manager, err := NewManager(5*time.Second, time.Second, 10*time.Second, logger, limits, reg) if err != nil { @@ -105,7 +81,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { t.Run("Disabling user cost attribution", func(t 
*testing.T) { var err error - manager.limits, err = getMockLimits(1) + manager.limits, err = testutils.GetMockCostAttributionLimits(1) assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) @@ -120,7 +96,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { t.Run("Updating user cardinality and labels", func(t *testing.T) { var err error - manager.limits, err = getMockLimits(2) + manager.limits, err = testutils.GetMockCostAttributionLimits(2) assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) @@ -171,7 +147,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge after inactive timeout", func(t *testing.T) { // disable cost attribution for user1 to test purging - manager.limits, _ = getMockLimits(1) + manager.limits, _ = testutils.GetMockCostAttributionLimits(1) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix())) // User3's tracker should remain since it's active, user1's tracker should be removed diff --git a/pkg/costattribution/testutils/test_utils.go b/pkg/costattribution/testutils/test_utils.go new file mode 100644 index 00000000000..7ca86eb9154 --- /dev/null +++ b/pkg/costattribution/testutils/test_utils.go @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package testutils + +import "github.com/grafana/mimir/pkg/util/validation" + +func GetMockCostAttributionLimits(idx int) (*validation.Overrides, error) { + baseLimits := map[string]*validation.Limits{ + "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, + "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, + "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, + "user4": 
{MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, + "user5": {MaxCostAttributionCardinalityPerUser: 10, CostAttributionLabels: []string{"a"}}, + } + + switch idx { + case 1: + baseLimits["user1"].CostAttributionLabels = []string{} + case 2: + baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} + case 3: + baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 + case 4: + baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 + case 5: + baseLimits["user1"].CostAttributionLabels = []string{"department"} + } + + return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) +} diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index ca36450f823..a0c48e8d7fb 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -9,17 +9,25 @@ import ( "fmt" "math" "strconv" + "strings" "sync" "testing" "time" + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" + catestutils "github.com/grafana/mimir/pkg/costattribution/testutils" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -227,6 +235,154 @@ func TestActiveSeries_ContainsRef(t *testing.T) { } } +type mockIndex struct { + mock.Mock + tsdb.IndexReader + existingLabels map[storage.SeriesRef]labels.Labels +} + +func (m *mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error { + if ls, ok := 
m.existingLabels[ref]; ok { + builder.Assign(ls) + return nil + } + return fmt.Errorf("no labels found for ref %d", ref) +} + +func TestActiveSeries_UpdateSeries_WithCostAttribution(t *testing.T) { + limits, _ := catestutils.GetMockCostAttributionLimits(0) + reg := prometheus.NewRegistry() + manager, err := costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, log.NewNopLogger(), limits, reg) + require.NoError(t, err) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, manager.Tracker("user5")) + testCostAttributionUpdateSeries(t, c, reg) +} + +func testCostAttributionUpdateSeries(t *testing.T, c *ActiveSeries, reg *prometheus.Registry) { + ref1, ls1 := storage.SeriesRef(1), labels.FromStrings("a", "1") + ref2, ls2 := storage.SeriesRef(2), labels.FromStrings("a", "2") + ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") + ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") + ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") + ref6 := storage.SeriesRef(6) // same as ls2 + ref7, ls7 := storage.SeriesRef(7), labels.FromStrings("a", "2", "b", "1") + idx := mockIndex{existingLabels: map[storage.SeriesRef]labels.Labels{ref1: ls1, ref2: ls2, ref3: ls3, ref4: ls4, ref5: ls5, ref7: ls7}} + valid := c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls1, ref1, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics := ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls2, ref2, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls3, ref3, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // ref7 has the same cost attribution labels as ref2, but it's a different series. + c.UpdateSeries(ls7, ref7, time.Now(), -1, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 2 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls4, ref4, time.Now(), 3, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 2 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="4",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + c.UpdateSeries(ls5, ref5, time.Now(), 5, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{a="1",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="2",tenant="user5",tracker="cost-attribution"} 2 + cortex_ingester_attributed_active_series{a="3",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="4",tenant="user5",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{a="5",tenant="user5",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // changing a metric from float to histogram + c.UpdateSeries(ls3, ref3, time.Now(), 6, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // fewer (zero) buckets for a histogram + c.UpdateSeries(ls4, ref4, time.Now(), 0, &idx) + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // ref2 is deleted from the head, but still active. + c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ + chunks.HeadSeriesRef(ref2): ls2, + }) + // Numbers don't change. + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // Don't change after purging. + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // // ls2 is pushed again, this time with ref6 + c.UpdateSeries(ls2, ref6, time.Now(), -1, &idx) + // Numbers don't change. 
+ assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // Don't change after purging. + valid = c.Purge(time.Now(), &idx) + assert.True(t, valid) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + // Make sure deleted is empty, so we're not leaking. + assert.Empty(t, c.deleted.refs) + assert.Empty(t, c.deleted.keys) +} + func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) c := NewActiveSeries(asm, DefaultTimeout, nil) From a1910441078ae1d1f44c3081e104e1fa29d42a7a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 26 Dec 2024 19:52:12 +0100 Subject: [PATCH 043/105] copy slice instead --- pkg/costattribution/tracker.go | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 8dd61a864ec..705af9a4196 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -57,22 +57,23 @@ type Tracker struct { } func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *Tracker { - slices.Sort(trackedLabels) + orderedLables := slices.Clone(trackedLabels) + slices.Sort(orderedLables) // Create a map for fast lookup, and overflow labels to export when overflow happens - index := make(map[string]int, len(trackedLabels)) - overflowLabels := make([]string, len(trackedLabels)+2) - for i, label := range trackedLabels { + index := make(map[string]int, len(orderedLables)) + overflowLabels := make([]string, len(orderedLables)+2) + for i, label := range orderedLables { index[label] = i overflowLabels[i] = overflowValue } - overflowLabels[len(trackedLabels)] = userID - overflowLabels[len(trackedLabels)+1] = overflowValue + 
overflowLabels[len(orderedLables)] = userID + overflowLabels[len(orderedLables)+1] = overflowValue tracker := &Tracker{ userID: userID, - labels: trackedLabels, + labels: orderedLables, index: index, maxCardinality: limit, observed: make(map[string]*observation), @@ -84,18 +85,20 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. cooldownUntil: 0, } + variableLabels := slices.Clone(orderedLables) + variableLabels = append(variableLabels, tenantLabel, "reason") tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", "The total number of samples that were discarded per attribution.", - append(trackedLabels, tenantLabel, "reason"), + variableLabels, prometheus.Labels{trackerLabel: defaultTrackerName}) tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", "The total number of samples that were received per attribution.", - append(trackedLabels, tenantLabel), + variableLabels[:len(variableLabels)-1], prometheus.Labels{trackerLabel: defaultTrackerName}) tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", - "The total number of active series per user and attribution.", append(trackedLabels, tenantLabel), + "The total number of active series per user and attribution.", variableLabels[:len(variableLabels)-1], prometheus.Labels{trackerLabel: defaultTrackerName}) tracker.failedActiveSeriesDecrement = prometheus.NewDesc("cortex_ingester_attributed_active_series_failure", "The total number of failed active series decrement per user and tracker.", []string{tenantLabel}, From 2bb1845fbf4bf1b8ca7d2518fe199149b76b87e2 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 26 Dec 2024 21:33:05 +0100 Subject: [PATCH 044/105] add test for discarded samples --- pkg/costattribution/manager_test.go | 8 +-- pkg/costattribution/testutils/test_utils.go | 9 ++- pkg/costattribution/tracker.go | 7 +- 
pkg/distributor/validate_test.go | 64 +++++++++++++------ .../activeseries/active_series_test.go | 4 +- 5 files changed, 64 insertions(+), 28 deletions(-) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 2acda2ffaeb..81a94a674c9 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -18,7 +18,7 @@ import ( func newTestManager() *Manager { logger := log.NewNopLogger() - limits, _ := testutils.GetMockCostAttributionLimits(0) + limits, _ := testutils.NewMockCostAttributionLimits(0) reg := prometheus.NewRegistry() manager, err := NewManager(5*time.Second, time.Second, 10*time.Second, logger, limits, reg) if err != nil { @@ -81,7 +81,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { t.Run("Disabling user cost attribution", func(t *testing.T) { var err error - manager.limits, err = testutils.GetMockCostAttributionLimits(1) + manager.limits, err = testutils.NewMockCostAttributionLimits(1) assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) @@ -96,7 +96,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { t.Run("Updating user cardinality and labels", func(t *testing.T) { var err error - manager.limits, err = testutils.GetMockCostAttributionLimits(2) + manager.limits, err = testutils.NewMockCostAttributionLimits(2) assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) @@ -147,7 +147,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge after inactive timeout", func(t *testing.T) { // disable cost attribution for user1 to test purging - manager.limits, _ = testutils.GetMockCostAttributionLimits(1) + manager.limits, _ = testutils.NewMockCostAttributionLimits(1) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix())) // User3's 
tracker should remain since it's active, user1's tracker should be removed diff --git a/pkg/costattribution/testutils/test_utils.go b/pkg/costattribution/testutils/test_utils.go index 7ca86eb9154..f79f4861cd9 100644 --- a/pkg/costattribution/testutils/test_utils.go +++ b/pkg/costattribution/testutils/test_utils.go @@ -4,7 +4,7 @@ package testutils import "github.com/grafana/mimir/pkg/util/validation" -func GetMockCostAttributionLimits(idx int) (*validation.Overrides, error) { +func NewMockCostAttributionLimits(idx int, lvs ...string) (*validation.Overrides, error) { baseLimits := map[string]*validation.Limits{ "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, @@ -12,7 +12,12 @@ func GetMockCostAttributionLimits(idx int) (*validation.Overrides, error) { "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, "user5": {MaxCostAttributionCardinalityPerUser: 10, CostAttributionLabels: []string{"a"}}, } - + if len(lvs) > 0 { + baseLimits[lvs[0]] = &validation.Limits{ + MaxCostAttributionCardinalityPerUser: 10, + CostAttributionLabels: lvs[1:], + } + } switch idx { case 1: baseLimits["user1"].CostAttributionLabels = []string{} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 705af9a4196..14e66b3a424 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -269,7 +269,12 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } if discardedSampleIncrement > 0 && reason != nil { o.discardedSampleMtx.Lock() - o.discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) + if v, ok := o.discardedSample[*reason]; ok { + v.Add(discardedSampleIncrement) + o.discardedSample[*reason] = v + } else { + o.discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) + } o.discardedSampleMtx.Unlock() } } 
else if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 9707b89f378..d0539a9d45b 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -14,6 +14,7 @@ import ( "time" "unicode/utf8" + "github.com/go-kit/log" "github.com/gogo/protobuf/proto" "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/httpgrpc" @@ -25,6 +26,8 @@ import ( grpcstatus "google.golang.org/grpc/status" golangproto "google.golang.org/protobuf/proto" + "github.com/grafana/mimir/pkg/costattribution" + catestutils "github.com/grafana/mimir/pkg/costattribution/testutils" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/validation" ) @@ -75,8 +78,13 @@ func TestValidateLabels(t *testing.T) { cfg.maxLabelValueLength = 25 cfg.maxLabelNameLength = 25 - cfg.maxLabelNamesPerSeries = 2 - cfg.maxLabelNamesPerInfoSeries = 3 + cfg.maxLabelNamesPerSeries = 3 + cfg.maxLabelNamesPerInfoSeries = 4 + limits, _ := catestutils.NewMockCostAttributionLimits(0, userID, "team") + careg := prometheus.NewRegistry() + manager, err := costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, log.NewNopLogger(), limits, careg) + require.NoError(t, err) + cat := manager.Tracker(userID) for _, c := range []struct { metric model.Metric @@ -85,25 +93,25 @@ func TestValidateLabels(t *testing.T) { err error }{ { - metric: map[model.LabelName]model.LabelValue{}, + metric: map[model.LabelName]model.LabelValue{"team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: errors.New(noMetricNameMsgFormat), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: " "}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: " ", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf(invalidMetricNameMsgFormat, " "), }, { - metric: 
map[model.LabelName]model.LabelValue{model.MetricNameLabel: "metric_name_with_\xb0_invalid_utf8_\xb0"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "metric_name_with_\xb0_invalid_utf8_\xb0", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf(invalidMetricNameMsgFormat, "metric_name_with__invalid_utf8_ (non-ascii characters removed)"), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid", "foo ": "bar"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid", "foo ": "bar", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -113,18 +121,19 @@ func TestValidateLabels(t *testing.T) { []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "valid"}, {Name: "foo ", Value: "bar"}, + {Name: "team", Value: "a"}, }, ), ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "valid", "team": "c"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelName", "this_is_a_really_really_long_name_that_should_cause_an_error": "test_value_please_ignore"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelName", "this_is_a_really_really_long_name_that_should_cause_an_error": "test_value_please_ignore", "team": "biz"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -132,6 +141,7 @@ func TestValidateLabels(t *testing.T) { "this_is_a_really_really_long_name_that_should_cause_an_error", mimirpb.FromLabelAdaptersToString( []mimirpb.LabelAdapter{ + {Name: "team", Value: "biz"}, {Name: model.MetricNameLabel, Value: "badLabelName"}, {Name: "this_is_a_really_really_long_name_that_should_cause_an_error", Value: "test_value_please_ignore"}, }, @@ 
-139,7 +149,7 @@ func TestValidateLabels(t *testing.T) { ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelValue", "much_shorter_name": "test_value_please_ignore_no_really_nothing_to_see_here"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "badLabelValue", "much_shorter_name": "test_value_please_ignore_no_really_nothing_to_see_here", "team": "biz"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -150,12 +160,13 @@ func TestValidateLabels(t *testing.T) { []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "badLabelValue"}, {Name: "much_shorter_name", Value: "test_value_please_ignore_no_really_nothing_to_see_here"}, + {Name: "team", Value: "biz"}, }, ), ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop", "team": "plof"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -165,21 +176,22 @@ func TestValidateLabels(t *testing.T) { {Name: model.MetricNameLabel, Value: "foo"}, {Name: "bar", Value: "baz"}, {Name: "blip", Value: "blop"}, + {Name: "team", Value: "plof"}, }, - 2, + 3, )..., ), }, { // *_info metrics have higher label limits. - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: nil, }, { // *_info metrics have higher label limits. 
- metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "blap": "blup"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "blap": "blup", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -190,31 +202,32 @@ func TestValidateLabels(t *testing.T) { {Name: "bar", Value: "baz"}, {Name: "blip", Value: "blop"}, {Name: "blap", Value: "blup"}, + {Name: "team", Value: "a"}, }, - 3, + 4, )..., ), }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop", "team": "a"}, skipLabelNameValidation: false, skipLabelCountValidation: true, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "invalid%label&name": "bar"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "invalid%label&name": "bar", "team": "biz"}, skipLabelNameValidation: true, skipLabelCountValidation: false, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "你好"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "你好", "team": "plof"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: nil, }, { - metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "abc\xfe\xfddef"}, + metric: map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "label1": "abc\xfe\xfddef", "team": "plof"}, skipLabelNameValidation: false, skipLabelCountValidation: false, err: fmt.Errorf( @@ -229,7 +242,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, 
c.skipLabelCountValidation, nil, ts) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, cat, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -250,6 +263,19 @@ func TestValidateLabels(t *testing.T) { cortex_discarded_samples_total{group="custom label",reason="random reason",user="different user"} 1 `), "cortex_discarded_samples_total")) + require.NoError(t, testutil.GatherAndCompare(careg, strings.NewReader(` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="label_invalid",team="a",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="label_name_too_long",team="biz",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="label_value_invalid",team="plof",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="label_value_too_long",team="biz",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="max_label_names_per_info_series",team="a",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="max_label_names_per_series",team="plof",tenant="testUser",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{reason="metric_name_invalid",team="a",tenant="testUser",tracker="cost-attribution"} 2 + cortex_discarded_attributed_samples_total{reason="missing_metric_name",team="a",tenant="testUser",tracker="cost-attribution"} 1 +`), "cortex_discarded_attributed_samples_total")) + s.deleteUserMetrics(userID) require.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` diff --git a/pkg/ingester/activeseries/active_series_test.go 
b/pkg/ingester/activeseries/active_series_test.go index a0c48e8d7fb..d4e16150b38 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -241,7 +241,7 @@ type mockIndex struct { existingLabels map[storage.SeriesRef]labels.Labels } -func (m *mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error { +func (m *mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, _ *[]chunks.Meta) error { if ls, ok := m.existingLabels[ref]; ok { builder.Assign(ls) return nil @@ -250,7 +250,7 @@ func (m *mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder } func TestActiveSeries_UpdateSeries_WithCostAttribution(t *testing.T) { - limits, _ := catestutils.GetMockCostAttributionLimits(0) + limits, _ := catestutils.NewMockCostAttributionLimits(0) reg := prometheus.NewRegistry() manager, err := costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, log.NewNopLogger(), limits, reg) require.NoError(t, err) From ddd507de8d548cd93d264a88d21a3d06c6abb43a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 27 Dec 2024 12:22:42 +0100 Subject: [PATCH 045/105] change small map to slice since it is quicker --- pkg/costattribution/tracker.go | 36 +++++++++++++++------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 14e66b3a424..5072d00ce38 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -38,7 +38,6 @@ type observation struct { type Tracker struct { userID string labels []string - index map[string]int maxCardinality int activeSeriesPerUserAttribution *prometheus.Desc receivedSamplesAttribution *prometheus.Desc @@ -60,11 +59,9 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
orderedLables := slices.Clone(trackedLabels) slices.Sort(orderedLables) - // Create a map for fast lookup, and overflow labels to export when overflow happens - index := make(map[string]int, len(orderedLables)) + // Create a map for overflow labels to export when overflow happens overflowLabels := make([]string, len(orderedLables)+2) - for i, label := range orderedLables { - index[label] = i + for i := range orderedLables { overflowLabels[i] = overflowValue } @@ -74,7 +71,6 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. tracker := &Tracker{ userID: userID, labels: orderedLables, - index: index, maxCardinality: limit, observed: make(map[string]*observation), hashBuffer: make([]byte, 0, 1024), @@ -195,9 +191,15 @@ func (t *Tracker) IncrementActiveSeriesFailure() { func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { extractValues := func() []string { labelValues := make([]string, len(t.labels)) - for _, l := range lbls { - if idx, ok := t.index[l.Name]; ok { - labelValues[idx] = l.Value + for idx, cal := range t.labels { + for _, l := range lbls { + if l.Name == cal { + labelValues[idx] = l.Value + break + } + } + if labelValues[idx] == "" { + labelValues[idx] = missingValue } } return labelValues @@ -208,11 +210,12 @@ func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { extractValues := func() []string { labelValues := make([]string, len(t.labels)) - lbls.Range(func(l labels.Label) { - if idx, ok := t.index[l.Name]; ok { - labelValues[idx] = l.Value + for idx, cal := range t.labels { + labelValues[idx] = lbls.Get(cal) + if labelValues[idx] == "" { + 
labelValues[idx] = missingValue } - }) + } return labelValues } t.updateCountersCommon(extractValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) @@ -228,13 +231,6 @@ func (t *Tracker) updateCountersCommon( // Extract label values labelValues := extractValues() - // Fill missing label values - for i := 0; i < len(labelValues); i++ { - if labelValues[i] == "" { - labelValues[i] = missingValue - } - } - // Reuse buffer from pool for building the observation key buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() From b27e379ccc711b6b194f69db7a5d1a0c82452fe3 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 27 Dec 2024 12:31:51 +0100 Subject: [PATCH 046/105] remove unused parameter --- pkg/costattribution/manager.go | 26 +++++++++---------- pkg/costattribution/manager_test.go | 2 +- pkg/distributor/distributor_test.go | 2 +- pkg/distributor/validate_test.go | 2 +- .../activeseries/active_series_test.go | 2 +- pkg/ingester/ingester_test.go | 2 +- pkg/mimir/modules.go | 2 +- 7 files changed, 18 insertions(+), 20 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index ffc4746c1f9..b144e90b52c 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -29,23 +29,21 @@ type Manager struct { inactiveTimeout time.Duration limits *validation.Overrides - mtx sync.RWMutex - trackersByUserID map[string]*Tracker - reg *prometheus.Registry - cleanupInterval time.Duration - metricsExportInterval time.Duration + mtx sync.RWMutex + trackersByUserID map[string]*Tracker + reg *prometheus.Registry + cleanupInterval time.Duration } -func NewManager(cleanupInterval, exportInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) { +func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) 
{ m := &Manager{ - trackersByUserID: make(map[string]*Tracker), - limits: limits, - mtx: sync.RWMutex{}, - inactiveTimeout: inactiveTimeout, - logger: logger, - reg: reg, - cleanupInterval: cleanupInterval, - metricsExportInterval: exportInterval, + trackersByUserID: make(map[string]*Tracker), + limits: limits, + mtx: sync.RWMutex{}, + inactiveTimeout: inactiveTimeout, + logger: logger, + reg: reg, + cleanupInterval: cleanupInterval, } m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager") diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 81a94a674c9..d7654951e5b 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -20,7 +20,7 @@ func newTestManager() *Manager { logger := log.NewNopLogger() limits, _ := testutils.NewMockCostAttributionLimits(0) reg := prometheus.NewRegistry() - manager, err := NewManager(5*time.Second, time.Second, 10*time.Second, logger, limits, reg) + manager, err := NewManager(5*time.Second, 10*time.Second, logger, limits, reg) if err != nil { panic(err) } diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 9119158ea70..163ecf09721 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -2365,7 +2365,7 @@ func BenchmarkDistributor_Push(b *testing.B) { // Initialize the cost attribution manager var cam *costattribution.Manager if caCase.customRegistry != nil { - cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + cam, err = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) require.NoError(b, err) } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index d0539a9d45b..47af497ca64 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -82,7 +82,7 @@ 
func TestValidateLabels(t *testing.T) { cfg.maxLabelNamesPerInfoSeries = 4 limits, _ := catestutils.NewMockCostAttributionLimits(0, userID, "team") careg := prometheus.NewRegistry() - manager, err := costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, log.NewNopLogger(), limits, careg) + manager, err := costattribution.NewManager(5*time.Second, 10*time.Second, log.NewNopLogger(), limits, careg) require.NoError(t, err) cat := manager.Tracker(userID) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index d4e16150b38..690b6b6c71c 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -252,7 +252,7 @@ func (m *mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder func TestActiveSeries_UpdateSeries_WithCostAttribution(t *testing.T) { limits, _ := catestutils.NewMockCostAttributionLimits(0) reg := prometheus.NewRegistry() - manager, err := costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, log.NewNopLogger(), limits, reg) + manager, err := costattribution.NewManager(5*time.Second, 10*time.Second, log.NewNopLogger(), limits, reg) require.NoError(t, err) c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, manager.Tracker("user5")) testCostAttributionUpdateSeries(t, c, reg) diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 6f03a3b7ec7..829a6293c8a 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -3645,7 +3645,7 @@ func BenchmarkIngesterPush(b *testing.B) { var cam *costattribution.Manager if caCase.customRegistry != nil { - cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + cam, err = costattribution.NewManager(5*time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) require.NoError(b, err) } diff --git a/pkg/mimir/modules.go 
b/pkg/mimir/modules.go index d4a6f0bca4c..11a12d0c830 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -654,7 +654,7 @@ func (t *Mimir) initCostAttributionService() (services.Service, error) { if t.Cfg.CostAttributionRegistryPath != "" { reg := prometheus.NewRegistry() var err error - t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) t.API.RegisterCostAttribution(t.Cfg.CostAttributionRegistryPath, reg) return t.CostAttributionManager, err } From a79fac787a5685c1202c3f0fa7b5d6897227ef9f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 27 Dec 2024 15:58:56 +0100 Subject: [PATCH 047/105] add new parameter --- cmd/mimir/config-descriptor.json | 11 +++++++++++ cmd/mimir/help-all.txt.tmpl | 2 ++ .../mimir/configure/configuration-parameters/index.md | 5 +++++ pkg/mimir/mimir.go | 2 ++ pkg/mimir/modules.go | 2 +- 5 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 4a6ec05c420..f2fc6ad3f2a 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -19705,6 +19705,17 @@ "fieldFlag": "cost-attribution.registry-path", "fieldType": "string", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cleanup_interval", + "required": false, + "desc": "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.", + "fieldValue": null, + "fieldDefaultValue": 180000000000, + "fieldFlag": "cleanup-interval", + "fieldType": "duration", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 410b0ead6f1..3ab952fbd74 100644 --- 
a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -971,6 +971,8 @@ Usage of ./cmd/mimir/mimir: Maximum number of CPUs that can simultaneously processes WAL replay. If it is set to 0, then each TSDB is replayed with a concurrency equal to the number of CPU cores available on the machine. -blocks-storage.tsdb.wal-segment-size-bytes int TSDB WAL segments files max size (bytes). (default 134217728) + -cleanup-interval duration + [experimental] Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged. (default 3m0s) -common.storage.azure.account-key string Azure storage account key. If unset, Azure managed identities will be used for authentication instead. -common.storage.azure.account-name string diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 0b6903162e7..ac4c01aafe3 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -467,6 +467,11 @@ overrides_exporter: # cost attribution metrics aren't exposed. # CLI flag: -cost-attribution.registry-path [cost_attribution_registry_path: | default = ""] + +# (experimental) Time interval at which the cost attribution cleanup process +# runs, ensuring inactive cost attribution entries are purged. 
+# CLI flag: -cleanup-interval +[cost_attribution_cleanup_interval: | default = 3m] ``` ### common diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 2724fdff52b..9558069af5b 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -152,6 +152,7 @@ type Config struct { CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` CostAttributionRegistryPath string `yaml:"cost_attribution_registry_path" category:"experimental"` + CostAttributionCleanupInterval time.Duration `yaml:"cost_attribution_cleanup_interval" category:"experimental"` } // RegisterFlags registers flags. @@ -179,6 +180,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. 
If not specified, cost attribution metrics aren't exposed.") f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") + f.DurationVar(&c.CostAttributionCleanupInterval, "cleanup-interval", 3*time.Minute, "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 11a12d0c830..0c8b1479802 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -654,7 +654,7 @@ func (t *Mimir) initCostAttributionService() (services.Service, error) { if t.Cfg.CostAttributionRegistryPath != "" { reg := prometheus.NewRegistry() var err error - t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.CostAttributionManager, err = costattribution.NewManager(t.Cfg.CostAttributionCleanupInterval, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) t.API.RegisterCostAttribution(t.Cfg.CostAttributionRegistryPath, reg) return t.CostAttributionManager, err } From 37901b795add7d61ace140cdeec434e42080617f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 27 Dec 2024 16:05:00 +0100 Subject: [PATCH 048/105] update config file --- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 4 ++-- docs/sources/mimir/configure/about-versioning.md | 2 ++ .../sources/mimir/configure/configuration-parameters/index.md | 2 +- pkg/mimir/mimir.go | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index f2fc6ad3f2a..199aaecdf44 100644 --- a/cmd/mimir/config-descriptor.json +++ 
b/cmd/mimir/config-descriptor.json @@ -19713,7 +19713,7 @@ "desc": "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.", "fieldValue": null, "fieldDefaultValue": 180000000000, - "fieldFlag": "cleanup-interval", + "fieldFlag": "cost-attribution.cleanup-interval", "fieldType": "duration", "fieldCategory": "experimental" } diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 3ab952fbd74..7fdd772c442 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -971,8 +971,6 @@ Usage of ./cmd/mimir/mimir: Maximum number of CPUs that can simultaneously processes WAL replay. If it is set to 0, then each TSDB is replayed with a concurrency equal to the number of CPU cores available on the machine. -blocks-storage.tsdb.wal-segment-size-bytes int TSDB WAL segments files max size (bytes). (default 134217728) - -cleanup-interval duration - [experimental] Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged. (default 3m0s) -common.storage.azure.account-key string Azure storage account key. If unset, Azure managed identities will be used for authentication instead. -common.storage.azure.account-name string @@ -1285,6 +1283,8 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution.cleanup-interval duration + [experimental] Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged. (default 3m0s) -cost-attribution.eviction-interval duration [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. 
(default 20m0s) -cost-attribution.registry-path string diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 05751699ead..5219b71118a 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -57,6 +57,8 @@ The following features are currently experimental: - `-cost-attribution.eviction-interval` - Configure the metrics endpoint dedicated to cost attribution - `-cost-attribution.registry-path` + - Configure the cost attribution cleanup process run interval + - `-cost-attribution.cleanup-interval` - Alertmanager - Enable a set of experimental API endpoints to help support the migration of the Grafana Alertmanager to the Mimir Alertmanager. - `-alertmanager.grafana-alertmanager-compatibility-enabled` diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index ac4c01aafe3..2a982ccd36e 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -470,7 +470,7 @@ overrides_exporter: # (experimental) Time interval at which the cost attribution cleanup process # runs, ensuring inactive cost attribution entries are purged. -# CLI flag: -cleanup-interval +# CLI flag: -cost-attribution.cleanup-interval [cost_attribution_cleanup_interval: | default = 3m] ``` diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 9558069af5b..540d0f7bbdd 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -180,7 +180,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. 
When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed.") f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") - f.DurationVar(&c.CostAttributionCleanupInterval, "cleanup-interval", 3*time.Minute, "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.") + f.DurationVar(&c.CostAttributionCleanupInterval, "cost-attribution.cleanup-interval", 3*time.Minute, "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) From f7115f4fc719953ae1457620de0287e5567a6108 Mon Sep 17 00:00:00 2001 From: Ying WANG <74549700+ying-jeanne@users.noreply.github.com> Date: Fri, 27 Dec 2024 16:21:36 +0100 Subject: [PATCH 049/105] Update pkg/costattribution/manager.go Co-authored-by: Oleg Zaytsev --- pkg/costattribution/manager.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index b144e90b52c..72c7a94c268 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -154,6 +154,10 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { } if t.recoveredFromOverflow(deadline) { + // We delete the current tracker here, + // this will cause the creation of a new one later. + // ActiveSeries tracker compares the pointer of the tracker, + // and this change will cause a reload there. 
m.deleteTracker(userID) } } From 679f2cc02d02ec43420278880d8371f699e5795b Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 10:58:15 +0100 Subject: [PATCH 050/105] take config before locking tracker map --- pkg/costattribution/manager.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 72c7a94c268..19cd63b817a 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -57,7 +57,7 @@ func (m *Manager) iteration(_ context.Context) error { return m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix()) } -func (m *Manager) EnabledForUser(userID string) bool { +func (m *Manager) enabledForUser(userID string) bool { if m == nil { return false } @@ -65,7 +65,7 @@ func (m *Manager) EnabledForUser(userID string) bool { } func (m *Manager) Tracker(userID string) *Tracker { - if !m.EnabledForUser(userID) { + if !m.enabledForUser(userID) { return nil } @@ -77,12 +77,17 @@ func (m *Manager) Tracker(userID string) *Tracker { return tracker } + // We need to create a new tracker, get all the necessary information from the limits before locking and creating the tracker. 
+ labels := m.limits.CostAttributionLabels(userID) + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + cooldownDuration := m.limits.CostAttributionCooldown(userID) + m.mtx.Lock() defer m.mtx.Unlock() if tracker, exists = m.trackersByUserID[userID]; exists { return tracker } - tracker = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + tracker = newTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) m.trackersByUserID[userID] = tracker return tracker } @@ -107,7 +112,7 @@ func (m *Manager) deleteTracker(userID string) { } func (m *Manager) updateTracker(userID string) *Tracker { - if !m.EnabledForUser(userID) { + if !m.enabledForUser(userID) { m.deleteTracker(userID) return nil } From 66accc9512287f763306180e478d5d90a301c074 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 11:14:06 +0100 Subject: [PATCH 051/105] simplify logics --- pkg/costattribution/manager.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 19cd63b817a..e5e2f0127b0 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -118,19 +118,15 @@ func (m *Manager) updateTracker(userID string) *Tracker { } t := m.Tracker(userID) - - lbls := m.limits.CostAttributionLabels(userID) - - newTrackedLabels := make([]string, len(lbls)) - copy(newTrackedLabels, lbls) + lbls := slices.Clone(m.limits.CostAttributionLabels(userID)) // sort the labels to ensure the order is consistent - slices.Sort(newTrackedLabels) + slices.Sort(lbls) // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker - if !t.hasSameLabels(newTrackedLabels) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != 
int64(m.limits.CostAttributionCooldown(userID).Seconds()) { + if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != m.limits.CostAttributionCooldown(userID) { m.mtx.Lock() - t = newTracker(userID, newTrackedLabels, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + t = newTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t m.mtx.Unlock() return t From f4a4efde884013a8e52669a12fbd1ff16c62e9f6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 11:31:33 +0100 Subject: [PATCH 052/105] remove useless initialization --- pkg/costattribution/manager.go | 2 +- pkg/costattribution/tracker.go | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index e5e2f0127b0..bc0cbc71a4e 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -124,7 +124,7 @@ func (m *Manager) updateTracker(userID string) *Tracker { slices.Sort(lbls) // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker - if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != m.limits.CostAttributionCooldown(userID) { + if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != int64(m.limits.CostAttributionCooldown(userID).Seconds()) { m.mtx.Lock() t = newTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 5072d00ce38..d8bcf0a5500 100644 --- a/pkg/costattribution/tracker.go +++ 
b/pkg/costattribution/tracker.go @@ -288,10 +288,7 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc t.state = Overflow // Initialize the overflow counter. t.overflowCounter = &observation{ - lastUpdate: atomic.NewInt64(ts), - activeSerie: *atomic.NewFloat64(0), - receivedSample: *atomic.NewFloat64(0), - totalDiscarded: *atomic.NewFloat64(0), + lastUpdate: atomic.NewInt64(ts), } // Aggregate active series from all keys into the overflow counter. From f90ac0ef07d184a1d08e1ca0496a349b457f14fc Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 13:18:09 +0100 Subject: [PATCH 053/105] change int64 to time.x --- pkg/costattribution/manager.go | 6 ++--- pkg/costattribution/manager_test.go | 12 ++++----- pkg/costattribution/tracker.go | 39 ++++++++++++++--------------- pkg/costattribution/tracker_test.go | 30 +++++++++++----------- 4 files changed, 44 insertions(+), 43 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index bc0cbc71a4e..41b5f578f70 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -54,7 +54,7 @@ func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logge } func (m *Manager) iteration(_ context.Context) error { - return m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix()) + return m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout)) } func (m *Manager) enabledForUser(userID string) bool { @@ -124,7 +124,7 @@ func (m *Manager) updateTracker(userID string) *Tracker { slices.Sort(lbls) // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker - if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != int64(m.limits.CostAttributionCooldown(userID).Seconds()) { + if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) 
|| t.cooldownDuration != m.limits.CostAttributionCooldown(userID) { m.mtx.Lock() t = newTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) m.trackersByUserID[userID] = t @@ -135,7 +135,7 @@ func (m *Manager) updateTracker(userID string) *Tracker { return t } -func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error { +func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { m.mtx.RLock() userIDs := make([]string, 0, len(m.trackersByUserID)) for userID := range m.trackersByUserID { diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index d7654951e5b..d7b4ebd7e27 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -69,7 +69,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { }) t.Run("Purge inactive attributions", func(t *testing.T) { - err := manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + err := manager.purgeInactiveAttributionsUntil(time.Unix(10, 0)) assert.NoError(t, err) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
@@ -83,7 +83,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { var err error manager.limits, err = testutils.NewMockCostAttributionLimits(1) assert.NoError(t, err) - assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix())) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0))) assert.Equal(t, 1, len(manager.trackersByUserID)) expectedMetrics := ` @@ -98,7 +98,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { var err error manager.limits, err = testutils.NewMockCostAttributionLimits(2) assert.NoError(t, err) - assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0))) assert.Equal(t, 1, len(manager.trackersByUserID)) assert.True(t, manager.Tracker("user3").hasSameLabels([]string{"feature", "team"})) @@ -133,7 +133,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}}, 1, "out-of-window", time.Unix(10, 0)) t.Run("Purge before inactive timeout", func(t *testing.T) { - assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix())) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0))) assert.Equal(t, 2, len(manager.trackersByUserID)) expectedMetrics := ` @@ -148,7 +148,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge after inactive timeout", func(t *testing.T) { // disable cost attribution for user1 to test purging manager.limits, _ = testutils.NewMockCostAttributionLimits(1) - assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix())) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) // User3's tracker should remain since it's active, user1's tracker should be removed assert.Equal(t, 1, len(manager.trackersByUserID), "Expected 
one active tracker after purging") @@ -164,7 +164,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge all trackers", func(t *testing.T) { // Trigger a purge that should remove all inactive trackers - assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix())) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(20, 0))) // Tracker would stay at 1 since user1's tracker is disabled assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index d8bcf0a5500..ce3b3f4839a 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -46,12 +46,12 @@ type Tracker struct { overflowLabels []string observed map[string]*observation observedMtx sync.RWMutex - cooldownUntil int64 hashBuffer []byte state TrackerState overflowCounter *observation totalFailedActiveSeries *atomic.Float64 - cooldownDuration int64 + cooldownDuration time.Duration + cooldownUntil time.Time logger log.Logger } @@ -74,11 +74,10 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
maxCardinality: limit, observed: make(map[string]*observation), hashBuffer: make([]byte, 0, 1024), - cooldownDuration: int64(cooldown.Seconds()), + cooldownDuration: cooldown, logger: logger, overflowLabels: overflowLabels, totalFailedActiveSeries: atomic.NewFloat64(0), - cooldownUntil: 0, } variableLabels := slices.Clone(orderedLables) @@ -122,14 +121,14 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil, true) + t.updateCounters(lbs, now, 1, 0, 0, nil, true) } func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { if t == nil { return } - t.updateCounters(lbs, -1, -1, 0, 0, nil, false) + t.updateCounters(lbs, time.Time{}, -1, 0, 0, nil, false) } func (t *Tracker) Collect(out chan<- prometheus.Metric) { @@ -171,14 +170,14 @@ func (t *Tracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value fl if t == nil { return } - t.updateCountersWithLabelAdapter(lbs, now.Unix(), 0, 0, value, &reason, true) + t.updateCountersWithLabelAdapter(lbs, now, 0, 0, value, &reason, true) } func (t *Tracker) IncrementReceivedSamples(lbs []mimirpb.LabelAdapter, value float64, now time.Time) { if t == nil { return } - t.updateCountersWithLabelAdapter(lbs, now.Unix(), 0, value, 0, nil, true) + t.updateCountersWithLabelAdapter(lbs, now, 0, value, 0, nil, true) } func (t *Tracker) IncrementActiveSeriesFailure() { @@ -188,7 +187,7 @@ func (t *Tracker) IncrementActiveSeriesFailure() { t.totalFailedActiveSeries.Add(1) } -func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { +func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { extractValues := func() []string 
{ labelValues := make([]string, len(t.labels)) for idx, cal := range t.labels { @@ -207,7 +206,7 @@ func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts t.updateCountersCommon(extractValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) } -func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { +func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { extractValues := func() []string { labelValues := make([]string, len(t.labels)) for idx, cal := range t.labels { @@ -223,7 +222,7 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncre func (t *Tracker) updateCountersCommon( extractValues func() []string, - ts int64, + ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool, @@ -249,7 +248,7 @@ func (t *Tracker) updateCountersCommon( defer t.observedMtx.Unlock() // Update observations and state - t.updateObservations(buf.Bytes(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateObservations(buf.Bytes(), ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } @@ -281,14 +280,14 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } // updateState checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. 
-func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { +func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { // Transition to overflow mode if maximum cardinality is exceeded. previousState := t.state if t.state == Normal && len(t.observed) > t.maxCardinality { t.state = Overflow // Initialize the overflow counter. t.overflowCounter = &observation{ - lastUpdate: atomic.NewInt64(ts), + lastUpdate: atomic.NewInt64(ts.Unix()), } // Aggregate active series from all keys into the overflow counter. @@ -297,7 +296,7 @@ func (t *Tracker) updateState(ts int64, activeSeriesIncrement, receivedSampleInc t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) } } - t.cooldownUntil = ts + t.cooldownDuration + t.cooldownUntil = ts.Add(t.cooldownDuration) } if t.state == Overflow { @@ -330,9 +329,9 @@ func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncreme } } -func (t *Tracker) recoveredFromOverflow(deadline int64) bool { +func (t *Tracker) recoveredFromOverflow(deadline time.Time) bool { t.observedMtx.RLock() - if t.cooldownUntil > 0 && t.cooldownUntil < deadline { + if !t.cooldownUntil.IsZero() && t.cooldownUntil.Before(deadline) { if len(t.observed) <= t.maxCardinality { t.observedMtx.RUnlock() return true @@ -345,7 +344,7 @@ func (t *Tracker) recoveredFromOverflow(deadline int64) bool { t.observedMtx.Unlock() return true } - t.cooldownUntil = deadline + t.cooldownDuration + t.cooldownUntil = deadline.Add(t.cooldownDuration) t.observedMtx.Unlock() } else { t.observedMtx.RUnlock() @@ -353,13 +352,13 @@ func (t *Tracker) recoveredFromOverflow(deadline int64) bool { return false } -func (t *Tracker) inactiveObservations(deadline int64) []string { +func (t *Tracker) inactiveObservations(deadline time.Time) []string { // otherwise, we need to check all observations and clean up the ones that are inactive var 
invalidKeys []string t.observedMtx.RLock() defer t.observedMtx.RUnlock() for labkey, ob := range t.observed { - if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { + if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline.Unix() { invalidKeys = append(invalidKeys, labkey) } } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 3ad8b0a4dea..f9457ceb062 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -61,8 +61,8 @@ func TestTracker_CreateDelete(t *testing.T) { "cortex_ingester_attributed_active_series_failure", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) - assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(5)) - assert.NoError(t, tManager.purgeInactiveAttributionsUntil(5)) + assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(time.Unix(5, 0))) + assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) expectedMetrics = ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
@@ -83,19 +83,19 @@ func TestTracker_updateCounters(t *testing.T) { lbls2 := labels.FromStrings("department", "bar", "service", "baz") lbls3 := labels.FromStrings("department", "baz", "service", "foo") - tracker.updateCounters(lbls1, 1, 1, 0, 0, nil, true) + tracker.updateCounters(lbls1, time.Unix(1, 0), 1, 0, 0, nil, true) assert.Equal(t, Normal, tracker.state, "First observation, should not overflow") - tracker.updateCounters(lbls2, 2, 1, 0, 0, nil, true) + tracker.updateCounters(lbls2, time.Unix(2, 0), 1, 0, 0, nil, true) assert.Equal(t, Normal, tracker.state, "Second observation, should not overflow") - tracker.updateCounters(lbls3, 3, 1, 0, 0, nil, true) + tracker.updateCounters(lbls3, time.Unix(3, 0), 1, 0, 0, nil, true) assert.Equal(t, Overflow, tracker.state, "Third observation, should overflow") - tracker.updateCounters(lbls3, 4, 1, 0, 0, nil, true) + tracker.updateCounters(lbls3, time.Unix(4, 0), 1, 0, 0, nil, true) assert.Equal(t, Overflow, tracker.state, "Fourth observation, should stay overflow") - assert.Equal(t, int64(3+tracker.cooldownDuration), tracker.cooldownUntil, "CooldownUntil should be updated correctly") + assert.Equal(t, time.Unix(3, 0).Add(tracker.cooldownDuration), tracker.cooldownUntil, "CooldownUntil should be updated correctly") } func TestTracker_inactiveObservations(t *testing.T) { @@ -108,6 +108,7 @@ func TestTracker_inactiveObservations(t *testing.T) { {{Name: "team", Value: "bar"}}, {{Name: "team", Value: "baz"}}, } + // Simulate samples discarded with different timestamps. tracker.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) tracker.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) @@ -117,17 +118,17 @@ func TestTracker_inactiveObservations(t *testing.T) { require.Len(t, tracker.observed, 3) // Purge observations that haven't been updated in the last 10 seconds. 
- purged := tracker.inactiveObservations(0) + purged := tracker.inactiveObservations(time.Unix(0, 0)) require.Len(t, purged, 0) - purged = tracker.inactiveObservations(10) + purged = tracker.inactiveObservations(time.Unix(10, 0)) assert.ElementsMatch(t, []string{"foo"}, purged) - purged = tracker.inactiveObservations(15) + purged = tracker.inactiveObservations(time.Unix(15, 0)) assert.ElementsMatch(t, []string{"foo", "bar"}, purged) // Check that the purged observation matches the expected details. - purged = tracker.inactiveObservations(25) + purged = tracker.inactiveObservations(time.Unix(25, 0)) assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } @@ -136,12 +137,13 @@ func TestTracker_Concurrency(t *testing.T) { tracker := m.Tracker("user1") var wg sync.WaitGroup - for i := 0; i < 100; i++ { + var i int64 + for i = 0; i < 100; i++ { wg.Add(1) - go func(i int) { + go func(i int64) { defer wg.Done() lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) - tracker.updateCounters(lbls, int64(i), 1, 0, 0, nil, true) + tracker.updateCounters(lbls, time.Unix(i, 0), 1, 0, 0, nil, true) }(i) } wg.Wait() From 1ab89c582f7ddafc3643995127151ed1f941c145 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 13:43:49 +0100 Subject: [PATCH 054/105] change pointer to instance --- pkg/costattribution/tracker.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index ce3b3f4839a..26737b98e9c 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -27,7 +27,7 @@ const ( const sep = rune(0x80) type observation struct { - lastUpdate *atomic.Int64 + lastUpdate atomic.Int64 activeSerie atomic.Float64 receivedSample atomic.Float64 discardedSampleMtx sync.Mutex @@ -254,7 +254,7 @@ func (t *Tracker) updateCountersCommon( // updateObservations updates or creates a new observation in the 'observed' map. 
func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - if o, known := t.observed[string(key)]; known && o.lastUpdate != nil { + if o, known := t.observed[string(key)]; known && o.lastUpdate.Load() != 0 { o.lastUpdate.Store(ts) if activeSeriesIncrement != 0 { o.activeSerie.Add(activeSeriesIncrement) @@ -286,9 +286,7 @@ func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampl if t.state == Normal && len(t.observed) > t.maxCardinality { t.state = Overflow // Initialize the overflow counter. - t.overflowCounter = &observation{ - lastUpdate: atomic.NewInt64(ts.Unix()), - } + t.overflowCounter = &observation{} // Aggregate active series from all keys into the overflow counter. for _, o := range t.observed { @@ -316,7 +314,7 @@ func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampl // createNewObservation creates a new observation in the 'observed' map. 
func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { t.observed[string(key)] = &observation{ - lastUpdate: atomic.NewInt64(ts), + lastUpdate: *atomic.NewInt64(ts), activeSerie: *atomic.NewFloat64(activeSeriesIncrement), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), discardedSample: map[string]atomic.Float64{}, @@ -358,7 +356,7 @@ func (t *Tracker) inactiveObservations(deadline time.Time) []string { t.observedMtx.RLock() defer t.observedMtx.RUnlock() for labkey, ob := range t.observed { - if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline.Unix() { + if ob != nil && ob.lastUpdate.Load() != 0 && ob.lastUpdate.Load() <= deadline.Unix() { invalidKeys = append(invalidKeys, labkey) } } From 23b32cff274b9737494ae18db6178b0161314f87 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 14:00:02 +0100 Subject: [PATCH 055/105] change instance to pointer in map --- pkg/costattribution/tracker.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 26737b98e9c..18ffbeeb666 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -31,7 +31,7 @@ type observation struct { activeSerie atomic.Float64 receivedSample atomic.Float64 discardedSampleMtx sync.Mutex - discardedSample map[string]atomic.Float64 + discardedSample map[string]*atomic.Float64 totalDiscarded atomic.Float64 } @@ -264,11 +264,10 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } if discardedSampleIncrement > 0 && reason != nil { o.discardedSampleMtx.Lock() - if v, ok := o.discardedSample[*reason]; ok { - v.Add(discardedSampleIncrement) - o.discardedSample[*reason] = v + if _, ok := o.discardedSample[*reason]; ok { + o.discardedSample[*reason].Add(discardedSampleIncrement) } else { - 
o.discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) } o.discardedSampleMtx.Unlock() } @@ -317,12 +316,12 @@ func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncreme lastUpdate: *atomic.NewInt64(ts), activeSerie: *atomic.NewFloat64(activeSeriesIncrement), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), - discardedSample: map[string]atomic.Float64{}, + discardedSample: make(map[string]*atomic.Float64), discardedSampleMtx: sync.Mutex{}, } if discardedSampleIncrement > 0 && reason != nil { t.observed[string(key)].discardedSampleMtx.Lock() - t.observed[string(key)].discardedSample[*reason] = *atomic.NewFloat64(discardedSampleIncrement) + t.observed[string(key)].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) t.observed[string(key)].discardedSampleMtx.Unlock() } } From 7a60c7d7c63f5a8dd5385e2a0af8e3644b46b529 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 14:07:45 +0100 Subject: [PATCH 056/105] remove callback --- pkg/costattribution/tracker.go | 49 ++++++++++++++-------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 18ffbeeb666..c8669e95a08 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -17,10 +17,10 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" ) -type TrackerState int +type trackerState int const ( - Normal TrackerState = iota + Normal trackerState = iota Overflow ) @@ -47,7 +47,7 @@ type Tracker struct { observed map[string]*observation observedMtx sync.RWMutex hashBuffer []byte - state TrackerState + state trackerState overflowCounter *observation totalFailedActiveSeries *atomic.Float64 cooldownDuration time.Duration @@ -188,48 +188,39 @@ func (t *Tracker) IncrementActiveSeriesFailure() { } func (t *Tracker) updateCountersWithLabelAdapter(lbls 
[]mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - extractValues := func() []string { - labelValues := make([]string, len(t.labels)) - for idx, cal := range t.labels { - for _, l := range lbls { - if l.Name == cal { - labelValues[idx] = l.Value - break - } - } - if labelValues[idx] == "" { - labelValues[idx] = missingValue + labelValues := make([]string, len(t.labels)) + for idx, cal := range t.labels { + for _, l := range lbls { + if l.Name == cal { + labelValues[idx] = l.Value + break } } - return labelValues + if labelValues[idx] == "" { + labelValues[idx] = missingValue + } } - t.updateCountersCommon(extractValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateCountersCommon(labelValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) } func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - extractValues := func() []string { - labelValues := make([]string, len(t.labels)) - for idx, cal := range t.labels { - labelValues[idx] = lbls.Get(cal) - if labelValues[idx] == "" { - labelValues[idx] = missingValue - } + labelValues := make([]string, len(t.labels)) + for idx, cal := range t.labels { + labelValues[idx] = lbls.Get(cal) + if labelValues[idx] == "" { + labelValues[idx] = missingValue } - return labelValues } - t.updateCountersCommon(extractValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateCountersCommon(labelValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) } func (t *Tracker) updateCountersCommon( - extractValues func() []string, + 
labelValues []string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool, ) { - // Extract label values - labelValues := extractValues() - // Reuse buffer from pool for building the observation key buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() From 0287bf6cad53e0d4f32fdce6f47f44298efee148 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 14:09:54 +0100 Subject: [PATCH 057/105] use string when create new key in map --- pkg/costattribution/tracker.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index c8669e95a08..24940120262 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -265,7 +265,7 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } else if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call // Otherwise create a new observation for the key - t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.createNewObservation(string(key), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) } } @@ -302,8 +302,8 @@ func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampl } // createNewObservation creates a new observation in the 'observed' map. 
-func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - t.observed[string(key)] = &observation{ +func (t *Tracker) createNewObservation(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + t.observed[key] = &observation{ lastUpdate: *atomic.NewInt64(ts), activeSerie: *atomic.NewFloat64(activeSeriesIncrement), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), @@ -311,9 +311,9 @@ func (t *Tracker) createNewObservation(key []byte, ts int64, activeSeriesIncreme discardedSampleMtx: sync.Mutex{}, } if discardedSampleIncrement > 0 && reason != nil { - t.observed[string(key)].discardedSampleMtx.Lock() - t.observed[string(key)].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) - t.observed[string(key)].discardedSampleMtx.Unlock() + t.observed[key].discardedSampleMtx.Lock() + t.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + t.observed[key].discardedSampleMtx.Unlock() } } From 9c4c2dfa32e4961b412e0d91eb391e9184b80c03 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 14:20:58 +0100 Subject: [PATCH 058/105] move the logic to different place --- pkg/costattribution/tracker.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 24940120262..e9225a0cc10 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -245,7 +245,17 @@ func (t *Tracker) updateCountersCommon( // updateObservations updates or creates a new observation in the 'observed' map. 
func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - if o, known := t.observed[string(key)]; known && o.lastUpdate.Load() != 0 { + o, known := t.observed[string(key)] + if !known { + if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { + // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call + // Otherwise create a new observation for the key + t.createNewObservation(string(key), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + } + return + } + + if o.lastUpdate.Load() != 0 { o.lastUpdate.Store(ts) if activeSeriesIncrement != 0 { o.activeSerie.Add(activeSeriesIncrement) @@ -262,10 +272,6 @@ func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement } o.discardedSampleMtx.Unlock() } - } else if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { - // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call - // Otherwise create a new observation for the key - t.createNewObservation(string(key), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) } } From f8f2a49360e0028d56562b6da033fc0dd5c8f63a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 14:23:51 +0100 Subject: [PATCH 059/105] get cat once out of loop --- pkg/distributor/distributor.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 3568c495133..5e7dc09f140 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1829,10 +1829,11 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req 
*mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int + cat := d.costAttributionMgr.Tracker(userID) for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(ts.Labels, float64(receivedSamples), mtime.Now()) + cat.IncrementReceivedSamples(ts.Labels, float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) From 1ad99ad7d15c5ceb51646bfaef07721b35e3a471 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 16:08:51 +0100 Subject: [PATCH 060/105] update tracker per request for received samples --- pkg/costattribution/manager_test.go | 11 ++-- pkg/costattribution/testutils/test_utils.go | 27 ++++++++- pkg/costattribution/tracker.go | 61 +++++++++++++-------- pkg/costattribution/tracker_test.go | 58 +++++++++++++++++--- pkg/distributor/distributor.go | 3 +- 5 files changed, 119 insertions(+), 41 deletions(-) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index d7b4ebd7e27..de6a17c0512 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -54,7 +54,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { t.Run("Metrics tracking", func(t *testing.T) { manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}}, 1, "invalid-metrics-name", time.Unix(6, 0)) manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(12, 0)) - manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "dodo"}}, 1, time.Unix(20, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"department", "foo", 
"service", "dodo"}, SamplesCount: 1}}), time.Unix(20, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. @@ -112,10 +112,9 @@ func TestManager_CreateDeleteTracker(t *testing.T) { }) t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { - - manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}, {Name: "feature", Value: "bar"}}, 1, time.Unix(15, 0)) - manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "baz"}, {Name: "feature", Value: "baz"}}, 1, time.Unix(16, 0)) - manager.Tracker("user3").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}, {Name: "feature", Value: "foo"}}, 1, time.Unix(17, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "bar", "feature", "bar"}, SamplesCount: 1}}), time.Unix(15, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "baz", "feature", "baz"}, SamplesCount: 1}}), time.Unix(16, 0)) + manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo", "feature", "foo"}, SamplesCount: 1}}), time.Unix(17, 0)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
# TYPE cortex_received_attributed_samples_total counter @@ -128,7 +127,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { manager := newTestManager() - manager.Tracker("user1").IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, time.Unix(1, 0)) + manager.Tracker("user1").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo"}, SamplesCount: 1}}), time.Unix(1, 0)) manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(1, 0)) manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}}, 1, "out-of-window", time.Unix(10, 0)) diff --git a/pkg/costattribution/testutils/test_utils.go b/pkg/costattribution/testutils/test_utils.go index f79f4861cd9..62dc617e04b 100644 --- a/pkg/costattribution/testutils/test_utils.go +++ b/pkg/costattribution/testutils/test_utils.go @@ -2,7 +2,10 @@ package testutils -import "github.com/grafana/mimir/pkg/util/validation" +import ( + "github.com/grafana/mimir/pkg/mimirpb" + "github.com/grafana/mimir/pkg/util/validation" +) func NewMockCostAttributionLimits(idx int, lvs ...string) (*validation.Overrides, error) { baseLimits := map[string]*validation.Limits{ @@ -33,3 +36,25 @@ func NewMockCostAttributionLimits(idx int, lvs ...string) (*validation.Overrides return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) } + +type Series struct { + LabelValues []string + SamplesCount int +} + +func CreateRequest(data []Series) *mimirpb.WriteRequest { + timeSeries := make([]mimirpb.PreallocTimeseries, 0, len(data)) + for i := 0; i < len(data); i++ { + var Labels []mimirpb.LabelAdapter + for j := 0; j+1 < len(data[i].LabelValues); j += 2 { + Labels = append(Labels, mimirpb.LabelAdapter{Name: 
data[i].LabelValues[j], Value: data[i].LabelValues[j+1]}) + } + timeSeries = append(timeSeries, mimirpb.PreallocTimeseries{ + TimeSeries: &mimirpb.TimeSeries{ + Labels: Labels, + Samples: make([]mimirpb.Sample, data[i].SamplesCount), + }, + }) + } + return &mimirpb.WriteRequest{Timeseries: timeSeries} +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index e9225a0cc10..1c9559fb478 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -173,11 +173,19 @@ func (t *Tracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value fl t.updateCountersWithLabelAdapter(lbs, now, 0, 0, value, &reason, true) } -func (t *Tracker) IncrementReceivedSamples(lbs []mimirpb.LabelAdapter, value float64, now time.Time) { +func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { if t == nil { return } - t.updateCountersWithLabelAdapter(lbs, now, 0, value, 0, nil, true) + + dict := make(map[string]int) + for _, ts := range req.Timeseries { + lvs := t.extractLabelValuesFromLabelAdapater(ts.Labels) + dict[t.hashLabelValues(lvs)] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) + } + for k, v := range dict { + t.updateCountersCommon(k, now, 0, float64(v), 0, nil, true) + } } func (t *Tracker) IncrementActiveSeriesFailure() { @@ -188,6 +196,26 @@ func (t *Tracker) IncrementActiveSeriesFailure() { } func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + labelValues := t.extractLabelValuesFromLabelAdapater(lbls) + key := t.hashLabelValues(labelValues) + t.updateCountersCommon(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) +} + +func (t *Tracker) hashLabelValues(labelValues []string) string { + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer 
bufferPool.Put(buf) + + for i, value := range labelValues { + if i > 0 { + buf.WriteRune(sep) + } + buf.WriteString(value) + } + return buf.String() +} + +func (t *Tracker) extractLabelValuesFromLabelAdapater(lbls []mimirpb.LabelAdapter) []string { labelValues := make([]string, len(t.labels)) for idx, cal := range t.labels { for _, l := range lbls { @@ -200,7 +228,7 @@ func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts labelValues[idx] = missingValue } } - t.updateCountersCommon(labelValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + return labelValues } func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { @@ -211,46 +239,33 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesI labelValues[idx] = missingValue } } - t.updateCountersCommon(labelValues, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + key := t.hashLabelValues(labelValues) + t.updateCountersCommon(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) } func (t *Tracker) updateCountersCommon( - labelValues []string, + key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool, ) { - // Reuse buffer from pool for building the observation key - buf := bufferPool.Get().(*bytes.Buffer) - buf.Reset() - defer bufferPool.Put(buf) - - // Construct the observation key by joining label values - for i, value := range labelValues { - if i > 0 { - buf.WriteRune(sep) - } - buf.WriteString(value) - } - // Lock access to the observation map t.observedMtx.Lock() defer t.observedMtx.Unlock() - // Update observations and state - 
t.updateObservations(buf.Bytes(), ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } // updateObservations updates or creates a new observation in the 'observed' map. -func (t *Tracker) updateObservations(key []byte, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - o, known := t.observed[string(key)] +func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + o, known := t.observed[key] if !known { if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call // Otherwise create a new observation for the key - t.createNewObservation(string(key), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) } return } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index f9457ceb062..cb009f384c2 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -8,12 +8,12 @@ import ( "testing" "time" - "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/costattribution/testutils" 
"github.com/grafana/mimir/pkg/mimirpb" ) @@ -22,18 +22,58 @@ func TestTracker_hasSameLabels(t *testing.T) { assert.True(t, tracker.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") } -func TestTracker_CreateDelete(t *testing.T) { +func TestTracker_IncrementReceviedSamples(t *testing.T) { tManager := newTestManager() tracker := tManager.Tracker("user4") + t.Run("One Single Series in Request", func(t *testing.T) { + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}}), time.Unix(10, 0)) - reg := prometheus.NewRegistry() - err := reg.Register(tManager) - require.NoError(t, err) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 3 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) + t.Run("Multiple Different Series in Request", func(t *testing.T) { + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ + {LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}, + {LabelValues: []string{"platform", "bar", "service", "yoyo"}, SamplesCount: 5}, + }), time.Unix(20, 0)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 6 + cortex_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) + + t.Run("Multiple Series in Request with Same Labels", func(t *testing.T) { + tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ + {LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}, + {LabelValues: []string{"platform", "foo", "service", "yoyo"}, SamplesCount: 5}, + }), time.Unix(30, 0)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 14 + cortex_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) +} + +func TestTracker_CreateDelete(t *testing.T) { + tManager := newTestManager() + tracker := tManager.Tracker("user4") tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) tracker.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) - tracker.IncrementReceivedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 5, time.Unix(4, 0)) + 
tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) tracker.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) tracker.IncrementActiveSeriesFailure() @@ -60,7 +100,7 @@ func TestTracker_CreateDelete(t *testing.T) { "cortex_ingester_attributed_active_series", "cortex_ingester_attributed_active_series_failure", } - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(time.Unix(5, 0))) assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) @@ -72,9 +112,9 @@ func TestTracker_CreateDelete(t *testing.T) { # TYPE cortex_ingester_attributed_active_series_failure counter cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) tManager.deleteTracker("user4") - assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(""), metricNames...)) } func TestTracker_updateCounters(t *testing.T) { diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 5e7dc09f140..2a1d376ed90 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1829,12 +1829,11 @@ func tokenForMetadata(userID string, 
metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int - cat := d.costAttributionMgr.Tracker(userID) for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) - cat.IncrementReceivedSamples(ts.Labels, float64(receivedSamples), mtime.Now()) } + d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(req, mtime.Now()) receivedMetadata = len(req.Metadata) d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) From fa62ee1afa9d410fe37b85a47bba6db1b36d2f54 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 17:48:48 +0100 Subject: [PATCH 061/105] make the lock fanny by dum dum --- pkg/costattribution/tracker.go | 139 +++++++++++++++------------- pkg/costattribution/tracker_test.go | 10 +- 2 files changed, 78 insertions(+), 71 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 1c9559fb478..e6336cec4e1 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -17,13 +17,6 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" ) -type trackerState int - -const ( - Normal trackerState = iota - Overflow -) - const sep = rune(0x80) type observation struct { @@ -47,7 +40,7 @@ type Tracker struct { observed map[string]*observation observedMtx sync.RWMutex hashBuffer []byte - state trackerState + isOverflow atomic.Bool overflowCounter *observation totalFailedActiveSeries *atomic.Float64 cooldownDuration time.Duration @@ -132,37 +125,38 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { } func (t *Tracker) Collect(out chan<- prometheus.Metric) { - switch t.state { - case Overflow: + if t.totalFailedActiveSeries.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.failedActiveSeriesDecrement, prometheus.CounterValue, 
t.totalFailedActiveSeries.Load(), t.userID) + } + + if t.isOverflow.Load() { out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) - case Normal: - // Collect metrics for all observed keys - t.observedMtx.RLock() - defer t.observedMtx.RUnlock() - for key, o := range t.observed { - if key == "" { - continue - } - keys := strings.Split(key, string(sep)) + return + } - keys = append(keys, t.userID) - if o.activeSerie.Load() > 0 { - out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...) - } - if o.receivedSample.Load() > 0 { - out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...) - } - o.discardedSampleMtx.Lock() - for reason, discarded := range o.discardedSample { - out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...) 
- } - o.discardedSampleMtx.Unlock() + // Collect metrics for all observed keys + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() + for key, o := range t.observed { + if key == "" { + continue } - } - if t.totalFailedActiveSeries.Load() > 0 { - out <- prometheus.MustNewConstMetric(t.failedActiveSeriesDecrement, prometheus.CounterValue, t.totalFailedActiveSeries.Load(), t.userID) + keys := strings.Split(key, string(sep)) + + keys = append(keys, t.userID) + if o.activeSerie.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...) + } + if o.receivedSample.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...) + } + o.discardedSampleMtx.Lock() + for reason, discarded := range o.discardedSample { + out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...) + } + o.discardedSampleMtx.Unlock() } } @@ -250,17 +244,20 @@ func (t *Tracker) updateCountersCommon( reason *string, createIfDoesNotExist bool, ) { + // Update observations and state + t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) // Lock access to the observation map t.observedMtx.Lock() defer t.observedMtx.Unlock() - // Update observations and state - t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } // updateObservations updates or creates a new observation in the 'observed' map. 
func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + t.observedMtx.RLock() o, known := t.observed[key] + t.observedMtx.RUnlock() + if !known { if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call @@ -270,47 +267,51 @@ func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement return } - if o.lastUpdate.Load() != 0 { - o.lastUpdate.Store(ts) - if activeSeriesIncrement != 0 { - o.activeSerie.Add(activeSeriesIncrement) - } - if receivedSampleIncrement > 0 { - o.receivedSample.Add(receivedSampleIncrement) - } - if discardedSampleIncrement > 0 && reason != nil { - o.discardedSampleMtx.Lock() - if _, ok := o.discardedSample[*reason]; ok { - o.discardedSample[*reason].Add(discardedSampleIncrement) - } else { - o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) - } - o.discardedSampleMtx.Unlock() + o.lastUpdate.Store(ts) + if activeSeriesIncrement != 0 { + o.activeSerie.Add(activeSeriesIncrement) + } + if receivedSampleIncrement > 0 { + o.receivedSample.Add(receivedSampleIncrement) + } + if discardedSampleIncrement > 0 && reason != nil { + o.discardedSampleMtx.Lock() + if _, ok := o.discardedSample[*reason]; ok { + o.discardedSample[*reason].Add(discardedSampleIncrement) + } else { + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) } + o.discardedSampleMtx.Unlock() } } // updateState checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. 
func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { + previousOverflow := true + t.observedMtx.RLock() + // Transition to overflow mode if maximum cardinality is exceeded. - previousState := t.state - if t.state == Normal && len(t.observed) > t.maxCardinality { - t.state = Overflow - // Initialize the overflow counter. - t.overflowCounter = &observation{} - - // Aggregate active series from all keys into the overflow counter. - for _, o := range t.observed { - if o != nil { - t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) + if !t.isOverflow.Load() && len(t.observed) > t.maxCardinality { + // Make sure that we count current overflow only when state is switched to overflow from normal. + previousOverflow = t.isOverflow.Swap(true) + if !previousOverflow { + // Initialize the overflow counter. + t.overflowCounter = &observation{} + + // Aggregate active series from all keys into the overflow counter. + for _, o := range t.observed { + if o != nil { + t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) + } } + t.cooldownUntil = ts.Add(t.cooldownDuration) } - t.cooldownUntil = ts.Add(t.cooldownDuration) } + t.observedMtx.RUnlock() - if t.state == Overflow { + if t.isOverflow.Load() { // if already in overflow mode, update the overflow counter. If it was normal mode, the active series are already applied. - if previousState == Overflow && activeSeriesIncrement != 0 { + if previousOverflow && activeSeriesIncrement != 0 { t.overflowCounter.activeSerie.Add(activeSeriesIncrement) } if receivedSampleIncrement > 0 { @@ -324,6 +325,12 @@ func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampl // createNewObservation creates a new observation in the 'observed' map. 
func (t *Tracker) createNewObservation(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + t.observedMtx.Lock() + defer t.observedMtx.Unlock() + if _, exists := t.observed[key]; exists { + return + } + t.observed[key] = &observation{ lastUpdate: *atomic.NewInt64(ts), activeSerie: *atomic.NewFloat64(activeSeriesIncrement), @@ -367,7 +374,7 @@ func (t *Tracker) inactiveObservations(deadline time.Time) []string { t.observedMtx.RLock() defer t.observedMtx.RUnlock() for labkey, ob := range t.observed { - if ob != nil && ob.lastUpdate.Load() != 0 && ob.lastUpdate.Load() <= deadline.Unix() { + if ob != nil && ob.lastUpdate.Load() <= deadline.Unix() { invalidKeys = append(invalidKeys, labkey) } } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index cb009f384c2..bf8277e0913 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -124,16 +124,16 @@ func TestTracker_updateCounters(t *testing.T) { lbls3 := labels.FromStrings("department", "baz", "service", "foo") tracker.updateCounters(lbls1, time.Unix(1, 0), 1, 0, 0, nil, true) - assert.Equal(t, Normal, tracker.state, "First observation, should not overflow") + assert.False(t, tracker.isOverflow.Load(), "First observation, should not overflow") tracker.updateCounters(lbls2, time.Unix(2, 0), 1, 0, 0, nil, true) - assert.Equal(t, Normal, tracker.state, "Second observation, should not overflow") + assert.False(t, tracker.isOverflow.Load(), "Second observation, should not overflow") tracker.updateCounters(lbls3, time.Unix(3, 0), 1, 0, 0, nil, true) - assert.Equal(t, Overflow, tracker.state, "Third observation, should overflow") + assert.True(t, tracker.isOverflow.Load(), "Third observation, should overflow") tracker.updateCounters(lbls3, time.Unix(4, 0), 1, 0, 0, nil, true) - assert.Equal(t, Overflow, tracker.state, "Fourth observation, should stay overflow") + assert.True(t, 
tracker.isOverflow.Load(), "Fourth observation, should stay overflow") assert.Equal(t, time.Unix(3, 0).Add(tracker.cooldownDuration), tracker.cooldownUntil, "CooldownUntil should be updated correctly") } @@ -191,7 +191,7 @@ func TestTracker_Concurrency(t *testing.T) { // Verify no data races or inconsistencies assert.True(t, len(tracker.observed) > 0, "Observed set should not be empty after concurrent updates") assert.LessOrEqual(t, len(tracker.observed), 2*tracker.maxCardinality, "Observed count should not exceed 2 times of max cardinality") - assert.Equal(t, Overflow, tracker.state, "Expected state to be Overflow") + assert.True(t, tracker.isOverflow.Load(), "Expected state to be Overflow") expectedMetrics := ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. From 1b0fb005cc0ce0c144f2ca83e21e428c8e70aa03 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 18:38:46 +0100 Subject: [PATCH 062/105] make ingester work --- pkg/costattribution/tracker.go | 12 +------ pkg/costattribution/tracker_test.go | 8 ----- pkg/ingester/activeseries/active_series.go | 38 +++++++++++++++------- pkg/ingester/ingester.go | 5 +++ pkg/ingester/metrics.go | 8 ++++- 5 files changed, 40 insertions(+), 31 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index e6336cec4e1..c1cb4cee78c 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -35,7 +35,6 @@ type Tracker struct { activeSeriesPerUserAttribution *prometheus.Desc receivedSamplesAttribution *prometheus.Desc discardedSampleAttribution *prometheus.Desc - failedActiveSeriesDecrement *prometheus.Desc overflowLabels []string observed map[string]*observation observedMtx sync.RWMutex @@ -88,9 +87,7 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", "The total number of active series per user and attribution.", variableLabels[:len(variableLabels)-1], prometheus.Labels{trackerLabel: defaultTrackerName}) - tracker.failedActiveSeriesDecrement = prometheus.NewDesc("cortex_ingester_attributed_active_series_failure", - "The total number of failed active series decrement per user and tracker.", []string{tenantLabel}, - prometheus.Labels{trackerLabel: defaultTrackerName}) + return tracker } @@ -182,13 +179,6 @@ func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.T } } -func (t *Tracker) IncrementActiveSeriesFailure() { - if t == nil { - return - } - t.totalFailedActiveSeries.Add(1) -} - func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { labelValues := t.extractLabelValuesFromLabelAdapater(lbls) key := t.hashLabelValues(labelValues) diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index bf8277e0913..b00462ca3b9 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -76,7 +76,6 @@ func TestTracker_CreateDelete(t *testing.T) { tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) tracker.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) - tracker.IncrementActiveSeriesFailure() expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
@@ -86,9 +85,6 @@ func TestTracker_CreateDelete(t *testing.T) { # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 - # HELP cortex_ingester_attributed_active_series_failure The total number of failed active series decrement per user and tracker. - # TYPE cortex_ingester_attributed_active_series_failure counter - cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 @@ -98,7 +94,6 @@ func TestTracker_CreateDelete(t *testing.T) { "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total", "cortex_ingester_attributed_active_series", - "cortex_ingester_attributed_active_series_failure", } assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(time.Unix(5, 0))) @@ -108,9 +103,6 @@ func TestTracker_CreateDelete(t *testing.T) { # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 - # HELP cortex_ingester_attributed_active_series_failure The total number of failed active series decrement per user and tracker. 
- # TYPE cortex_ingester_attributed_active_series_failure counter - cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 ` assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) tManager.deleteTracker("user4") diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 79fdc8988b5..a5261e17031 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -67,15 +67,16 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 - mu sync.RWMutex - refs map[storage.SeriesRef]seriesEntry - active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. - activeMatching []uint32 // Number of active entries in this stripe matching each matcher of the configured Matchers. - activeNativeHistograms uint32 // Number of active entries (only native histograms) in this stripe. Only decreased during purge or clear. - activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. - activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. - activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + oldestEntryTs atomic.Int64 + mu sync.RWMutex + refs map[storage.SeriesRef]seriesEntry + activeSeriesAttributionFailureCounter atomic.Float64 + active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. 
+ activeMatching []uint32 // Number of active entries in this stripe matching each matcher of the configured Matchers. + activeNativeHistograms uint32 // Number of active entries (only native histograms) in this stripe. Only decreased during purge or clear. + activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. + activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. + activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. cat *costattribution.Tracker } @@ -217,6 +218,14 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } +func (c *ActiveSeries) ActiveSeriesAttributionFailureCount() float64 { + var total float64 + for s := 0; s < numStripes; s++ { + total += c.stripes[s].activeSeriesAttributionFailureCount() + } + return total +} + func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef, idx tsdb.IndexReader) { stripeID := storage.SeriesRef(ref) % numStripes c.stripes[stripeID].remove(storage.SeriesRef(ref), idx) @@ -461,7 +470,7 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { if ts < keepUntilNanos { if s.cat != nil { if err := idx.Series(ref, &buf, nil); err != nil { - s.cat.IncrementActiveSeriesFailure() + s.activeSeriesAttributionFailureCounter.Add(1) } else { s.cat.DecrementActiveSeries(buf.Labels()) } @@ -499,6 +508,13 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { } } +func (s *seriesStripe) activeSeriesAttributionFailureCount() float64 { + s.mu.Lock() + defer s.mu.Unlock() + + return s.activeSeriesAttributionFailureCounter.Swap(0) +} + // remove a single series from the stripe. 
// This is mostly the same logic from purge() but we decrement counters for a single entry instead of incrementing for each entry. // Note: we might remove the oldest series here, but the worst thing can happen is that we let run a useless purge() cycle later, @@ -519,7 +535,7 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { if s.cat != nil { buf := labels.NewScratchBuilder(128) if err := idx.Series(ref, &buf, nil); err != nil { - s.cat.IncrementActiveSeriesFailure() + s.activeSeriesAttributionFailureCounter.Add(1) } else { s.cat.DecrementActiveSeries(buf.Labels()) } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 39ea139edf9..6debbca48df 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -818,6 +818,11 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.metrics.activeNativeHistogramBucketsPerUser.DeleteLabelValues(userID) } + AttributedActiveSeriesFailure := userDB.activeSeries.ActiveSeriesAttributionFailureCount() + if AttributedActiveSeriesFailure > 0 { + i.metrics.attributedActiveSeriesFailuresPerUser.WithLabelValues(userID).Add(AttributedActiveSeriesFailure) + } + for idx, name := range userDB.activeSeries.CurrentMatcherNames() { // We only set the metrics for matchers that actually exist, to avoid increasing cardinality with zero valued metrics. 
if activeMatching[idx] > 0 { diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 38e53d3c090..bf920c383a3 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -44,6 +44,8 @@ type ingesterMetrics struct { activeNativeHistogramBucketsPerUser *prometheus.GaugeVec activeNativeHistogramBucketsCustomTrackersPerUser *prometheus.GaugeVec + attributedActiveSeriesFailuresPerUser *prometheus.CounterVec + // Owned series ownedSeriesPerUser *prometheus.GaugeVec @@ -193,7 +195,10 @@ func newIngesterMetrics( Name: "cortex_ingester_owned_series", Help: "Number of currently owned series per user.", }, []string{"user"}), - + attributedActiveSeriesFailuresPerUser: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_attributed_active_series_failure", + Help: "The total number of failed active series decrement per user", + }, []string{"user"}), maxUsersGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ Name: instanceLimits, Help: instanceLimitsHelp, @@ -401,6 +406,7 @@ func (m *ingesterMetrics) deletePerUserMetrics(userID string) { m.maxLocalSeriesPerUser.DeleteLabelValues(userID) m.ownedSeriesPerUser.DeleteLabelValues(userID) + m.attributedActiveSeriesFailuresPerUser.DeleteLabelValues(userID) } func (m *ingesterMetrics) deletePerGroupMetricsForUser(userID, group string) { From ced834608aaa79e2162e668866ca3893e953f7c6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 30 Dec 2024 18:46:23 +0100 Subject: [PATCH 063/105] fix lock --- pkg/costattribution/tracker.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index c1cb4cee78c..e1ce8c4c795 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -122,9 +122,6 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { } func (t *Tracker) Collect(out chan<- prometheus.Metric) { - if t.totalFailedActiveSeries.Load() > 0 { - out <- 
prometheus.MustNewConstMetric(t.failedActiveSeriesDecrement, prometheus.CounterValue, t.totalFailedActiveSeries.Load(), t.userID) - } if t.isOverflow.Load() { out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) @@ -234,11 +231,7 @@ func (t *Tracker) updateCountersCommon( reason *string, createIfDoesNotExist bool, ) { - // Update observations and state t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) - // Lock access to the observation map - t.observedMtx.Lock() - defer t.observedMtx.Unlock() t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } @@ -279,7 +272,6 @@ func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { previousOverflow := true t.observedMtx.RLock() - // Transition to overflow mode if maximum cardinality is exceeded. if !t.isOverflow.Load() && len(t.observed) > t.maxCardinality { // Make sure that we count current overflow only when state is switched to overflow from normal. From 0a7c858cc3f308ef436fe99423edd4af1e0387df Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 31 Dec 2024 10:09:44 +0100 Subject: [PATCH 064/105] add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c31ba2b06fd..32a3a0efe60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## main / unreleased * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. 
#10220 +[FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics with labels specified by customers to a custom Prometheus registry. This enables customers to track billing data more flexibly. #10269 ### Grafana Mimir From 4336f7f224c08ae8aaaa0f5d080172ae5af02627 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 31 Dec 2024 10:12:33 +0100 Subject: [PATCH 065/105] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32a3a0efe60..638dd7b29a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## main / unreleased * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 -[FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics with labels specified by customers to a custom Prometheus registry. This enables customers to track billing data more flexibly. #10269 +* [FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. 
#10269 ### Grafana Mimir From 67b6cea5c26cc082b2aea9e8f836ccc9478ef1c2 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 31 Dec 2024 10:36:26 +0100 Subject: [PATCH 066/105] update doc with correct metrics name --- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 2 +- .../mimir/configure/configuration-parameters/index.md | 7 +++---- pkg/util/validation/limits.go | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 199aaecdf44..6b95fb0c29d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4372,7 +4372,7 @@ "kind": "field", "name": "cost_attribution_labels", "required": false, - "desc": "Defines labels for cost attribution. Applies to metrics like cortex_distributor_attributed_received_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.", + "desc": "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}.", "fieldValue": null, "fieldDefaultValue": "", "fieldFlag": "validation.cost-attribution-labels", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 7fdd772c442..e7d007312a8 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -3326,7 +3326,7 @@ Usage of ./cmd/mimir/mimir: -validation.cost-attribution-cooldown duration [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. 
Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit. -validation.cost-attribution-labels comma-separated-list-of-strings - [experimental] Defines labels for cost attribution. Applies to metrics like cortex_distributor_attributed_received_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}. + [experimental] Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 2a982ccd36e..ca1378a4f37 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3587,10 +3587,9 @@ The `limits` block configures default and per-tenant limits imposed by component [active_series_results_max_size_bytes: | default = 419430400] # (experimental) Defines labels for cost attribution. Applies to metrics like -# cortex_distributor_attributed_received_samples_total. To disable, set to an -# empty string. 
For example, 'team,service' produces metrics such as -# cortex_distributor_attributed_received_samples_total{team='frontend', -# service='api'}. +# cortex_received_attributed_samples_total. To disable, set to an empty string. +# For example, 'team,service' produces metrics such as +# cortex_received_attributed_samples_total{team='frontend', service='api'}. # CLI flag: -validation.cost-attribution-labels [cost_attribution_labels: | default = ""] diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 1a71701c84c..76614b8992c 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -309,7 +309,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") - f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_distributor_attributed_received_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_attributed_received_samples_total{team='frontend', service='api'}.") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. 
For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}.") f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.") From 800fe8527e41517f6f27bee185c1a0588628609b Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 31 Dec 2024 12:37:31 +0100 Subject: [PATCH 067/105] remove useless function --- pkg/costattribution/tracker.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index e1ce8c4c795..6ddf38c7262 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -171,15 +171,22 @@ func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.T lvs := t.extractLabelValuesFromLabelAdapater(ts.Labels) dict[t.hashLabelValues(lvs)] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) } + + // Update the observations for each label set and update the state per request, + // this would be less precised than per sample but it's more efficient + var total float64 for k, v := range dict { - t.updateCountersCommon(k, now, 0, float64(v), 0, nil, true) + 
t.updateObservations(k, now.Unix(), 0, float64(v), 0, nil, true) + total += float64(v) } + t.updateState(now, 0, total, 0) } func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { labelValues := t.extractLabelValuesFromLabelAdapater(lbls) key := t.hashLabelValues(labelValues) - t.updateCountersCommon(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) + t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } func (t *Tracker) hashLabelValues(labelValues []string) string { @@ -221,16 +228,6 @@ func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesI } } key := t.hashLabelValues(labelValues) - t.updateCountersCommon(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) -} - -func (t *Tracker) updateCountersCommon( - key string, - ts time.Time, - activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, - reason *string, - createIfDoesNotExist bool, -) { t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) } From 80e69fbcd15413f890bc86788d5f5f74edbe7a74 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 31 Dec 2024 12:41:02 +0100 Subject: [PATCH 068/105] cast only once --- pkg/costattribution/tracker.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 6ddf38c7262..a00b7fd4fc4 100644 --- 
a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -176,8 +176,9 @@ func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.T // this would be less precised than per sample but it's more efficient var total float64 for k, v := range dict { - t.updateObservations(k, now.Unix(), 0, float64(v), 0, nil, true) - total += float64(v) + count := float64(v) + t.updateObservations(k, now.Unix(), 0, count, 0, nil, true) + total += count } t.updateState(now, 0, total, 0) } From a2ffe5a893c9a9eef26160c760cc6e57e50a3f1d Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 2 Jan 2025 18:32:24 +0100 Subject: [PATCH 069/105] stop using string --- pkg/costattribution/tracker.go | 179 +++++++++++++++++---------------- 1 file changed, 91 insertions(+), 88 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index a00b7fd4fc4..870ad2f26b1 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -38,7 +38,6 @@ type Tracker struct { overflowLabels []string observed map[string]*observation observedMtx sync.RWMutex - hashBuffer []byte isOverflow atomic.Bool overflowCounter *observation totalFailedActiveSeries *atomic.Float64 @@ -65,11 +64,11 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. 
labels: orderedLables, maxCardinality: limit, observed: make(map[string]*observation), - hashBuffer: make([]byte, 0, 1024), cooldownDuration: cooldown, logger: logger, overflowLabels: overflowLabels, totalFailedActiveSeries: atomic.NewFloat64(0), + overflowCounter: &observation{}, } variableLabels := slices.Clone(orderedLables) @@ -122,36 +121,41 @@ func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { } func (t *Tracker) Collect(out chan<- prometheus.Metric) { - if t.isOverflow.Load() { - out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) + var activeSeries float64 + t.observedMtx.RLock() + for _, o := range t.observed { + activeSeries += o.activeSerie.Load() + } + t.observedMtx.RUnlock() + out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, activeSeries+t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) return } - - // Collect metrics for all observed keys + // We don't know the performance of out receiver, so we don't want to hold the lock for too long + var prometheusMetrics []prometheus.Metric t.observedMtx.RLock() - defer t.observedMtx.RUnlock() for key, o := range t.observed { - if key == "" { - continue - } keys := strings.Split(key, string(sep)) - keys = append(keys, t.userID) if o.activeSerie.Load() > 0 { - out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...) 
+ prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...)) } if o.receivedSample.Load() > 0 { - out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...) + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)) } o.discardedSampleMtx.Lock() for reason, discarded := range o.discardedSample { - out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...) + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)) } o.discardedSampleMtx.Unlock() } + t.observedMtx.RUnlock() + + for _, m := range prometheusMetrics { + out <- m + } } func (t *Tracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { @@ -166,10 +170,14 @@ func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.T return } + // We precompute the cost attribution per request before update Observations and State to avoid frequently update the atomic counters dict := make(map[string]int) + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) for _, ts := range req.Timeseries { - lvs := t.extractLabelValuesFromLabelAdapater(ts.Labels) - dict[t.hashLabelValues(lvs)] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) + t.fillKeyFromLabelAdapters(ts.Labels, buf) + dict[string(buf.Bytes())] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) } // Update the observations for each label set and update the state per request, @@ -177,78 +185,97 @@ func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.T 
var total float64 for k, v := range dict { count := float64(v) - t.updateObservations(k, now.Unix(), 0, count, 0, nil, true) + t.updateObservations(k, now, 0, count, 0, nil, true) total += count } - t.updateState(now, 0, total, 0) } func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - labelValues := t.extractLabelValuesFromLabelAdapater(lbls) - key := t.hashLabelValues(labelValues) - t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) - t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) -} - -func (t *Tracker) hashLabelValues(labelValues []string) string { buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - - for i, value := range labelValues { - if i > 0 { - buf.WriteRune(sep) - } - buf.WriteString(value) - } - return buf.String() + t.fillKeyFromLabelAdapters(lbls, buf) + t.updateObservations(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) } -func (t *Tracker) extractLabelValuesFromLabelAdapater(lbls []mimirpb.LabelAdapter) []string { - labelValues := make([]string, len(t.labels)) +func (t *Tracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, buf *bytes.Buffer) { + buf.Reset() + var exists bool for idx, cal := range t.labels { + if idx > 0 { + buf.WriteRune(sep) + } + exists = false for _, l := range lbls { if l.Name == cal { - labelValues[idx] = l.Value + exists = true + buf.WriteString(l.Value) break } } - if labelValues[idx] == "" { - labelValues[idx] = missingValue + if !exists { + buf.WriteString(missingValue) } } - return labelValues } -func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, 
discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - labelValues := make([]string, len(t.labels)) +func (t *Tracker) fillKeyFromLabels(lbls labels.Labels, buf *bytes.Buffer) { + buf.Reset() for idx, cal := range t.labels { - labelValues[idx] = lbls.Get(cal) - if labelValues[idx] == "" { - labelValues[idx] = missingValue + if idx > 0 { + buf.WriteRune(sep) + } + v := lbls.Get(cal) + if v != "" { + buf.WriteString(v) + } else { + buf.WriteString(missingValue) } } - key := t.hashLabelValues(labelValues) - t.updateObservations(key, ts.Unix(), activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) - t.updateState(ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement) +} + +func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + t.fillKeyFromLabels(lbls, buf) + t.updateObservations(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) } // updateObservations updates or creates a new observation in the 'observed' map. 
-func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { +func (t *Tracker) updateObservations(key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { t.observedMtx.RLock() o, known := t.observed[key] t.observedMtx.RUnlock() if !known { - if len(t.observed) < t.maxCardinality*2 && createIfDoesNotExist { - // When createIfDoesNotExist is false, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call - // Otherwise create a new observation for the key - t.createNewObservation(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + if !createIfDoesNotExist { + return } - return + // We don't want to restart the tracker when we are not sure overflow is fixed, so we keep a observation with 2 times the max cardinality, as soon as + // the tracker is still above the max cardinality, we will keep the overflow state. 
+ t.observedMtx.RLock() + if len(t.observed) < t.maxCardinality*2 { + t.observedMtx.RUnlock() + + // If adding the new observation would exceed the max cardinality, we need to update the state, it is fine only call it here + // because we are sure that the new observation will be added to the map + t.updateState(ts) + + // If we are not in overflow mode, we can create a new observation with input values, otherwise we create a new observation with 0 values + if !t.isOverflow.Load() { + t.createNewObservationAndUpdateState(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + return + } + t.createNewObservationAndUpdateState(key, ts, 0, 0, 0, nil) + } else { + t.observedMtx.RUnlock() + } + // If we are in overflow mode (including the case that observed map size exceed 2 times max cardinality), we update the overflow counter + o = t.overflowCounter } - o.lastUpdate.Store(ts) + o.lastUpdate.Store(ts.Unix()) if activeSeriesIncrement != 0 { o.activeSerie.Add(activeSeriesIncrement) } @@ -267,44 +294,20 @@ func (t *Tracker) updateObservations(key string, ts int64, activeSeriesIncrement } // updateState checks if the tracker has exceeded its max cardinality and updates overflow state if necessary. -func (t *Tracker) updateState(ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64) { - previousOverflow := true - t.observedMtx.RLock() - // Transition to overflow mode if maximum cardinality is exceeded. - if !t.isOverflow.Load() && len(t.observed) > t.maxCardinality { - // Make sure that we count current overflow only when state is switched to overflow from normal. - previousOverflow = t.isOverflow.Swap(true) - if !previousOverflow { - // Initialize the overflow counter. - t.overflowCounter = &observation{} - - // Aggregate active series from all keys into the overflow counter. 
- for _, o := range t.observed { - if o != nil { - t.overflowCounter.activeSerie.Add(o.activeSerie.Load()) - } - } +// This function is not thread-safe and should be called with the t.observedMtx read lock. +func (t *Tracker) updateState(ts time.Time) { + if !t.isOverflow.Load() && len(t.observed) >= t.maxCardinality { + // Update state to overflow and set cooldown time + t.isOverflow.Store(true) + if t.cooldownUntil.IsZero() { t.cooldownUntil = ts.Add(t.cooldownDuration) } - } - t.observedMtx.RUnlock() - - if t.isOverflow.Load() { - // if already in overflow mode, update the overflow counter. If it was normal mode, the active series are already applied. - if previousOverflow && activeSeriesIncrement != 0 { - t.overflowCounter.activeSerie.Add(activeSeriesIncrement) - } - if receivedSampleIncrement > 0 { - t.overflowCounter.receivedSample.Add(receivedSampleIncrement) - } - if discardedSampleIncrement > 0 { - t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) - } + t.logger.Log("msg", "tracker is in overflow mode", "userID", t.userID, "maxCardinality", t.maxCardinality) } } -// createNewObservation creates a new observation in the 'observed' map. -func (t *Tracker) createNewObservation(key string, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { +// createNewObservationAndUpdateState creates a new observation in the 'observed' map. Check if the tracker is in overflow mode and updates the state. 
+func (t *Tracker) createNewObservationAndUpdateState(key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { t.observedMtx.Lock() defer t.observedMtx.Unlock() if _, exists := t.observed[key]; exists { @@ -312,7 +315,7 @@ func (t *Tracker) createNewObservation(key string, ts int64, activeSeriesIncreme } t.observed[key] = &observation{ - lastUpdate: *atomic.NewInt64(ts), + lastUpdate: *atomic.NewInt64(ts.Unix()), activeSerie: *atomic.NewFloat64(activeSeriesIncrement), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), discardedSample: make(map[string]*atomic.Float64), From f28d67212d212e1c43b623a47ec887b09d679f82 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 2 Jan 2025 20:11:03 +0100 Subject: [PATCH 070/105] simplify logics --- pkg/costattribution/tracker.go | 76 ++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 870ad2f26b1..720c615d9ac 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -252,27 +252,33 @@ func (t *Tracker) updateObservations(key string, ts time.Time, activeSeriesIncre if !createIfDoesNotExist { return } - // We don't want to restart the tracker when we are not sure overflow is fixed, so we keep a observation with 2 times the max cardinality, as soon as - // the tracker is still above the max cardinality, we will keep the overflow state. 
- t.observedMtx.RLock() - if len(t.observed) < t.maxCardinality*2 { - t.observedMtx.RUnlock() - - // If adding the new observation would exceed the max cardinality, we need to update the state, it is fine only call it here - // because we are sure that the new observation will be added to the map - t.updateState(ts) + createStatus, ob := t.createNewObservationAndUpdateState(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + switch createStatus { + case fullCreate: + return + case alreadyExists: + known = true + o = ob + case exceedLimit: + // If we are in overflow mode (including the case that observed map size exceed 2 times max cardinality), we update the overflow counter + o = t.overflowCounter + case partialCreate: + activeSeriesIncrement = 0 + o = t.overflowCounter + } + } - // If we are not in overflow mode, we can create a new observation with input values, otherwise we create a new observation with 0 values - if !t.isOverflow.Load() { - t.createNewObservationAndUpdateState(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) - return + // Rechecking the known flag since we change above when we seen the observation already exists during creation + if known { + // if we already know the observation, we would increment the active series for sure, but we need to check if it is overflow mode + // if yes, that means we only update active series with the real observation, and we update the overflow counter with the rest + if t.isOverflow.Load() { + if activeSeriesIncrement > 0 { + o.activeSerie.Add(activeSeriesIncrement) + activeSeriesIncrement = 0 } - t.createNewObservationAndUpdateState(key, ts, 0, 0, 0, nil) - } else { - t.observedMtx.RUnlock() + o = t.overflowCounter } - // If we are in overflow mode (including the case that observed map size exceed 2 times max cardinality), we update the overflow counter - o = t.overflowCounter } o.lastUpdate.Store(ts.Unix()) @@ -306,26 +312,52 @@ 
func (t *Tracker) updateState(ts time.Time) { } } +type createStatus int + +const ( + fullCreate createStatus = iota + partialCreate + exceedLimit + alreadyExists +) + // createNewObservationAndUpdateState creates a new observation in the 'observed' map. Check if the tracker is in overflow mode and updates the state. -func (t *Tracker) createNewObservationAndUpdateState(key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { +// returns true if update with not overflow mode, it has been full create with full update +func (t *Tracker) createNewObservationAndUpdateState(key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) (createStatus, *observation) { t.observedMtx.Lock() defer t.observedMtx.Unlock() - if _, exists := t.observed[key]; exists { - return + if o, exists := t.observed[key]; exists { + return alreadyExists, o + } + + // If adding the new observation would exceed the max cardinality, we need to update the state, it is fine only call it here + // because we are sure that the new observation will be added to the map + t.updateState(ts) + + // We don't want to restart the tracker when we are not sure overflow is fixed, so we keep a observation with 2 times the max cardinality, as soon as + // the tracker is still above the max cardinality, we will keep the overflow state. 
+ if len(t.observed) >= 2*t.maxCardinality { + return exceedLimit, nil } t.observed[key] = &observation{ lastUpdate: *atomic.NewInt64(ts.Unix()), activeSerie: *atomic.NewFloat64(activeSeriesIncrement), - receivedSample: *atomic.NewFloat64(receivedSampleIncrement), discardedSample: make(map[string]*atomic.Float64), discardedSampleMtx: sync.Mutex{}, } + // If we are not in overflow mode, we can create a new observation with input values, otherwise we create a new observation with 0 values except for active series + if t.isOverflow.Load() { + return partialCreate, t.observed[key] + } + + t.observed[key].receivedSample = *atomic.NewFloat64(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { t.observed[key].discardedSampleMtx.Lock() t.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) t.observed[key].discardedSampleMtx.Unlock() } + return fullCreate, t.observed[key] } func (t *Tracker) recoveredFromOverflow(deadline time.Time) bool { From a5c394493fef04854ed143e4fa6e7a05d758e5b6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 10 Jan 2025 13:58:43 +0100 Subject: [PATCH 071/105] add new tracker for active series only --- pkg/costattribution/active_tracker.go | 191 ++++++++++ pkg/costattribution/manager.go | 169 ++++++--- pkg/costattribution/manager_test.go | 12 +- pkg/costattribution/sample_tracker.go | 282 +++++++++++++++ pkg/costattribution/tracker.go | 398 --------------------- pkg/distributor/distributor.go | 20 +- pkg/distributor/validate.go | 6 +- pkg/ingester/activeseries/active_series.go | 10 +- 8 files changed, 610 insertions(+), 478 deletions(-) create mode 100644 pkg/costattribution/active_tracker.go create mode 100644 pkg/costattribution/sample_tracker.go delete mode 100644 pkg/costattribution/tracker.go diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go new file mode 100644 index 00000000000..8487cc71e81 --- /dev/null +++ 
b/pkg/costattribution/active_tracker.go @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "bytes" + "fmt" + "slices" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "go.uber.org/atomic" +) + +type ActiveSeriesTracker struct { + userID string + labels []string + maxCardinality int + activeSeriesPerUserAttribution *prometheus.Desc + overflowLabels []string + observed map[string]*atomic.Int64 + observedMtx sync.RWMutex + overflowSince atomic.Int64 + overflowCounter atomic.Int64 + cooldownDuration time.Duration + logger log.Logger +} + +func newActiveSeriesTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *ActiveSeriesTracker { + orderedLables := slices.Clone(trackedLabels) + slices.Sort(orderedLables) + + // Create a map for overflow labels to export when overflow happens + overflowLabels := make([]string, len(orderedLables)+2) + for i := range orderedLables { + overflowLabels[i] = overflowValue + } + + overflowLabels[len(orderedLables)] = userID + overflowLabels[len(orderedLables)+1] = overflowValue + + tracker := &ActiveSeriesTracker{ + userID: userID, + labels: orderedLables, + maxCardinality: limit, + observed: make(map[string]*atomic.Int64), + logger: logger, + overflowLabels: overflowLabels, + } + + variableLabels := slices.Clone(orderedLables) + variableLabels = append(variableLabels, tenantLabel, "reason") + + tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", + "The total number of active series per user and attribution.", variableLabels[:len(variableLabels)-1], + prometheus.Labels{trackerLabel: defaultTrackerName}) + + return tracker +} + +func (t *ActiveSeriesTracker) hasSameLabels(labels []string) bool { + return slices.Equal(t.labels, labels) +} + +func (t *ActiveSeriesTracker) 
Increment(lbls labels.Labels, now time.Time) { + if t == nil { + return + } + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + t.fillKeyFromLabels(lbls, buf) + t.observedMtx.RLock() + as, ok := t.observed[string(buf.Bytes())] + if ok { + as.Inc() + t.observedMtx.RUnlock() + return + } + t.observedMtx.RUnlock() + + if t.overflowSince.Load() > 0 { + t.overflowCounter.Inc() + return + } + + t.observedMtx.Lock() + defer t.observedMtx.Unlock() + as, ok = t.observed[string(buf.Bytes())] + if ok { + as.Inc() + return + } + + if t.overflowSince.Load() > 0 { + t.overflowCounter.Inc() + return + } + + if len(t.observed) >= t.maxCardinality { + t.overflowSince.Store(now.Unix()) + t.overflowCounter.Inc() + return + } + + t.observed[string(buf.Bytes())] = atomic.NewInt64(1) + +} + +func (t *ActiveSeriesTracker) Decrement(lbls labels.Labels) { + if t == nil { + return + } + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + t.fillKeyFromLabels(lbls, buf) + t.observedMtx.RLock() + as, ok := t.observed[string(buf.Bytes())] + if ok { + nv := as.Dec() + if nv > 0 { + t.observedMtx.RUnlock() + return + } + t.observedMtx.RUnlock() + t.observedMtx.Lock() + as, ok := t.observed[string(buf.Bytes())] + if ok && as.Load() == 0 { + // use buf.String() instead of string(buf.Bytes()) to fix the lint issue + delete(t.observed, buf.String()) + } + t.observedMtx.Unlock() + return + } + t.observedMtx.RUnlock() + + if t.overflowSince.Load() > 0 { + t.overflowCounter.Dec() + return + } + + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() + panic(fmt.Errorf("decrementing non-existent active series: labels=%v, cost attribution keys: %v, the current observation map length: %d, the current cost attribution key: %s", lbls, t.labels, len(t.observed), buf.String())) +} + +func (t *ActiveSeriesTracker) Collect(out chan<- prometheus.Metric) { + if t.overflowSince.Load() > 0 { + var activeSeries int64 + t.observedMtx.RLock() + for _, as 
:= range t.observed { + activeSeries += as.Load() + } + t.observedMtx.RUnlock() + out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, float64(activeSeries+t.overflowCounter.Load()), t.overflowLabels[:len(t.overflowLabels)-1]...) + return + } + // We don't know the performance of out receiver, so we don't want to hold the lock for too long + var prometheusMetrics []prometheus.Metric + t.observedMtx.RLock() + for key, as := range t.observed { + keys := strings.Split(key, string(sep)) + keys = append(keys, t.userID) + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, float64(as.Load()), keys...)) + } + t.observedMtx.RUnlock() + + for _, m := range prometheusMetrics { + out <- m + } +} + +func (t *ActiveSeriesTracker) fillKeyFromLabels(lbls labels.Labels, buf *bytes.Buffer) { + buf.Reset() + for idx, cal := range t.labels { + if idx > 0 { + buf.WriteRune(sep) + } + v := lbls.Get(cal) + if v != "" { + buf.WriteString(v) + } else { + buf.WriteString(missingValue) + } + } +} diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 41b5f578f70..974d1e0e7ed 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -25,25 +25,32 @@ const ( type Manager struct { services.Service - logger log.Logger - inactiveTimeout time.Duration - limits *validation.Overrides - - mtx sync.RWMutex - trackersByUserID map[string]*Tracker - reg *prometheus.Registry - cleanupInterval time.Duration + logger log.Logger + limits *validation.Overrides + reg *prometheus.Registry + + mstx sync.RWMutex + sampleTrackersByUserID map[string]*SampleTracker + inactiveTimeout time.Duration + cleanupInterval time.Duration + + matx sync.RWMutex + activeTrackersByUserID map[string]*ActiveSeriesTracker } func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg 
*prometheus.Registry) (*Manager, error) { m := &Manager{ - trackersByUserID: make(map[string]*Tracker), - limits: limits, - mtx: sync.RWMutex{}, - inactiveTimeout: inactiveTimeout, - logger: logger, - reg: reg, - cleanupInterval: cleanupInterval, + mstx: sync.RWMutex{}, + sampleTrackersByUserID: make(map[string]*SampleTracker), + + matx: sync.RWMutex{}, + activeTrackersByUserID: make(map[string]*ActiveSeriesTracker), + + limits: limits, + inactiveTimeout: inactiveTimeout, + logger: logger, + reg: reg, + cleanupInterval: cleanupInterval, } m.Service = services.NewTimerService(cleanupInterval, nil, m.iteration, nil).WithName("cost attribution manager") @@ -64,15 +71,15 @@ func (m *Manager) enabledForUser(userID string) bool { return len(m.limits.CostAttributionLabels(userID)) > 0 } -func (m *Manager) Tracker(userID string) *Tracker { +func (m *Manager) SampleTracker(userID string) *SampleTracker { if !m.enabledForUser(userID) { return nil } // Check if the tracker already exists, if exists return it. Otherwise lock and create a new tracker. 
- m.mtx.RLock() - tracker, exists := m.trackersByUserID[userID] - m.mtx.RUnlock() + m.mstx.RLock() + tracker, exists := m.sampleTrackersByUserID[userID] + m.mstx.RUnlock() if exists { return tracker } @@ -82,22 +89,56 @@ func (m *Manager) Tracker(userID string) *Tracker { maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) cooldownDuration := m.limits.CostAttributionCooldown(userID) - m.mtx.Lock() - defer m.mtx.Unlock() - if tracker, exists = m.trackersByUserID[userID]; exists { + m.mstx.Lock() + defer m.mstx.Unlock() + if tracker, exists = m.sampleTrackersByUserID[userID]; exists { return tracker } - tracker = newTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) - m.trackersByUserID[userID] = tracker + tracker = newSampleTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) + m.sampleTrackersByUserID[userID] = tracker + return tracker +} + +func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { + if !m.enabledForUser(userID) { + return nil + } + + // Check if the tracker already exists, if exists return it. Otherwise lock and create a new tracker. + m.matx.RLock() + tracker, exists := m.activeTrackersByUserID[userID] + m.matx.RUnlock() + if exists { + return tracker + } + + // We need to create a new tracker, get all the necessary information from the limits before locking and creating the tracker. 
+ labels := m.limits.CostAttributionLabels(userID) + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + cooldownDuration := m.limits.CostAttributionCooldown(userID) + + m.matx.Lock() + defer m.matx.Unlock() + if tracker, exists = m.activeTrackersByUserID[userID]; exists { + return tracker + } + tracker = newActiveSeriesTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) + m.activeTrackersByUserID[userID] = tracker return tracker } func (m *Manager) Collect(out chan<- prometheus.Metric) { - m.mtx.RLock() - defer m.mtx.RUnlock() - for _, tracker := range m.trackersByUserID { + m.mstx.RLock() + for _, tracker := range m.sampleTrackersByUserID { + tracker.Collect(out) + } + m.mstx.RUnlock() + + m.matx.RLock() + for _, tracker := range m.activeTrackersByUserID { tracker.Collect(out) } + m.matx.RUnlock() } func (m *Manager) Describe(chan<- *prometheus.Desc) { @@ -105,61 +146,77 @@ func (m *Manager) Describe(chan<- *prometheus.Desc) { // For more details, refer to the documentation: https://pkg.go.dev/github.com/prometheus/client_golang/prometheus#hdr-Custom_Collectors_and_constant_Metrics } -func (m *Manager) deleteTracker(userID string) { - m.mtx.Lock() - defer m.mtx.Unlock() - delete(m.trackersByUserID, userID) +func (m *Manager) deleteSampleTracker(userID string) { + m.mstx.Lock() + delete(m.sampleTrackersByUserID, userID) + m.mstx.Unlock() +} + +func (m *Manager) deleteActiveTracker(userID string) { + m.matx.Lock() + delete(m.activeTrackersByUserID, userID) + m.matx.Unlock() } -func (m *Manager) updateTracker(userID string) *Tracker { +func (m *Manager) updateTracker(userID string) (*SampleTracker, *ActiveSeriesTracker) { if !m.enabledForUser(userID) { - m.deleteTracker(userID) - return nil + m.deleteSampleTracker(userID) + m.deleteActiveTracker(userID) + return nil, nil } - t := m.Tracker(userID) + st := m.SampleTracker(userID) + at := m.ActiveSeriesTracker(userID) lbls := slices.Clone(m.limits.CostAttributionLabels(userID)) // 
sort the labels to ensure the order is consistent slices.Sort(lbls) // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker - if !t.hasSameLabels(lbls) || t.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || t.cooldownDuration != m.limits.CostAttributionCooldown(userID) { - m.mtx.Lock() - t = newTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) - m.trackersByUserID[userID] = t - m.mtx.Unlock() - return t + if !st.hasSameLabels(lbls) || st.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || st.cooldownDuration != m.limits.CostAttributionCooldown(userID) { + m.mstx.Lock() + st = newSampleTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.sampleTrackersByUserID[userID] = st + m.mstx.Unlock() } - return t + if !at.hasSameLabels(lbls) || at.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || at.cooldownDuration != m.limits.CostAttributionCooldown(userID) { + m.matx.Lock() + at = newActiveSeriesTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + m.activeTrackersByUserID[userID] = at + m.matx.Unlock() + } + + return st, at } func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { - m.mtx.RLock() - userIDs := make([]string, 0, len(m.trackersByUserID)) - for userID := range m.trackersByUserID { + m.mstx.RLock() + userIDs := make([]string, 0, len(m.sampleTrackersByUserID)) + for userID := range m.sampleTrackersByUserID { userIDs = append(userIDs, userID) } - m.mtx.RUnlock() + m.mstx.RUnlock() for _, userID := range userIDs { - t := m.updateTracker(userID) - if t == nil { + st, at := m.updateTracker(userID) + if st == nil && at == nil { continue } - invalidKeys := t.inactiveObservations(deadline) + 
invalidKeys := st.inactiveObservations(deadline) for _, key := range invalidKeys { - t.cleanupTrackerAttribution(key) + st.cleanupTrackerAttribution(key) + } + + // only sample tracker can recovered from overflow, the activeseries tracker after the cooldown would just be deleted and recreated + if st.recoveredFromOverflow(deadline) { + m.deleteSampleTracker(userID) } - if t.recoveredFromOverflow(deadline) { - // We delete the current tracker here, - // this will cause the creation of a new one later. - // ActiveSeries tracker compares the pointer of the tracker, - // and this change will cause a reload there. - m.deleteTracker(userID) + // if the activeseries tracker has been in overflow for more than the cooldown duration, delete it + if at.overflowSince.Load() > 0 && time.Unix(at.overflowSince.Load(), 0).Add(at.cooldownDuration).Before(deadline) { + m.deleteActiveTracker(userID) } } return nil diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index de6a17c0512..4374770f050 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -30,7 +30,7 @@ func newTestManager() *Manager { func TestManager_New(t *testing.T) { manager := newTestManager() assert.NotNil(t, manager) - assert.NotNil(t, manager.trackersByUserID) + assert.NotNil(t, manager.sampleTrackersByUserID) assert.Equal(t, 10*time.Second, manager.inactiveTimeout) } @@ -84,7 +84,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { manager.limits, err = testutils.NewMockCostAttributionLimits(1) assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0))) - assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.Equal(t, 1, len(manager.sampleTrackersByUserID)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
@@ -99,7 +99,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { manager.limits, err = testutils.NewMockCostAttributionLimits(2) assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0))) - assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.Equal(t, 1, len(manager.sampleTrackersByUserID)) assert.True(t, manager.Tracker("user3").hasSameLabels([]string{"feature", "team"})) manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(13, 0)) @@ -133,7 +133,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge before inactive timeout", func(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0))) - assert.Equal(t, 2, len(manager.trackersByUserID)) + assert.Equal(t, 2, len(manager.sampleTrackersByUserID)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
@@ -150,7 +150,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) // User3's tracker should remain since it's active, user1's tracker should be removed - assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") + assert.Equal(t, 1, len(manager.sampleTrackersByUserID), "Expected one active tracker after purging") assert.Nil(t, manager.Tracker("user1"), "Expected user1 tracker to be purged") expectedMetrics := ` @@ -166,7 +166,7 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(20, 0))) // Tracker would stay at 1 since user1's tracker is disabled - assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") + assert.Equal(t, 1, len(manager.sampleTrackersByUserID), "Expected one active tracker after full purge") // No metrics should remain after all purged assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go new file mode 100644 index 00000000000..6369349ad21 --- /dev/null +++ b/pkg/costattribution/sample_tracker.go @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "bytes" + "slices" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/atomic" + + "github.com/grafana/mimir/pkg/mimirpb" +) + +const sep = rune(0x80) + +type observation struct { + lastUpdate atomic.Int64 + receivedSample atomic.Float64 + discardedSampleMtx sync.Mutex + discardedSample map[string]*atomic.Float64 + totalDiscarded atomic.Float64 +} + +type SampleTracker struct { + userID string + labels []string + maxCardinality int + 
receivedSamplesAttribution *prometheus.Desc + discardedSampleAttribution *prometheus.Desc + overflowLabels []string + observed map[string]*observation + observedMtx sync.RWMutex + overflowSince atomic.Int64 + overflowCounter *observation + cooldownDuration time.Duration + logger log.Logger +} + +func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *SampleTracker { + orderedLables := slices.Clone(trackedLabels) + slices.Sort(orderedLables) + + // Create a map for overflow labels to export when overflow happens + overflowLabels := make([]string, len(orderedLables)+2) + for i := range orderedLables { + overflowLabels[i] = overflowValue + } + + overflowLabels[len(orderedLables)] = userID + overflowLabels[len(orderedLables)+1] = overflowValue + + tracker := &SampleTracker{ + userID: userID, + labels: orderedLables, + maxCardinality: limit, + observed: make(map[string]*observation), + cooldownDuration: cooldown, + logger: logger, + overflowLabels: overflowLabels, + overflowCounter: &observation{}, + } + + variableLabels := slices.Clone(orderedLables) + variableLabels = append(variableLabels, tenantLabel, "reason") + tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", + "The total number of samples that were discarded per attribution.", + variableLabels, + prometheus.Labels{trackerLabel: defaultTrackerName}) + + tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", + "The total number of samples that were received per attribution.", + variableLabels[:len(variableLabels)-1], + prometheus.Labels{trackerLabel: defaultTrackerName}) + return tracker +} + +func (t *SampleTracker) hasSameLabels(labels []string) bool { + return slices.Equal(t.labels, labels) +} + +var bufferPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, +} + +func (t *SampleTracker) cleanupTrackerAttribution(key string) { + 
t.observedMtx.Lock() + defer t.observedMtx.Unlock() + delete(t.observed, key) +} + +func (t *SampleTracker) Collect(out chan<- prometheus.Metric) { + if t.overflowSince.Load() > 0 { + t.observedMtx.RLock() + t.observedMtx.RUnlock() + out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) + out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) + return + } + // We don't know the performance of out receiver, so we don't want to hold the lock for too long + var prometheusMetrics []prometheus.Metric + t.observedMtx.RLock() + for key, o := range t.observed { + keys := strings.Split(key, string(sep)) + keys = append(keys, t.userID) + if o.receivedSample.Load() > 0 { + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)) + } + o.discardedSampleMtx.Lock() + for reason, discarded := range o.discardedSample { + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)) + } + o.discardedSampleMtx.Unlock() + } + t.observedMtx.RUnlock() + + for _, m := range prometheusMetrics { + out <- m + } +} + +func (t *SampleTracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { + if t == nil { + return + } + t.updateCountersWithLabelAdapter(lbs, now, 0, 0, value, &reason) +} + +func (t *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { + if t == nil { + return + } + + // We precompute the cost attribution per request before update Observations and State to avoid frequently update the atomic counters + dict := make(map[string]int) + buf := 
bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + for _, ts := range req.Timeseries { + t.fillKeyFromLabelAdapters(ts.Labels, buf) + dict[string(buf.Bytes())] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) + } + + // Update the observations for each label set and update the state per request, + // this would be less precised than per sample but it's more efficient + var total float64 + for k, v := range dict { + count := float64(v) + t.updateObservations(k, now, count, 0, nil) + total += count + } +} + +func (t *SampleTracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + t.fillKeyFromLabelAdapters(lbls, buf) + t.updateObservations(buf.String(), ts, receivedSampleIncrement, discardedSampleIncrement, reason) +} + +func (t *SampleTracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, buf *bytes.Buffer) { + buf.Reset() + var exists bool + for idx, cal := range t.labels { + if idx > 0 { + buf.WriteRune(sep) + } + exists = false + for _, l := range lbls { + if l.Name == cal { + exists = true + buf.WriteString(l.Value) + break + } + } + if !exists { + buf.WriteString(missingValue) + } + } +} + +// updateObservations updates or creates a new observation in the 'observed' map. 
+func (t *SampleTracker) updateObservations(key string, ts time.Time, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { + // if overflowSince is set, we only update the overflow counter + if t.overflowSince.Load() > 0 { + t.overflowCounter.receivedSample.Add(receivedSampleIncrement) + if discardedSampleIncrement > 0 && reason != nil { + t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) + } + return + } + + // if not overflow, we need to check if the key exists in the observed map, + // if yes, we update the observation, otherwise we create a new observation, and set the overflowSince if the max cardinality is exceeded + t.observedMtx.Lock() + defer t.observedMtx.Unlock() + o, known := t.observed[key] + if known && t.overflowSince.Load() == 0 { + o.lastUpdate.Store(ts.Unix()) + o.receivedSample.Add(receivedSampleIncrement) + if discardedSampleIncrement > 0 && reason != nil { + o.discardedSampleMtx.Lock() + if _, ok := o.discardedSample[*reason]; ok { + o.discardedSample[*reason].Add(discardedSampleIncrement) + } else { + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + } + o.discardedSampleMtx.Unlock() + } + return + } + + if t.overflowSince.Load() > 0 { + t.overflowCounter.receivedSample.Add(receivedSampleIncrement) + if discardedSampleIncrement > 0 && reason != nil { + t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) + } + return + } + + // If adding the new observation would exceed the max cardinality, we need to update the state + t.observed[key] = &observation{ + lastUpdate: *atomic.NewInt64(ts.Unix()), + discardedSample: make(map[string]*atomic.Float64), + receivedSample: *atomic.NewFloat64(receivedSampleIncrement), + discardedSampleMtx: sync.Mutex{}, + } + + if discardedSampleIncrement > 0 && reason != nil { + t.observed[key].discardedSampleMtx.Lock() + t.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + 
t.observed[key].discardedSampleMtx.Unlock() + } + + if len(t.observed) >= t.maxCardinality { + t.overflowSince.Store(ts.Unix()) + } +} + +func (t *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { + t.observedMtx.RLock() + if t.overflowSince.Load() > 0 && time.Unix(t.overflowSince.Load(), 0).Add(t.cooldownDuration).Before(deadline) { + if len(t.observed) <= t.maxCardinality { + t.observedMtx.RUnlock() + return true + } + t.observedMtx.RUnlock() + + // Increase the cooldown duration if the number of observations is still above the max cardinality + t.observedMtx.Lock() + if len(t.observed) <= t.maxCardinality { + t.observedMtx.Unlock() + return true + } + t.overflowSince.Store(deadline.Unix()) + t.observedMtx.Unlock() + } else { + t.observedMtx.RUnlock() + } + return false +} + +func (t *SampleTracker) inactiveObservations(deadline time.Time) []string { + // otherwise, we need to check all observations and clean up the ones that are inactive + var invalidKeys []string + t.observedMtx.RLock() + defer t.observedMtx.RUnlock() + for labkey, ob := range t.observed { + if ob != nil && ob.lastUpdate.Load() <= deadline.Unix() { + invalidKeys = append(invalidKeys, labkey) + } + } + + return invalidKeys +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go deleted file mode 100644 index 720c615d9ac..00000000000 --- a/pkg/costattribution/tracker.go +++ /dev/null @@ -1,398 +0,0 @@ -// SPDX-License-Identifier: AGPL-3.0-only - -package costattribution - -import ( - "bytes" - "slices" - "strings" - "sync" - "time" - - "github.com/go-kit/log" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/prometheus/model/labels" - "go.uber.org/atomic" - - "github.com/grafana/mimir/pkg/mimirpb" -) - -const sep = rune(0x80) - -type observation struct { - lastUpdate atomic.Int64 - activeSerie atomic.Float64 - receivedSample atomic.Float64 - discardedSampleMtx sync.Mutex - discardedSample map[string]*atomic.Float64 - totalDiscarded 
atomic.Float64 -} - -type Tracker struct { - userID string - labels []string - maxCardinality int - activeSeriesPerUserAttribution *prometheus.Desc - receivedSamplesAttribution *prometheus.Desc - discardedSampleAttribution *prometheus.Desc - overflowLabels []string - observed map[string]*observation - observedMtx sync.RWMutex - isOverflow atomic.Bool - overflowCounter *observation - totalFailedActiveSeries *atomic.Float64 - cooldownDuration time.Duration - cooldownUntil time.Time - logger log.Logger -} - -func newTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *Tracker { - orderedLables := slices.Clone(trackedLabels) - slices.Sort(orderedLables) - - // Create a map for overflow labels to export when overflow happens - overflowLabels := make([]string, len(orderedLables)+2) - for i := range orderedLables { - overflowLabels[i] = overflowValue - } - - overflowLabels[len(orderedLables)] = userID - overflowLabels[len(orderedLables)+1] = overflowValue - - tracker := &Tracker{ - userID: userID, - labels: orderedLables, - maxCardinality: limit, - observed: make(map[string]*observation), - cooldownDuration: cooldown, - logger: logger, - overflowLabels: overflowLabels, - totalFailedActiveSeries: atomic.NewFloat64(0), - overflowCounter: &observation{}, - } - - variableLabels := slices.Clone(orderedLables) - variableLabels = append(variableLabels, tenantLabel, "reason") - tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", - "The total number of samples that were discarded per attribution.", - variableLabels, - prometheus.Labels{trackerLabel: defaultTrackerName}) - - tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", - "The total number of samples that were received per attribution.", - variableLabels[:len(variableLabels)-1], - prometheus.Labels{trackerLabel: defaultTrackerName}) - - tracker.activeSeriesPerUserAttribution = 
prometheus.NewDesc("cortex_ingester_attributed_active_series", - "The total number of active series per user and attribution.", variableLabels[:len(variableLabels)-1], - prometheus.Labels{trackerLabel: defaultTrackerName}) - - return tracker -} - -func (t *Tracker) hasSameLabels(labels []string) bool { - return slices.Equal(t.labels, labels) -} - -var bufferPool = sync.Pool{ - New: func() interface{} { - return new(bytes.Buffer) - }, -} - -func (t *Tracker) cleanupTrackerAttribution(key string) { - t.observedMtx.Lock() - defer t.observedMtx.Unlock() - delete(t.observed, key) -} - -func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { - if t == nil { - return - } - t.updateCounters(lbs, now, 1, 0, 0, nil, true) -} - -func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { - if t == nil { - return - } - t.updateCounters(lbs, time.Time{}, -1, 0, 0, nil, false) -} - -func (t *Tracker) Collect(out chan<- prometheus.Metric) { - if t.isOverflow.Load() { - var activeSeries float64 - t.observedMtx.RLock() - for _, o := range t.observed { - activeSeries += o.activeSerie.Load() - } - t.observedMtx.RUnlock() - out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, activeSeries+t.overflowCounter.activeSerie.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) - out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) - out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) 
- return - } - // We don't know the performance of out receiver, so we don't want to hold the lock for too long - var prometheusMetrics []prometheus.Metric - t.observedMtx.RLock() - for key, o := range t.observed { - keys := strings.Split(key, string(sep)) - keys = append(keys, t.userID) - if o.activeSerie.Load() > 0 { - prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, o.activeSerie.Load(), keys...)) - } - if o.receivedSample.Load() > 0 { - prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)) - } - o.discardedSampleMtx.Lock() - for reason, discarded := range o.discardedSample { - prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)) - } - o.discardedSampleMtx.Unlock() - } - t.observedMtx.RUnlock() - - for _, m := range prometheusMetrics { - out <- m - } -} - -func (t *Tracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { - if t == nil { - return - } - t.updateCountersWithLabelAdapter(lbs, now, 0, 0, value, &reason, true) -} - -func (t *Tracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { - if t == nil { - return - } - - // We precompute the cost attribution per request before update Observations and State to avoid frequently update the atomic counters - dict := make(map[string]int) - buf := bufferPool.Get().(*bytes.Buffer) - buf.Reset() - defer bufferPool.Put(buf) - for _, ts := range req.Timeseries { - t.fillKeyFromLabelAdapters(ts.Labels, buf) - dict[string(buf.Bytes())] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) - } - - // Update the observations for each label set and update the state per request, - // this would be less precised than per sample but 
it's more efficient - var total float64 - for k, v := range dict { - count := float64(v) - t.updateObservations(k, now, 0, count, 0, nil, true) - total += count - } -} - -func (t *Tracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - buf := bufferPool.Get().(*bytes.Buffer) - buf.Reset() - defer bufferPool.Put(buf) - t.fillKeyFromLabelAdapters(lbls, buf) - t.updateObservations(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) -} - -func (t *Tracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, buf *bytes.Buffer) { - buf.Reset() - var exists bool - for idx, cal := range t.labels { - if idx > 0 { - buf.WriteRune(sep) - } - exists = false - for _, l := range lbls { - if l.Name == cal { - exists = true - buf.WriteString(l.Value) - break - } - } - if !exists { - buf.WriteString(missingValue) - } - } -} - -func (t *Tracker) fillKeyFromLabels(lbls labels.Labels, buf *bytes.Buffer) { - buf.Reset() - for idx, cal := range t.labels { - if idx > 0 { - buf.WriteRune(sep) - } - v := lbls.Get(cal) - if v != "" { - buf.WriteString(v) - } else { - buf.WriteString(missingValue) - } - } -} - -func (t *Tracker) updateCounters(lbls labels.Labels, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - buf := bufferPool.Get().(*bytes.Buffer) - buf.Reset() - defer bufferPool.Put(buf) - t.fillKeyFromLabels(lbls, buf) - t.updateObservations(buf.String(), ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason, createIfDoesNotExist) -} - -// updateObservations updates or creates a new observation in the 'observed' map. 
-func (t *Tracker) updateObservations(key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string, createIfDoesNotExist bool) { - t.observedMtx.RLock() - o, known := t.observed[key] - t.observedMtx.RUnlock() - - if !known { - if !createIfDoesNotExist { - return - } - createStatus, ob := t.createNewObservationAndUpdateState(key, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) - switch createStatus { - case fullCreate: - return - case alreadyExists: - known = true - o = ob - case exceedLimit: - // If we are in overflow mode (including the case that observed map size exceed 2 times max cardinality), we update the overflow counter - o = t.overflowCounter - case partialCreate: - activeSeriesIncrement = 0 - o = t.overflowCounter - } - } - - // Rechecking the known flag since we change above when we seen the observation already exists during creation - if known { - // if we already know the observation, we would increment the active series for sure, but we need to check if it is overflow mode - // if yes, that means we only update active series with the real observation, and we update the overflow counter with the rest - if t.isOverflow.Load() { - if activeSeriesIncrement > 0 { - o.activeSerie.Add(activeSeriesIncrement) - activeSeriesIncrement = 0 - } - o = t.overflowCounter - } - } - - o.lastUpdate.Store(ts.Unix()) - if activeSeriesIncrement != 0 { - o.activeSerie.Add(activeSeriesIncrement) - } - if receivedSampleIncrement > 0 { - o.receivedSample.Add(receivedSampleIncrement) - } - if discardedSampleIncrement > 0 && reason != nil { - o.discardedSampleMtx.Lock() - if _, ok := o.discardedSample[*reason]; ok { - o.discardedSample[*reason].Add(discardedSampleIncrement) - } else { - o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) - } - o.discardedSampleMtx.Unlock() - } -} - -// updateState checks if the tracker has exceeded its max cardinality and 
updates overflow state if necessary. -// This function is not thread-safe and should be called with the t.observedMtx read lock. -func (t *Tracker) updateState(ts time.Time) { - if !t.isOverflow.Load() && len(t.observed) >= t.maxCardinality { - // Update state to overflow and set cooldown time - t.isOverflow.Store(true) - if t.cooldownUntil.IsZero() { - t.cooldownUntil = ts.Add(t.cooldownDuration) - } - t.logger.Log("msg", "tracker is in overflow mode", "userID", t.userID, "maxCardinality", t.maxCardinality) - } -} - -type createStatus int - -const ( - fullCreate createStatus = iota - partialCreate - exceedLimit - alreadyExists -) - -// createNewObservationAndUpdateState creates a new observation in the 'observed' map. Check if the tracker is in overflow mode and updates the state. -// returns true if update with not overflow mode, it has been full create with full update -func (t *Tracker) createNewObservationAndUpdateState(key string, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) (createStatus, *observation) { - t.observedMtx.Lock() - defer t.observedMtx.Unlock() - if o, exists := t.observed[key]; exists { - return alreadyExists, o - } - - // If adding the new observation would exceed the max cardinality, we need to update the state, it is fine only call it here - // because we are sure that the new observation will be added to the map - t.updateState(ts) - - // We don't want to restart the tracker when we are not sure overflow is fixed, so we keep a observation with 2 times the max cardinality, as soon as - // the tracker is still above the max cardinality, we will keep the overflow state. 
- if len(t.observed) >= 2*t.maxCardinality { - return exceedLimit, nil - } - - t.observed[key] = &observation{ - lastUpdate: *atomic.NewInt64(ts.Unix()), - activeSerie: *atomic.NewFloat64(activeSeriesIncrement), - discardedSample: make(map[string]*atomic.Float64), - discardedSampleMtx: sync.Mutex{}, - } - // If we are not in overflow mode, we can create a new observation with input values, otherwise we create a new observation with 0 values except for active series - if t.isOverflow.Load() { - return partialCreate, t.observed[key] - } - - t.observed[key].receivedSample = *atomic.NewFloat64(receivedSampleIncrement) - if discardedSampleIncrement > 0 && reason != nil { - t.observed[key].discardedSampleMtx.Lock() - t.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) - t.observed[key].discardedSampleMtx.Unlock() - } - return fullCreate, t.observed[key] -} - -func (t *Tracker) recoveredFromOverflow(deadline time.Time) bool { - t.observedMtx.RLock() - if !t.cooldownUntil.IsZero() && t.cooldownUntil.Before(deadline) { - if len(t.observed) <= t.maxCardinality { - t.observedMtx.RUnlock() - return true - } - t.observedMtx.RUnlock() - - // Increase the cooldown duration if the number of observations is still above the max cardinality - t.observedMtx.Lock() - if len(t.observed) <= t.maxCardinality { - t.observedMtx.Unlock() - return true - } - t.cooldownUntil = deadline.Add(t.cooldownDuration) - t.observedMtx.Unlock() - } else { - t.observedMtx.RUnlock() - } - return false -} - -func (t *Tracker) inactiveObservations(deadline time.Time) []string { - // otherwise, we need to check all observations and clean up the ones that are inactive - var invalidKeys []string - t.observedMtx.RLock() - defer t.observedMtx.RUnlock() - for labkey, ob := range t.observed { - if ob != nil && ob.lastUpdate.Load() <= deadline.Unix() { - invalidKeys = append(invalidKeys, labkey) - } - } - - return invalidKeys -} diff --git a/pkg/distributor/distributor.go 
b/pkg/distributor/distributor.go index 2a1d376ed90..14129af7755 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -751,9 +751,9 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese return nil } - cat := d.costAttributionMgr.Tracker(userID) + cast := d.costAttributionMgr.SampleTracker(userID) if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) + return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cast) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -767,7 +767,7 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese } timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cast); err != nil { return err } @@ -792,9 +792,9 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim return nil } - cat := d.costAttributionMgr.Tracker(userID) + cast := d.costAttributionMgr.SampleTracker(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cast) if err != nil { return err } @@ -815,7 +815,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, 
ts.Labels, &ts.Histograms[idx], cast) if err != nil { return err } @@ -879,8 +879,8 @@ func (d *Distributor) validateExemplars(ts *mimirpb.PreallocTimeseries, userID s // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) (bool, error) { - cat := d.costAttributionMgr.Tracker(userID) - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { + cast := d.costAttributionMgr.SampleTracker(userID) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cast, nowt); err != nil { return true, err } @@ -988,7 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(numSamples), reasonTooManyHAClusters, now) + d.costAttributionMgr.SampleTracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1833,7 +1833,7 @@ func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID st receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) } - d.costAttributionMgr.Tracker(userID).IncrementReceivedSamples(req, mtime.Now()) + d.costAttributionMgr.SampleTracker(userID).IncrementReceivedSamples(req, mtime.Now()) receivedMetadata = len(req.Metadata) d.receivedSamples.WithLabelValues(userID).Add(float64(receivedSamples)) diff 
--git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 698da62eab1..9f42acf1eba 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -239,7 +239,7 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.SampleTracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) @@ -260,7 +260,7 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.SampleTracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() @@ -400,7 +400,7 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. -func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.SampleTracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { cat.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index a5261e17031..1e3002fc21a 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -51,7 +51,7 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastConfigUpdate time.Time - cat *costattribution.Tracker + cat *costattribution.SampleTracker // The duration after which series become inactive. 
// Also used to determine if enough time has passed since configuration reload for valid results. @@ -78,7 +78,7 @@ type seriesStripe struct { activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - cat *costattribution.Tracker + cat *costattribution.SampleTracker } // seriesEntry holds a timestamp for single series. @@ -90,7 +90,7 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.Tracker) *ActiveSeries { +func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.SampleTracker) *ActiveSeries { c := &ActiveSeries{matchers: asm, timeout: timeout, cat: cat} // Stripes are pre-allocated so that we only read on them and no lock is required. @@ -107,7 +107,7 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { return c.matchers.MatcherNames() } -func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.SampleTracker) bool { c.configMutex.RLock() defer c.configMutex.RUnlock() return ctCfg.String() != c.matchers.Config().String() || caCfg != c.cat @@ -430,7 +430,7 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. 
-func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.Tracker) { +func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.SampleTracker) { s.mu.Lock() defer s.mu.Unlock() s.deleted = deleted From aee90499f0b5692bfff0f459e129784649ef3157 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 13 Jan 2025 16:41:16 +0100 Subject: [PATCH 072/105] add new sample tracker vs active series tracker --- pkg/costattribution/active_tracker.go | 121 ++++++++-------- pkg/costattribution/manager.go | 12 +- pkg/costattribution/manager_test.go | 31 +++-- pkg/costattribution/sample_tracker.go | 129 +++++++++--------- pkg/costattribution/tracker_test.go | 93 +++++++------ pkg/distributor/distributor.go | 2 +- pkg/distributor/validate_test.go | 4 +- pkg/ingester/activeseries/active_series.go | 30 ++-- .../activeseries/active_series_test.go | 2 +- pkg/ingester/ingester.go | 37 ++--- 10 files changed, 236 insertions(+), 225 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 8487cc71e81..5792d4df451 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -30,7 +30,7 @@ type ActiveSeriesTracker struct { logger log.Logger } -func newActiveSeriesTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *ActiveSeriesTracker { +func newActiveSeriesTracker(userID string, trackedLabels []string, limit int, cooldownDuration time.Duration, logger log.Logger) *ActiveSeriesTracker { orderedLables := slices.Clone(trackedLabels) slices.Sort(orderedLables) @@ -43,141 +43,142 @@ func newActiveSeriesTracker(userID string, trackedLabels []string, limit int, co overflowLabels[len(orderedLables)] = userID overflowLabels[len(orderedLables)+1] = overflowValue - tracker := &ActiveSeriesTracker{ - userID: userID, - labels: orderedLables, - maxCardinality: limit, - observed: 
make(map[string]*atomic.Int64), - logger: logger, - overflowLabels: overflowLabels, + ast := &ActiveSeriesTracker{ + userID: userID, + labels: orderedLables, + maxCardinality: limit, + observed: make(map[string]*atomic.Int64), + logger: logger, + overflowLabels: overflowLabels, + cooldownDuration: cooldownDuration, } variableLabels := slices.Clone(orderedLables) variableLabels = append(variableLabels, tenantLabel, "reason") - tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", + ast.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", "The total number of active series per user and attribution.", variableLabels[:len(variableLabels)-1], prometheus.Labels{trackerLabel: defaultTrackerName}) - return tracker + return ast } -func (t *ActiveSeriesTracker) hasSameLabels(labels []string) bool { - return slices.Equal(t.labels, labels) +func (at *ActiveSeriesTracker) hasSameLabels(labels []string) bool { + return slices.Equal(at.labels, labels) } -func (t *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { - if t == nil { +func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { + if at == nil { return } buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - t.fillKeyFromLabels(lbls, buf) - t.observedMtx.RLock() - as, ok := t.observed[string(buf.Bytes())] + at.fillKeyFromLabels(lbls, buf) + at.observedMtx.RLock() + as, ok := at.observed[string(buf.Bytes())] if ok { as.Inc() - t.observedMtx.RUnlock() + at.observedMtx.RUnlock() return } - t.observedMtx.RUnlock() + at.observedMtx.RUnlock() - if t.overflowSince.Load() > 0 { - t.overflowCounter.Inc() + if at.overflowSince.Load() > 0 { + at.overflowCounter.Inc() return } - t.observedMtx.Lock() - defer t.observedMtx.Unlock() - as, ok = t.observed[string(buf.Bytes())] + at.observedMtx.Lock() + defer at.observedMtx.Unlock() + as, ok = at.observed[string(buf.Bytes())] if ok { 
as.Inc() return } - if t.overflowSince.Load() > 0 { - t.overflowCounter.Inc() + if at.overflowSince.Load() > 0 { + at.overflowCounter.Inc() return } - if len(t.observed) >= t.maxCardinality { - t.overflowSince.Store(now.Unix()) - t.overflowCounter.Inc() + if len(at.observed) >= at.maxCardinality { + at.overflowSince.Store(now.Unix()) + at.overflowCounter.Inc() return } - t.observed[string(buf.Bytes())] = atomic.NewInt64(1) + at.observed[string(buf.Bytes())] = atomic.NewInt64(1) } -func (t *ActiveSeriesTracker) Decrement(lbls labels.Labels) { - if t == nil { +func (at *ActiveSeriesTracker) Decrement(lbls labels.Labels) { + if at == nil { return } buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - t.fillKeyFromLabels(lbls, buf) - t.observedMtx.RLock() - as, ok := t.observed[string(buf.Bytes())] + at.fillKeyFromLabels(lbls, buf) + at.observedMtx.RLock() + as, ok := at.observed[string(buf.Bytes())] if ok { nv := as.Dec() if nv > 0 { - t.observedMtx.RUnlock() + at.observedMtx.RUnlock() return } - t.observedMtx.RUnlock() - t.observedMtx.Lock() - as, ok := t.observed[string(buf.Bytes())] + at.observedMtx.RUnlock() + at.observedMtx.Lock() + as, ok := at.observed[string(buf.Bytes())] if ok && as.Load() == 0 { // use buf.String() instead of string(buf.Bytes()) to fix the lint issue - delete(t.observed, buf.String()) + delete(at.observed, buf.String()) } - t.observedMtx.Unlock() + at.observedMtx.Unlock() return } - t.observedMtx.RUnlock() + at.observedMtx.RUnlock() - if t.overflowSince.Load() > 0 { - t.overflowCounter.Dec() + if at.overflowSince.Load() > 0 { + at.overflowCounter.Dec() return } - t.observedMtx.RLock() - defer t.observedMtx.RUnlock() - panic(fmt.Errorf("decrementing non-existent active series: labels=%v, cost attribution keys: %v, the current observation map length: %d, the current cost attribution key: %s", lbls, t.labels, len(t.observed), buf.String())) + at.observedMtx.RLock() + defer at.observedMtx.RUnlock() + 
panic(fmt.Errorf("decrementing non-existent active series: labels=%v, cost attribution keys: %v, the current observation map length: %d, the current cost attribution key: %s", lbls, at.labels, len(at.observed), buf.String())) } -func (t *ActiveSeriesTracker) Collect(out chan<- prometheus.Metric) { - if t.overflowSince.Load() > 0 { +func (at *ActiveSeriesTracker) Collect(out chan<- prometheus.Metric) { + if at.overflowSince.Load() > 0 { var activeSeries int64 - t.observedMtx.RLock() - for _, as := range t.observed { + at.observedMtx.RLock() + for _, as := range at.observed { activeSeries += as.Load() } - t.observedMtx.RUnlock() - out <- prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, float64(activeSeries+t.overflowCounter.Load()), t.overflowLabels[:len(t.overflowLabels)-1]...) + at.observedMtx.RUnlock() + out <- prometheus.MustNewConstMetric(at.activeSeriesPerUserAttribution, prometheus.GaugeValue, float64(activeSeries+at.overflowCounter.Load()), at.overflowLabels[:len(at.overflowLabels)-1]...) 
return } // We don't know the performance of out receiver, so we don't want to hold the lock for too long var prometheusMetrics []prometheus.Metric - t.observedMtx.RLock() - for key, as := range t.observed { + at.observedMtx.RLock() + for key, as := range at.observed { keys := strings.Split(key, string(sep)) - keys = append(keys, t.userID) - prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.activeSeriesPerUserAttribution, prometheus.GaugeValue, float64(as.Load()), keys...)) + keys = append(keys, at.userID) + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(at.activeSeriesPerUserAttribution, prometheus.GaugeValue, float64(as.Load()), keys...)) } - t.observedMtx.RUnlock() + at.observedMtx.RUnlock() for _, m := range prometheusMetrics { out <- m } } -func (t *ActiveSeriesTracker) fillKeyFromLabels(lbls labels.Labels, buf *bytes.Buffer) { +func (at *ActiveSeriesTracker) fillKeyFromLabels(lbls labels.Labels, buf *bytes.Buffer) { buf.Reset() - for idx, cal := range t.labels { + for idx, cal := range at.labels { if idx > 0 { buf.WriteRune(sep) } diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 974d1e0e7ed..38da4a8806a 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -122,6 +122,7 @@ func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { if tracker, exists = m.activeTrackersByUserID[userID]; exists { return tracker } + tracker = newActiveSeriesTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) m.activeTrackersByUserID[userID] = tracker return tracker @@ -173,16 +174,19 @@ func (m *Manager) updateTracker(userID string) (*SampleTracker, *ActiveSeriesTra slices.Sort(lbls) // if the labels have changed or the max cardinality or cooldown duration have changed, create a new tracker - if !st.hasSameLabels(lbls) || st.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || st.cooldownDuration != 
m.limits.CostAttributionCooldown(userID) { + newMaxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + newCooldownDuration := m.limits.CostAttributionCooldown(userID) + + if !st.hasSameLabels(lbls) || st.maxCardinality != newMaxCardinality || st.cooldownDuration != newCooldownDuration { m.mstx.Lock() - st = newSampleTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + st = newSampleTracker(userID, lbls, newMaxCardinality, newCooldownDuration, m.logger) m.sampleTrackersByUserID[userID] = st m.mstx.Unlock() } - if !at.hasSameLabels(lbls) || at.maxCardinality != m.limits.MaxCostAttributionCardinalityPerUser(userID) || at.cooldownDuration != m.limits.CostAttributionCooldown(userID) { + if !at.hasSameLabels(lbls) || at.maxCardinality != newMaxCardinality || at.cooldownDuration != newCooldownDuration { m.matx.Lock() - at = newActiveSeriesTracker(userID, lbls, m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger) + at = newActiveSeriesTracker(userID, lbls, newMaxCardinality, newCooldownDuration, m.logger) m.activeTrackersByUserID[userID] = at m.matx.Unlock() } diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 4374770f050..4639a654a10 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -38,23 +38,23 @@ func TestManager_CreateDeleteTracker(t *testing.T) { manager := newTestManager() t.Run("Tracker existence and attributes", func(t *testing.T) { - user1Tracker := manager.Tracker("user1") + user1Tracker := manager.SampleTracker("user1") assert.NotNil(t, user1Tracker) assert.True(t, user1Tracker.hasSameLabels([]string{"team"})) assert.Equal(t, 5, user1Tracker.maxCardinality) - assert.Nil(t, manager.Tracker("user2")) + assert.Nil(t, manager.SampleTracker("user2")) - user3Tracker := manager.Tracker("user3") + user3Tracker :=
manager.ActiveSeriesTracker("user3") assert.NotNil(t, user3Tracker) assert.True(t, user3Tracker.hasSameLabels([]string{"department", "service"})) assert.Equal(t, 2, user3Tracker.maxCardinality) }) t.Run("Metrics tracking", func(t *testing.T) { - manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}}, 1, "invalid-metrics-name", time.Unix(6, 0)) - manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(12, 0)) - manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"department", "foo", "service", "dodo"}, SamplesCount: 1}}), time.Unix(20, 0)) + manager.SampleTracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}}, 1, "invalid-metrics-name", time.Unix(6, 0)) + manager.SampleTracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"department", "foo", "service", "dodo"}, SamplesCount: 1}}), time.Unix(20, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
@@ -100,9 +100,9 @@ func TestManager_CreateDeleteTracker(t *testing.T) { assert.NoError(t, err) assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0))) assert.Equal(t, 1, len(manager.sampleTrackersByUserID)) - assert.True(t, manager.Tracker("user3").hasSameLabels([]string{"feature", "team"})) + assert.True(t, manager.SampleTracker("user3").hasSameLabels([]string{"feature", "team"})) - manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(13, 0)) + manager.SampleTracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(13, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter @@ -112,9 +112,9 @@ func TestManager_CreateDeleteTracker(t *testing.T) { }) t.Run("Overflow metrics on cardinality limit", func(t *testing.T) { - manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "bar", "feature", "bar"}, SamplesCount: 1}}), time.Unix(15, 0)) - manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "baz", "feature", "baz"}, SamplesCount: 1}}), time.Unix(16, 0)) - manager.Tracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo", "feature", "foo"}, SamplesCount: 1}}), time.Unix(17, 0)) + manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "bar", "feature", "bar"}, SamplesCount: 1}}), time.Unix(15, 0)) + manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "baz", "feature", "baz"}, SamplesCount: 1}}), 
time.Unix(16, 0)) + manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo", "feature", "foo"}, SamplesCount: 1}}), time.Unix(17, 0)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter @@ -127,9 +127,9 @@ func TestManager_CreateDeleteTracker(t *testing.T) { func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { manager := newTestManager() - manager.Tracker("user1").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo"}, SamplesCount: 1}}), time.Unix(1, 0)) - manager.Tracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(1, 0)) - manager.Tracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}}, 1, "out-of-window", time.Unix(10, 0)) + manager.SampleTracker("user1").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo"}, SamplesCount: 1}}), time.Unix(1, 0)) + manager.SampleTracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.SampleTracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}}, 1, "out-of-window", time.Unix(10, 0)) t.Run("Purge before inactive timeout", func(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0))) @@ -151,7 +151,8 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { // User3's tracker should remain since it's active, user1's tracker should be removed assert.Equal(t, 1, len(manager.sampleTrackersByUserID), "Expected one active tracker after 
purging") - assert.Nil(t, manager.Tracker("user1"), "Expected user1 tracker to be purged") + assert.Nil(t, manager.SampleTracker("user1"), "Expected user1 tracker to be purged") + assert.Nil(t, manager.ActiveSeriesTracker("user1"), "Expected user1 tracker to be purged") expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 6369349ad21..8dc00deeb88 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -79,8 +79,8 @@ func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown return tracker } -func (t *SampleTracker) hasSameLabels(labels []string) bool { - return slices.Equal(t.labels, labels) +func (st *SampleTracker) hasSameLabels(labels []string) bool { + return slices.Equal(st.labels, labels) } var bufferPool = sync.Pool{ @@ -89,51 +89,49 @@ var bufferPool = sync.Pool{ }, } -func (t *SampleTracker) cleanupTrackerAttribution(key string) { - t.observedMtx.Lock() - defer t.observedMtx.Unlock() - delete(t.observed, key) +func (st *SampleTracker) cleanupTrackerAttribution(key string) { + st.observedMtx.Lock() + defer st.observedMtx.Unlock() + delete(st.observed, key) } -func (t *SampleTracker) Collect(out chan<- prometheus.Metric) { - if t.overflowSince.Load() > 0 { - t.observedMtx.RLock() - t.observedMtx.RUnlock() - out <- prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, t.overflowCounter.receivedSample.Load(), t.overflowLabels[:len(t.overflowLabels)-1]...) - out <- prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, t.overflowCounter.totalDiscarded.Load(), t.overflowLabels...) 
+func (st *SampleTracker) Collect(out chan<- prometheus.Metric) { + if st.overflowSince.Load() > 0 { + out <- prometheus.MustNewConstMetric(st.receivedSamplesAttribution, prometheus.CounterValue, st.overflowCounter.receivedSample.Load(), st.overflowLabels[:len(st.overflowLabels)-1]...) + out <- prometheus.MustNewConstMetric(st.discardedSampleAttribution, prometheus.CounterValue, st.overflowCounter.totalDiscarded.Load(), st.overflowLabels...) return } // We don't know the performance of out receiver, so we don't want to hold the lock for too long var prometheusMetrics []prometheus.Metric - t.observedMtx.RLock() - for key, o := range t.observed { + st.observedMtx.RLock() + for key, o := range st.observed { keys := strings.Split(key, string(sep)) - keys = append(keys, t.userID) + keys = append(keys, st.userID) if o.receivedSample.Load() > 0 { - prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)) + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(st.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)) } o.discardedSampleMtx.Lock() for reason, discarded := range o.discardedSample { - prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(t.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)) + prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(st.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)) } o.discardedSampleMtx.Unlock() } - t.observedMtx.RUnlock() + st.observedMtx.RUnlock() for _, m := range prometheusMetrics { out <- m } } -func (t *SampleTracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { - if t == nil { +func (st *SampleTracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, 
reason string, now time.Time) { + if st == nil { return } - t.updateCountersWithLabelAdapter(lbs, now, 0, 0, value, &reason) + st.updateCountersWithLabelAdapter(lbs, now, 0, value, &reason) } -func (t *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { - if t == nil { +func (st *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { + if st == nil { return } @@ -143,7 +141,7 @@ func (t *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now buf.Reset() defer bufferPool.Put(buf) for _, ts := range req.Timeseries { - t.fillKeyFromLabelAdapters(ts.Labels, buf) + st.fillKeyFromLabelAdapters(ts.Labels, buf) dict[string(buf.Bytes())] += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) } @@ -152,23 +150,23 @@ func (t *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now var total float64 for k, v := range dict { count := float64(v) - t.updateObservations(k, now, count, 0, nil) + st.updateObservations(k, now, count, 0, nil) total += count } } -func (t *SampleTracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { +func (st *SampleTracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() defer bufferPool.Put(buf) - t.fillKeyFromLabelAdapters(lbls, buf) - t.updateObservations(buf.String(), ts, receivedSampleIncrement, discardedSampleIncrement, reason) + st.fillKeyFromLabelAdapters(lbls, buf) + st.updateObservations(buf.String(), ts, receivedSampleIncrement, discardedSampleIncrement, reason) } -func (t *SampleTracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, buf *bytes.Buffer) { +func (st *SampleTracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, buf *bytes.Buffer) { 
buf.Reset() var exists bool - for idx, cal := range t.labels { + for idx, cal := range st.labels { if idx > 0 { buf.WriteRune(sep) } @@ -187,22 +185,22 @@ func (t *SampleTracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, bu } // updateObservations updates or creates a new observation in the 'observed' map. -func (t *SampleTracker) updateObservations(key string, ts time.Time, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { +func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { // if overflowSince is set, we only update the overflow counter - if t.overflowSince.Load() > 0 { - t.overflowCounter.receivedSample.Add(receivedSampleIncrement) + if st.overflowSince.Load() > 0 { + st.overflowCounter.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { - t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) + st.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) } return } // if not overflow, we need to check if the key exists in the observed map, // if yes, we update the observation, otherwise we create a new observation, and set the overflowSince if the max cardinality is exceeded - t.observedMtx.Lock() - defer t.observedMtx.Unlock() - o, known := t.observed[key] - if known && t.overflowSince.Load() == 0 { + st.observedMtx.Lock() + defer st.observedMtx.Unlock() + o, known := st.observed[key] + if known && st.overflowSince.Load() == 0 { o.lastUpdate.Store(ts.Unix()) o.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { @@ -217,16 +215,21 @@ func (t *SampleTracker) updateObservations(key string, ts time.Time, receivedSam return } - if t.overflowSince.Load() > 0 { - t.overflowCounter.receivedSample.Add(receivedSampleIncrement) + // if it is not known, we need to check if the max cardinality is exceeded + if len(st.observed) >= 
st.maxCardinality { + st.overflowSince.Store(ts.Unix()) + } + + if st.overflowSince.Load() > 0 { + st.overflowCounter.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { - t.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) + st.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) } return } // If adding the new observation would exceed the max cardinality, we need to update the state - t.observed[key] = &observation{ + st.observed[key] = &observation{ lastUpdate: *atomic.NewInt64(ts.Unix()), discardedSample: make(map[string]*atomic.Float64), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), @@ -234,45 +237,41 @@ func (t *SampleTracker) updateObservations(key string, ts time.Time, receivedSam } if discardedSampleIncrement > 0 && reason != nil { - t.observed[key].discardedSampleMtx.Lock() - t.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) - t.observed[key].discardedSampleMtx.Unlock() - } - - if len(t.observed) >= t.maxCardinality { - t.overflowSince.Store(ts.Unix()) + st.observed[key].discardedSampleMtx.Lock() + st.observed[key].discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + st.observed[key].discardedSampleMtx.Unlock() } } -func (t *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { - t.observedMtx.RLock() - if t.overflowSince.Load() > 0 && time.Unix(t.overflowSince.Load(), 0).Add(t.cooldownDuration).Before(deadline) { - if len(t.observed) <= t.maxCardinality { - t.observedMtx.RUnlock() +func (st *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { + st.observedMtx.RLock() + if st.overflowSince.Load() > 0 && time.Unix(st.overflowSince.Load(), 0).Add(st.cooldownDuration).Before(deadline) { + if len(st.observed) <= st.maxCardinality { + st.observedMtx.RUnlock() return true } - t.observedMtx.RUnlock() + st.observedMtx.RUnlock() // Increase the cooldown duration if the number of observations is still above the 
max cardinality - t.observedMtx.Lock() - if len(t.observed) <= t.maxCardinality { - t.observedMtx.Unlock() + st.observedMtx.Lock() + if len(st.observed) <= st.maxCardinality { + st.observedMtx.Unlock() return true } - t.overflowSince.Store(deadline.Unix()) - t.observedMtx.Unlock() + st.overflowSince.Store(deadline.Unix()) + st.observedMtx.Unlock() } else { - t.observedMtx.RUnlock() + st.observedMtx.RUnlock() } return false } -func (t *SampleTracker) inactiveObservations(deadline time.Time) []string { +func (st *SampleTracker) inactiveObservations(deadline time.Time) []string { // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []string - t.observedMtx.RLock() - defer t.observedMtx.RUnlock() - for labkey, ob := range t.observed { + st.observedMtx.RLock() + defer st.observedMtx.RUnlock() + for labkey, ob := range st.observed { if ob != nil && ob.lastUpdate.Load() <= deadline.Unix() { invalidKeys = append(invalidKeys, labkey) } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index b00462ca3b9..ee4599f30b6 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -18,15 +18,17 @@ import ( ) func TestTracker_hasSameLabels(t *testing.T) { - tracker := newTestManager().Tracker("user1") - assert.True(t, tracker.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") + st := newTestManager().SampleTracker("user1") + assert.True(t, st.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") + ast := newTestManager().ActiveSeriesTracker("user1") + assert.True(t, ast.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") } func TestTracker_IncrementReceviedSamples(t *testing.T) { tManager := newTestManager() - tracker := tManager.Tracker("user4") + st := tManager.SampleTracker("user4") t.Run("One Single Series in Request", func(t *testing.T) { - 
tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}}), time.Unix(10, 0)) + st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}}), time.Unix(10, 0)) expectedMetrics := ` # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. @@ -36,7 +38,7 @@ func TestTracker_IncrementReceviedSamples(t *testing.T) { assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) }) t.Run("Multiple Different Series in Request", func(t *testing.T) { - tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ + st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ {LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}, {LabelValues: []string{"platform", "bar", "service", "yoyo"}, SamplesCount: 5}, }), time.Unix(20, 0)) @@ -51,7 +53,7 @@ func TestTracker_IncrementReceviedSamples(t *testing.T) { }) t.Run("Multiple Series in Request with Same Labels", func(t *testing.T) { - tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ + st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ {LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}, {LabelValues: []string{"platform", "foo", "service", "yoyo"}, SamplesCount: 5}, }), time.Unix(30, 0)) @@ -68,14 +70,15 @@ func TestTracker_IncrementReceviedSamples(t *testing.T) { func TestTracker_CreateDelete(t *testing.T) { tManager := newTestManager() - tracker := tManager.Tracker("user4") + st := tManager.SampleTracker("user4") + ast := tManager.ActiveSeriesTracker("user4") - tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) - 
tracker.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) - tracker.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) - tracker.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) - tracker.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) - tracker.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) + ast.Decrement(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) + st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) + st.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) + ast.Increment(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. @@ -96,43 +99,45 @@ func TestTracker_CreateDelete(t *testing.T) { "cortex_ingester_attributed_active_series", } assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) - assert.Equal(t, []string{"foo"}, tracker.inactiveObservations(time.Unix(5, 0))) + + // The purge only apply to the sample tracker. 
+ assert.Equal(t, []string{"foo"}, st.inactiveObservations(time.Unix(5, 0))) assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) expectedMetrics = ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 ` assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) - tManager.deleteTracker("user4") + tManager.deleteSampleTracker("user4") + tManager.deleteActiveTracker("user4") assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(""), metricNames...)) } func TestTracker_updateCounters(t *testing.T) { - tracker := newTestManager().Tracker("user3") - lbls1 := labels.FromStrings("department", "foo", "service", "bar") - lbls2 := labels.FromStrings("department", "bar", "service", "baz") - lbls3 := labels.FromStrings("department", "baz", "service", "foo") - - tracker.updateCounters(lbls1, time.Unix(1, 0), 1, 0, 0, nil, true) - assert.False(t, tracker.isOverflow.Load(), "First observation, should not overflow") + st := newTestManager().SampleTracker("user3") + lbls1 := []mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}} + lbls2 := []mimirpb.LabelAdapter{{Name: "department", Value: "bar"}, {Name: "service", Value: "baz"}} + lbls3 := []mimirpb.LabelAdapter{{Name: "department", Value: "baz"}, {Name: "service", Value: "foo"}} - tracker.updateCounters(lbls2, time.Unix(2, 0), 1, 0, 0, nil, true) - assert.False(t, tracker.isOverflow.Load(), "Second observation, should not overflow") + st.updateCountersWithLabelAdapter(lbls1, time.Unix(1, 0), 1, 0, nil) + assert.Equal(t, int64(0), st.overflowSince.Load(), "First observation, should not 
overflow") - tracker.updateCounters(lbls3, time.Unix(3, 0), 1, 0, 0, nil, true) - assert.True(t, tracker.isOverflow.Load(), "Third observation, should overflow") + st.updateCountersWithLabelAdapter(lbls2, time.Unix(2, 0), 1, 0, nil) + assert.Equal(t, int64(0), st.overflowSince.Load(), "Second observation, should not overflow") - tracker.updateCounters(lbls3, time.Unix(4, 0), 1, 0, 0, nil, true) - assert.True(t, tracker.isOverflow.Load(), "Fourth observation, should stay overflow") + st.updateCountersWithLabelAdapter(lbls3, time.Unix(3, 0), 1, 0, nil) + assert.Equal(t, int64(3), st.overflowSince.Load(), "Third observation, should overflow") - assert.Equal(t, time.Unix(3, 0).Add(tracker.cooldownDuration), tracker.cooldownUntil, "CooldownUntil should be updated correctly") + st.updateCountersWithLabelAdapter(lbls3, time.Unix(4, 0), 1, 0, nil) + assert.Equal(t, int64(3), st.overflowSince.Load(), "Fourth observation, should stay overflow") } func TestTracker_inactiveObservations(t *testing.T) { - // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. - tracker := newTestManager().Tracker("user1") + // Setup the test environment: create a st for user1 with a "team" label and max cardinality of 5. + st := newTestManager().SampleTracker("user1") // Create two observations with different last update timestamps. observations := [][]mimirpb.LabelAdapter{ @@ -142,31 +147,31 @@ func TestTracker_inactiveObservations(t *testing.T) { } // Simulate samples discarded with different timestamps. 
- tracker.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) - tracker.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) - tracker.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0)) + st.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + st.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) + st.IncrementDiscardedSamples(observations[2], 3, "invalid-metrics-name", time.Unix(20, 0)) // Ensure that two observations were successfully added to the tracker. - require.Len(t, tracker.observed, 3) + require.Len(t, st.observed, 3) // Purge observations that haven't been updated in the last 10 seconds. - purged := tracker.inactiveObservations(time.Unix(0, 0)) + purged := st.inactiveObservations(time.Unix(0, 0)) require.Len(t, purged, 0) - purged = tracker.inactiveObservations(time.Unix(10, 0)) + purged = st.inactiveObservations(time.Unix(10, 0)) assert.ElementsMatch(t, []string{"foo"}, purged) - purged = tracker.inactiveObservations(time.Unix(15, 0)) + purged = st.inactiveObservations(time.Unix(15, 0)) assert.ElementsMatch(t, []string{"foo", "bar"}, purged) // Check that the purged observation matches the expected details. 
- purged = tracker.inactiveObservations(time.Unix(25, 0)) + purged = st.inactiveObservations(time.Unix(25, 0)) assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } func TestTracker_Concurrency(t *testing.T) { m := newTestManager() - tracker := m.Tracker("user1") + ast := m.ActiveSeriesTracker("user1") var wg sync.WaitGroup var i int64 @@ -175,15 +180,15 @@ func TestTracker_Concurrency(t *testing.T) { go func(i int64) { defer wg.Done() lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) - tracker.updateCounters(lbls, time.Unix(i, 0), 1, 0, 0, nil, true) + ast.Increment(lbls, time.Unix(i, 0)) }(i) } wg.Wait() // Verify no data races or inconsistencies - assert.True(t, len(tracker.observed) > 0, "Observed set should not be empty after concurrent updates") - assert.LessOrEqual(t, len(tracker.observed), 2*tracker.maxCardinality, "Observed count should not exceed 2 times of max cardinality") - assert.True(t, tracker.isOverflow.Load(), "Expected state to be Overflow") + assert.True(t, len(ast.observed) > 0, "Observed set should not be empty after concurrent updates") + assert.LessOrEqual(t, len(ast.observed), 2*ast.maxCardinality, "Observed count should not exceed 2 times of max cardinality") + assert.NotEqual(t, 0, ast.overflowSince.Load(), "Expected state to be Overflow") expectedMetrics := ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 14129af7755..8fc2c8e6748 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -1247,7 +1247,7 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { if len(req.Timeseries) > 0 { - d.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(validatedSamples), reasonRateLimited, now) + d.costAttributionMgr.SampleTracker(userID).IncrementDiscardedSamples(req.Timeseries[0].Labels, float64(validatedSamples), reasonRateLimited, now) } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 47af497ca64..594afc53cd7 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -84,7 +84,7 @@ func TestValidateLabels(t *testing.T) { careg := prometheus.NewRegistry() manager, err := costattribution.NewManager(5*time.Second, 10*time.Second, log.NewNopLogger(), limits, careg) require.NoError(t, err) - cat := manager.Tracker(userID) + cast := manager.SampleTracker(userID) for _, c := range []struct { metric model.Metric @@ -242,7 +242,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, cat, ts) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, cast, ts) assert.Equal(t, c.err, err, "wrong error") } diff --git a/pkg/ingester/activeseries/active_series.go 
b/pkg/ingester/activeseries/active_series.go index 1e3002fc21a..602264c9ae0 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -51,7 +51,7 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastConfigUpdate time.Time - cat *costattribution.SampleTracker + caat *costattribution.ActiveSeriesTracker // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -78,7 +78,7 @@ type seriesStripe struct { activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - cat *costattribution.SampleTracker + caat *costattribution.ActiveSeriesTracker } // seriesEntry holds a timestamp for single series. @@ -90,12 +90,12 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.SampleTracker) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout, cat: cat} +func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, caat *costattribution.ActiveSeriesTracker) *ActiveSeries { + c := &ActiveSeries{matchers: asm, timeout: timeout, caat: caat} // Stripes are pre-allocated so that we only read on them and no lock is required. 
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, cat) + c.stripes[i].reinitialize(asm, &c.deleted, caat) } return c @@ -107,10 +107,10 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { return c.matchers.MatcherNames() } -func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.SampleTracker) bool { +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.ActiveSeriesTracker) bool { c.configMutex.RLock() defer c.configMutex.RUnlock() - return ctCfg.String() != c.matchers.Config().String() || caCfg != c.cat + return ctCfg.String() != c.matchers.Config().String() || caCfg != c.caat } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { @@ -118,7 +118,7 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, c.cat) + c.stripes[i].reinitialize(asm, &c.deleted, c.caat) } c.matchers = asm c.lastConfigUpdate = now @@ -408,7 +408,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } - s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) + s.caat.Increment(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -430,7 +430,7 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. 
-func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.SampleTracker) { +func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.ActiveSeriesTracker) { s.mu.Lock() defer s.mu.Unlock() s.deleted = deleted @@ -443,7 +443,7 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) - s.cat = cat + s.caat = cat } func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { @@ -468,11 +468,11 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { - if s.cat != nil { + if s.caat != nil { if err := idx.Series(ref, &buf, nil); err != nil { s.activeSeriesAttributionFailureCounter.Add(1) } else { - s.cat.DecrementActiveSeries(buf.Labels()) + s.caat.Decrement(buf.Labels()) } } if entry.deleted { @@ -532,12 +532,12 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { } s.active-- - if s.cat != nil { + if s.caat != nil { buf := labels.NewScratchBuilder(128) if err := idx.Series(ref, &buf, nil); err != nil { s.activeSeriesAttributionFailureCounter.Add(1) } else { - s.cat.DecrementActiveSeries(buf.Labels()) + s.caat.Decrement(buf.Labels()) } } if entry.numNativeHistogramBuckets >= 0 { diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index 690b6b6c71c..a565c2019f7 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -254,7 +254,7 @@ func TestActiveSeries_UpdateSeries_WithCostAttribution(t 
*testing.T) { reg := prometheus.NewRegistry() manager, err := costattribution.NewManager(5*time.Second, 10*time.Second, log.NewNopLogger(), limits, reg) require.NoError(t, err) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, manager.Tracker("user5")) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, manager.ActiveSeriesTracker("user5")) testCostAttributionUpdateSeries(t, c, reg) } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 6debbca48df..ed100f6dae0 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -787,8 +787,8 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - newCostAttributionTracker := i.costAttributionMgr.Tracker(userID) - if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { + newCostAttributionActiveSeriesTracker := i.costAttributionMgr.ActiveSeriesTracker(userID) + if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionActiveSeriesTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } @@ -1198,62 +1198,63 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques outOfOrderWindow = i.limits.OutOfOrderTimeWindow(userID) + cast = i.costAttributionMgr.SampleTracker(userID) errProcessor = mimir_storage.NewSoftAppendErrorProcessor( func() { stats.failedSamplesCount++ }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTimestampTooOld, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ - 
i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTooOld, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleTooFarInFuture, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonNewValueForTimestamp, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonPerUserSeriesLimit, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonPerUserSeriesLimit, startAppend) 
updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonPerMetricSeriesLimit, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1261,35 +1262,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) + 
cast.IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) + cast.IncrementDiscardedSamples(labels, 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1437,7 +1438,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre stats.failedSamplesCount 
+= len(ts.Samples) + len(ts.Histograms) stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.SampleTracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1457,7 +1458,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - i.costAttributionMgr.Tracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) + i.costAttributionMgr.SampleTracker(userID).IncrementDiscardedSamples(ts.Labels, float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -2673,7 +2674,7 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD userDB := &userTSDB{ userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.Tracker(userID)), + activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.ActiveSeriesTracker(userID)), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), From 749bafdd84a900eebe65e2d5263e5cbff0b272c2 Mon Sep 17 00:00:00 
2001 From: Ying WANG Date: Mon, 13 Jan 2025 17:04:21 +0100 Subject: [PATCH 073/105] remove conflict --- CHANGELOG.md | 1 - go.mod | 1 - 2 files changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 778cc45e068..8182a15ce1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,6 @@ ## main / unreleased -* [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 * [FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 ### Grafana Mimir diff --git a/go.mod b/go.mod index 83c01b0db5a..ec9819a47b8 100644 --- a/go.mod +++ b/go.mod @@ -287,7 +287,6 @@ require ( sigs.k8s.io/yaml v1.4.0 // indirect ) -// Using a fork of Prometheus with Mimir-specific changes. 
replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20250110020350-a1e2bcf4a615 // Replace memberlist with our fork which includes some fixes that haven't been From 5f07f1626505a73b2321758f3abc5c0c6320bb92 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 13 Jan 2025 17:24:59 +0100 Subject: [PATCH 074/105] clean up code --- pkg/costattribution/active_tracker.go | 1 + pkg/costattribution/sample_tracker.go | 36 ++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 5792d4df451..0dc796fbbf5 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -75,6 +75,7 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { buf.Reset() defer bufferPool.Put(buf) at.fillKeyFromLabels(lbls, buf) + at.observedMtx.RLock() as, ok := at.observed[string(buf.Bytes())] if ok { diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 8dc00deeb88..afeafa25f65 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -197,8 +197,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa // if not overflow, we need to check if the key exists in the observed map, // if yes, we update the observation, otherwise we create a new observation, and set the overflowSince if the max cardinality is exceeded - st.observedMtx.Lock() - defer st.observedMtx.Unlock() + st.observedMtx.RLock() o, known := st.observed[key] if known && st.overflowSince.Load() == 0 { o.lastUpdate.Store(ts.Unix()) @@ -212,14 +211,39 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa } o.discardedSampleMtx.Unlock() } + st.observedMtx.RUnlock() return } + st.observedMtx.RUnlock() - // if it is not known, we need to check if the max cardinality is exceeded - if 
len(st.observed) >= st.maxCardinality { - st.overflowSince.Store(ts.Unix()) + // If it is not known, we take the write lock, but still check whether the key is added in the meantime + st.observedMtx.Lock() + defer st.observedMtx.Unlock() + // If not in overflow, we update the observation if it exists, otherwise we check if create a new observation would exceed the max cardinality + // if it does, we set the overflowSince + if st.overflowSince.Load() == 0 { + o, known = st.observed[key] + if known { + o.lastUpdate.Store(ts.Unix()) + o.receivedSample.Add(receivedSampleIncrement) + if discardedSampleIncrement > 0 && reason != nil { + o.discardedSampleMtx.Lock() + if _, ok := o.discardedSample[*reason]; ok { + o.discardedSample[*reason].Add(discardedSampleIncrement) + } else { + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + } + o.discardedSampleMtx.Unlock() + } + return + } + // if it is not known, we need to check if the max cardinality is exceeded + if len(st.observed) >= st.maxCardinality { + st.overflowSince.Store(ts.Unix()) + } } + // if overflowSince is set, we only update the overflow counter if st.overflowSince.Load() > 0 { st.overflowCounter.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { @@ -228,7 +252,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa return } - // If adding the new observation would exceed the max cardinality, we need to update the state + // create a new observation st.observed[key] = &observation{ lastUpdate: *atomic.NewInt64(ts.Unix()), discardedSample: make(map[string]*atomic.Float64), From 37ba4f2ec64bd39ce7246c1c2bb5f845706161ca Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Mon, 13 Jan 2025 17:39:10 +0100 Subject: [PATCH 075/105] fix --- pkg/costattribution/sample_tracker.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go 
index afeafa25f65..7c22c8a9444 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -36,7 +36,7 @@ type SampleTracker struct { observed map[string]*observation observedMtx sync.RWMutex overflowSince atomic.Int64 - overflowCounter *observation + overflowCounter observation cooldownDuration time.Duration logger log.Logger } @@ -62,7 +62,7 @@ func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown cooldownDuration: cooldown, logger: logger, overflowLabels: overflowLabels, - overflowCounter: &observation{}, + overflowCounter: observation{}, } variableLabels := slices.Clone(orderedLables) From 8d062040b1c40c941847730167a3744c69b08a8d Mon Sep 17 00:00:00 2001 From: Ying WANG <74549700+ying-jeanne@users.noreply.github.com> Date: Tue, 14 Jan 2025 17:59:51 +0100 Subject: [PATCH 076/105] Update pkg/distributor/distributor.go Co-authored-by: Oleg Zaytsev --- pkg/distributor/distributor.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index fcaa7ca3b23..2123a4cad54 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -749,7 +749,6 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. 
- func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Samples) == 0 { return nil From 68410c8128c495897c224a11d2c8d2e22ee42069 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 14 Jan 2025 18:38:04 +0100 Subject: [PATCH 077/105] address comments --- CHANGELOG.md | 3 +- .../config/mimir.yaml | 7 +++ .../configuration-parameters/index.md | 3 +- pkg/costattribution/manager.go | 62 +++++++++---------- pkg/util/validation/limits.go | 7 ++- pkg/util/validation/limits_test.go | 6 ++ 6 files changed, 53 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8182a15ce1e..464581712c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,14 +2,13 @@ ## main / unreleased -* [FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 - ### Grafana Mimir * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 * [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. 
#10375 #10403 +* [CHANGE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 5d245999115..84d5c219039 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,4 +1,6 @@ multitenancy_enabled: false +cost_attribution_registry_path: "/usage-metrics" +cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -183,6 +185,11 @@ limits: ha_cluster_label: ha_cluster ha_replica_label: ha_replica ha_max_clusters: 10 + + cost_attribution_labels: "container" + max_cost_attribution_labels_per_user: 2 + max_cost_attribution_cardinality_per_user: 100 + cost_attribution_cooldown: 20m runtime_config: file: ./config/runtime.yaml diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 87fe406b105..2a3b57c8c32 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3599,7 
+3599,8 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -validation.cost-attribution-labels [cost_attribution_labels: | default = ""] -# (experimental) Maximum number of cost attribution labels allowed per user. +# (experimental) Maximum number of cost attribution labels allowed per user, the +# value is capped at 4. # CLI flag: -validation.max-cost-attribution-labels-per-user [max_cost_attribution_labels_per_user: | default = 2] diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 38da4a8806a..914698b316b 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -25,25 +25,25 @@ const ( type Manager struct { services.Service - logger log.Logger - limits *validation.Overrides - reg *prometheus.Registry + logger log.Logger + limits *validation.Overrides + reg *prometheus.Registry + inactiveTimeout time.Duration + cleanupInterval time.Duration - mstx sync.RWMutex + stmtx sync.RWMutex sampleTrackersByUserID map[string]*SampleTracker - inactiveTimeout time.Duration - cleanupInterval time.Duration - matx sync.RWMutex + atmtx sync.RWMutex activeTrackersByUserID map[string]*ActiveSeriesTracker } func NewManager(cleanupInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) { m := &Manager{ - mstx: sync.RWMutex{}, + stmtx: sync.RWMutex{}, sampleTrackersByUserID: make(map[string]*SampleTracker), - matx: sync.RWMutex{}, + atmtx: sync.RWMutex{}, activeTrackersByUserID: make(map[string]*ActiveSeriesTracker), limits: limits, @@ -77,9 +77,9 @@ func (m *Manager) SampleTracker(userID string) *SampleTracker { } // Check if the tracker already exists, if exists return it. Otherwise lock and create a new tracker. 
- m.mstx.RLock() + m.stmtx.RLock() tracker, exists := m.sampleTrackersByUserID[userID] - m.mstx.RUnlock() + m.stmtx.RUnlock() if exists { return tracker } @@ -89,8 +89,8 @@ func (m *Manager) SampleTracker(userID string) *SampleTracker { maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) cooldownDuration := m.limits.CostAttributionCooldown(userID) - m.mstx.Lock() - defer m.mstx.Unlock() + m.stmtx.Lock() + defer m.stmtx.Unlock() if tracker, exists = m.sampleTrackersByUserID[userID]; exists { return tracker } @@ -105,9 +105,9 @@ func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { } // Check if the tracker already exists, if exists return it. Otherwise lock and create a new tracker. - m.matx.RLock() + m.atmtx.RLock() tracker, exists := m.activeTrackersByUserID[userID] - m.matx.RUnlock() + m.atmtx.RUnlock() if exists { return tracker } @@ -117,8 +117,8 @@ func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) cooldownDuration := m.limits.CostAttributionCooldown(userID) - m.matx.Lock() - defer m.matx.Unlock() + m.atmtx.Lock() + defer m.atmtx.Unlock() if tracker, exists = m.activeTrackersByUserID[userID]; exists { return tracker } @@ -129,17 +129,17 @@ func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { } func (m *Manager) Collect(out chan<- prometheus.Metric) { - m.mstx.RLock() + m.stmtx.RLock() for _, tracker := range m.sampleTrackersByUserID { tracker.Collect(out) } - m.mstx.RUnlock() + m.stmtx.RUnlock() - m.matx.RLock() + m.atmtx.RLock() for _, tracker := range m.activeTrackersByUserID { tracker.Collect(out) } - m.matx.RUnlock() + m.atmtx.RUnlock() } func (m *Manager) Describe(chan<- *prometheus.Desc) { @@ -148,15 +148,15 @@ func (m *Manager) Describe(chan<- *prometheus.Desc) { } func (m *Manager) deleteSampleTracker(userID string) { - m.mstx.Lock() + m.stmtx.Lock() delete(m.sampleTrackersByUserID, userID) - 
m.mstx.Unlock() + m.stmtx.Unlock() } func (m *Manager) deleteActiveTracker(userID string) { - m.matx.Lock() + m.atmtx.Lock() delete(m.activeTrackersByUserID, userID) - m.matx.Unlock() + m.atmtx.Unlock() } func (m *Manager) updateTracker(userID string) (*SampleTracker, *ActiveSeriesTracker) { @@ -178,29 +178,29 @@ func (m *Manager) updateTracker(userID string) (*SampleTracker, *ActiveSeriesTra newCooldownDuration := m.limits.CostAttributionCooldown(userID) if !st.hasSameLabels(lbls) || st.maxCardinality != newMaxCardinality || st.cooldownDuration != newCooldownDuration { - m.mstx.Lock() + m.stmtx.Lock() st = newSampleTracker(userID, lbls, newMaxCardinality, newCooldownDuration, m.logger) m.sampleTrackersByUserID[userID] = st - m.mstx.Unlock() + m.stmtx.Unlock() } if !at.hasSameLabels(lbls) || at.maxCardinality != newMaxCardinality || st.cooldownDuration != newCooldownDuration { - m.matx.Lock() + m.atmtx.Lock() at = newActiveSeriesTracker(userID, lbls, newMaxCardinality, newCooldownDuration, m.logger) m.activeTrackersByUserID[userID] = at - m.matx.Unlock() + m.atmtx.Unlock() } return st, at } func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { - m.mstx.RLock() + m.stmtx.RLock() userIDs := make([]string, 0, len(m.sampleTrackersByUserID)) for userID := range m.sampleTrackersByUserID { userIDs = append(userIDs, userID) } - m.mstx.RUnlock() + m.stmtx.RUnlock() for _, userID := range userIDs { st, at := m.updateTracker(userID) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 76614b8992c..b9583801fc6 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -73,6 +73,7 @@ var ( errInvalidIngestStorageReadConsistency = fmt.Errorf("invalid ingest storage read consistency (supported values: %s)", strings.Join(api.ReadConsistencies, ", ")) errInvalidMaxEstimatedChunksPerQueryMultiplier = errors.New("invalid value for -" + MaxEstimatedChunksPerQueryMultiplierFlag + ": must be 0 or greater than or 
equal to 1") errCostAttributionLabelsLimitExceeded = errors.New("invalid value for -" + costAttributionLabelsFlag + ": exceeds the limit defined by -" + maxCostAttributionLabelsPerUserFlag) + errInvalidMaxCostAttributionLabelsPerUser = errors.New("invalid value for -" + maxCostAttributionLabelsPerUserFlag + ": must be less than or equal to 4") ) // LimitError is a marker interface for the errors that do not comply with the specified limits. @@ -310,7 +311,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}.") - f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user.") + f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user, the value is capped at 4.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. 
Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") @@ -493,6 +494,10 @@ func (l *Limits) validate() error { return errCostAttributionLabelsLimitExceeded } + if l.MaxCostAttributionLabelsPerUser > 4 { + return errInvalidMaxCostAttributionLabelsPerUser + } + return nil } diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index c56cb1ab026..86bda0d4ba5 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -1082,6 +1082,12 @@ cost_attribution_labels: label1, label2, label3, max_cost_attribution_labels_per_user: 2`, expectedErr: errCostAttributionLabelsLimitExceeded.Error(), }, + "should fail when max_cost_attribution_labels_per_user is more than 4": { + cfg: ` +cost_attribution_labels: label1, label2, +max_cost_attribution_labels_per_user: 5`, + expectedErr: errInvalidMaxCostAttributionLabelsPerUser.Error(), + }, } for testName, testData := range tests { From c4a44eb47dfe89d0d8290252899ec7272d57a592 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 14 Jan 2025 18:53:12 +0100 Subject: [PATCH 078/105] update docs --- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 2a007cfe54c..22819788151 100644 --- a/cmd/mimir/config-descriptor.json +++ 
b/cmd/mimir/config-descriptor.json @@ -4383,7 +4383,7 @@ "kind": "field", "name": "max_cost_attribution_labels_per_user", "required": false, - "desc": "Maximum number of cost attribution labels allowed per user.", + "desc": "Maximum number of cost attribution labels allowed per user, the value is capped at 4.", "fieldValue": null, "fieldDefaultValue": 2, "fieldFlag": "validation.max-cost-attribution-labels-per-user", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 683a905df7a..b905ba39f68 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -3336,7 +3336,7 @@ Usage of ./cmd/mimir/mimir: -validation.max-cost-attribution-cardinality-per-user int [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) -validation.max-cost-attribution-labels-per-user int - [experimental] Maximum number of cost attribution labels allowed per user. (default 2) + [experimental] Maximum number of cost attribution labels allowed per user, the value is capped at 4. (default 2) -validation.max-label-names-per-info-series int Maximum number of label names per info series. 
Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80) -validation.max-label-names-per-series int From 091e5c28c49ef7383524457a091f7c042aef87c6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 14 Jan 2025 19:44:54 +0100 Subject: [PATCH 079/105] update tests --- pkg/costattribution/active_tracker_test.go | 85 +++++++++++ pkg/costattribution/manager_test.go | 16 +-- pkg/costattribution/sample_tracker.go | 11 +- ...tracker_test.go => sample_tracker_test.go} | 132 +++++++++--------- 4 files changed, 167 insertions(+), 77 deletions(-) create mode 100644 pkg/costattribution/active_tracker_test.go rename pkg/costattribution/{tracker_test.go => sample_tracker_test.go} (83%) diff --git a/pkg/costattribution/active_tracker_test.go b/pkg/costattribution/active_tracker_test.go new file mode 100644 index 00000000000..0d0984eee6f --- /dev/null +++ b/pkg/costattribution/active_tracker_test.go @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "sync" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" +) + +func TestActiveTracker_hasSameLabels(t *testing.T) { + ast := newTestManager().ActiveSeriesTracker("user1") + assert.True(t, ast.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") +} + +func TestActiveTracker_updateCounters(t *testing.T) { + ast := newTestManager().ActiveSeriesTracker("user3") + lbls1 := labels.FromStrings("department", "foo", "service", "bar") + lbls2 := labels.FromStrings("department", "bar", "service", "baz") + lbls3 := labels.FromStrings("department", "baz", "service", "foo") + + ast.Increment(lbls1, time.Unix(1, 0)) + assert.Equal(t, int64(0), ast.overflowSince.Load(), "First observation, should not overflow") + + ast.Decrement(lbls1) + assert.Equal(t, 
int64(0), ast.overflowSince.Load(), "First observation decremented, should not overflow") + assert.Equal(t, 0, len(ast.observed), "First observation decremented, should be removed since it reached 0") + + ast.Increment(lbls1, time.Unix(2, 0)) + ast.Increment(lbls2, time.Unix(2, 0)) + assert.Equal(t, int64(0), ast.overflowSince.Load(), "Second observation, should not overflow") + + ast.Increment(lbls3, time.Unix(3, 0)) + assert.Equal(t, int64(3), ast.overflowSince.Load(), "Third observation, should overflow") + + ast.Increment(lbls3, time.Unix(4, 0)) + assert.Equal(t, int64(3), ast.overflowSince.Load(), "Fourth observation, should stay overflow") +} + +func TestActiveTracker_Concurrency(t *testing.T) { + m := newTestManager() + ast := m.ActiveSeriesTracker("user1") + + var wg sync.WaitGroup + var i int64 + for i = 0; i < 100; i++ { + wg.Add(1) + go func(i int64) { + defer wg.Done() + lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) + ast.Increment(lbls, time.Unix(i, 0)) + }(i) + } + wg.Wait() + + // Verify no data races or inconsistencies + assert.True(t, len(ast.observed) > 0, "Observed set should not be empty after concurrent updates") + assert.LessOrEqual(t, len(ast.observed), ast.maxCardinality, "Observed count should not exceed max cardinality") + assert.NotEqual(t, 0, ast.overflowSince.Load(), "Expected state to be Overflow") + + expectedMetrics := ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{team="__overflow__",tenant="user1",tracker="cost-attribution"} 100 +` + assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + + for i = 0; i < 100; i++ { + wg.Add(1) + go func(i int64) { + defer wg.Done() + lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) + ast.Decrement(lbls) + }(i) + } + wg.Wait() + + assert.Equal(t, 0, len(ast.observed), "Observed set should be empty after all decrements") + assert.NotEqual(t, 0, ast.overflowSince.Load(), "Expected state still to be Overflow") +} diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 4639a654a10..e9f9c504b9c 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -38,17 +38,17 @@ func TestManager_CreateDeleteTracker(t *testing.T) { manager := newTestManager() t.Run("Tracker existence and attributes", func(t *testing.T) { - user1Tracker := manager.SampleTracker("user1") - assert.NotNil(t, user1Tracker) - assert.True(t, user1Tracker.hasSameLabels([]string{"team"})) - assert.Equal(t, 5, user1Tracker.maxCardinality) + user1SampleTracker := manager.SampleTracker("user1") + assert.NotNil(t, user1SampleTracker) + assert.True(t, user1SampleTracker.hasSameLabels([]string{"team"})) + assert.Equal(t, 5, user1SampleTracker.maxCardinality) assert.Nil(t, manager.SampleTracker("user2")) - user3Tracker := manager.ActiveSeriesTracker("user3") - assert.NotNil(t, user3Tracker) - assert.True(t, user3Tracker.hasSameLabels([]string{"department", "service"})) - assert.Equal(t, 2, user3Tracker.maxCardinality) + user3ActiveTracker := manager.ActiveSeriesTracker("user3") + assert.NotNil(t, user3ActiveTracker) + assert.True(t, user3ActiveTracker.hasSameLabels([]string{"department", "service"})) + assert.Equal(t, 2, user3ActiveTracker.maxCardinality) }) t.Run("Metrics 
tracking", func(t *testing.T) { diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 7c22c8a9444..69b30166afd 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -186,18 +186,21 @@ func (st *SampleTracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, b // updateObservations updates or creates a new observation in the 'observed' map. func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - // if overflowSince is set, we only update the overflow counter + // if not overflow, we need to check if the key exists in the observed map, + // if yes, we update the observation, otherwise we create a new observation, and set the overflowSince if the max cardinality is exceeded + st.observedMtx.RLock() + + // if overflowSince is set, we only update the overflow counter, this is after the read lock since overflowSince can only be set when holding observedMtx write lock + // check it after read lock would make sure that we don't miss any updates if st.overflowSince.Load() > 0 { st.overflowCounter.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { st.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) } + st.observedMtx.RUnlock() return } - // if not overflow, we need to check if the key exists in the observed map, - // if yes, we update the observation, otherwise we create a new observation, and set the overflowSince if the max cardinality is exceeded - st.observedMtx.RLock() o, known := st.observed[key] if known && st.overflowSince.Load() == 0 { o.lastUpdate.Store(ts.Unix()) diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/sample_tracker_test.go similarity index 83% rename from pkg/costattribution/tracker_test.go rename to pkg/costattribution/sample_tracker_test.go index ee4599f30b6..d32213ca718 100644 --- 
a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/sample_tracker_test.go @@ -17,14 +17,12 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" ) -func TestTracker_hasSameLabels(t *testing.T) { +func TestSampleTracker_hasSameLabels(t *testing.T) { st := newTestManager().SampleTracker("user1") assert.True(t, st.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") - ast := newTestManager().ActiveSeriesTracker("user1") - assert.True(t, ast.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") } -func TestTracker_IncrementReceviedSamples(t *testing.T) { +func TestSampleTracker_IncrementReceviedSamples(t *testing.T) { tManager := newTestManager() st := tManager.SampleTracker("user4") t.Run("One Single Series in Request", func(t *testing.T) { @@ -68,55 +66,7 @@ func TestTracker_IncrementReceviedSamples(t *testing.T) { }) } -func TestTracker_CreateDelete(t *testing.T) { - tManager := newTestManager() - st := tManager.SampleTracker("user4") - ast := tManager.ActiveSeriesTracker("user4") - - ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) - ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) - ast.Decrement(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) - st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) - st.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) - ast.Increment(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) - - expectedMetrics := ` - # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
- # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2 - # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. - # TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 - cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 - ` - - metricNames := []string{ - "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", - "cortex_ingester_attributed_active_series", - } - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) - - // The purge only apply to the sample tracker. - assert.Equal(t, []string{"foo"}, st.inactiveObservations(time.Unix(5, 0))) - assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) - - expectedMetrics = ` - # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
- # TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 - cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 - ` - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) - tManager.deleteSampleTracker("user4") - tManager.deleteActiveTracker("user4") - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(""), metricNames...)) -} - -func TestTracker_updateCounters(t *testing.T) { +func TestSampleTracker_updateCounters(t *testing.T) { st := newTestManager().SampleTracker("user3") lbls1 := []mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}} lbls2 := []mimirpb.LabelAdapter{{Name: "department", Value: "bar"}, {Name: "service", Value: "baz"}} @@ -135,7 +85,7 @@ func TestTracker_updateCounters(t *testing.T) { assert.Equal(t, int64(3), st.overflowSince.Load(), "Fourth observation, should stay overflow") } -func TestTracker_inactiveObservations(t *testing.T) { +func TestSampleTracker_inactiveObservations(t *testing.T) { // Setup the test environment: create a st for user1 with a "team" label and max cardinality of 5. 
st := newTestManager().SampleTracker("user1") @@ -169,9 +119,9 @@ func TestTracker_inactiveObservations(t *testing.T) { assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) } -func TestTracker_Concurrency(t *testing.T) { +func TestSampleTracker_Concurrency(t *testing.T) { m := newTestManager() - ast := m.ActiveSeriesTracker("user1") + st := m.SampleTracker("user1") var wg sync.WaitGroup var i int64 @@ -179,21 +129,73 @@ func TestTracker_Concurrency(t *testing.T) { wg.Add(1) go func(i int64) { defer wg.Done() - lbls := labels.FromStrings("team", string(rune('A'+(i%26)))) - ast.Increment(lbls, time.Unix(i, 0)) + st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", string(rune('A' + (i % 26)))}, SamplesCount: 1}}), time.Unix(i, 0)) + st.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: string(rune('A' + (i % 26)))}}, 1, "sample-out-of-order", time.Unix(i, 0)) }(i) } wg.Wait() - // Verify no data races or inconsistencies - assert.True(t, len(ast.observed) > 0, "Observed set should not be empty after concurrent updates") - assert.LessOrEqual(t, len(ast.observed), 2*ast.maxCardinality, "Observed count should not exceed 2 times of max cardinality") - assert.NotEqual(t, 0, ast.overflowSince.Load(), "Expected state to be Overflow") + // Verify no data races or inconsistencies, since after 5 all the samples will be counted into the overflow, so the count should be 95 + assert.True(t, len(st.observed) > 0, "Observed set should not be empty after concurrent updates") + assert.LessOrEqual(t, len(st.observed), st.maxCardinality, "Observed count should not exceed max cardinality") + assert.NotEqual(t, 0, st.overflowSince.Load(), "Expected state to be Overflow") expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="__overflow__",team="__overflow__",tenant="user1",tracker="cost-attribution"} 95 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{team="__overflow__",tenant="user1",tracker="cost-attribution"} 95 + +` + assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total", "cortex_discarded_attributed_samples_total")) +} + +func TestTracker_CreateDelete(t *testing.T) { + tManager := newTestManager() + st := tManager.SampleTracker("user4") + ast := tManager.ActiveSeriesTracker("user4") + + ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) + ast.Decrement(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) + st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) + st.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) + ast.Increment(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2 # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
# TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{team="__overflow__",tenant="user1",tracker="cost-attribution"} 100 -` - assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_ingester_attributed_active_series")) + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 + ` + + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + "cortex_ingester_attributed_active_series", + } + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) + + // The purge only apply to the sample tracker. + assert.Equal(t, []string{"foo"}, st.inactiveObservations(time.Unix(5, 0))) + assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) + + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) + tManager.deleteSampleTracker("user4") + tManager.deleteActiveTracker("user4") + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(""), metricNames...)) } From 78ea8398413c26de939af47e3dd0cd6032066ac7 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 14 Jan 2025 20:26:07 +0100 Subject: [PATCH 080/105] correct the metrics name --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 2 +- .../configuration-parameters/index.md | 4 +- pkg/costattribution/manager_test.go | 26 +++++------ pkg/costattribution/sample_tracker.go | 2 +- pkg/costattribution/sample_tracker_test.go | 44 +++++++++---------- pkg/util/validation/limits.go | 2 +- 8 files changed, 42 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 464581712c1..787f5bc46b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 * [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. 
#10375 #10403 -* [CHANGE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 +* [CHANGE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_distributor_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 22819788151..289431fdb20 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4372,7 +4372,7 @@ "kind": "field", "name": "cost_attribution_labels", "required": false, - "desc": "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}.", + "desc": "Defines labels for cost attribution. Applies to metrics like cortex_distributor_received_attributed_samples_total. 
To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}.", "fieldValue": null, "fieldDefaultValue": "", "fieldFlag": "validation.cost-attribution-labels", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index b905ba39f68..548f39ad5ad 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -3328,7 +3328,7 @@ Usage of ./cmd/mimir/mimir: -validation.cost-attribution-cooldown duration [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit. -validation.cost-attribution-labels comma-separated-list-of-strings - [experimental] Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}. + [experimental] Defines labels for cost attribution. Applies to metrics like cortex_distributor_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. 
(default 10m) -validation.enforce-metadata-metric-name diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 2a3b57c8c32..67b1fc874a0 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3593,9 +3593,9 @@ The `limits` block configures default and per-tenant limits imposed by component [active_series_results_max_size_bytes: | default = 419430400] # (experimental) Defines labels for cost attribution. Applies to metrics like -# cortex_received_attributed_samples_total. To disable, set to an empty string. +# cortex_distributor_received_attributed_samples_total. To disable, set to an empty string. # For example, 'team,service' produces metrics such as -# cortex_received_attributed_samples_total{team='frontend', service='api'}. +# cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}. # CLI flag: -validation.cost-attribution-labels [cost_attribution_labels: | default = ""] diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index e9f9c504b9c..a924c4acfff 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -61,11 +61,11 @@ func TestManager_CreateDeleteTracker(t *testing.T) { # TYPE cortex_discarded_attributed_samples_total counter cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="bar",tenant="user1",tracker="cost-attribution"} 1 cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
- # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total")) }) t.Run("Purge inactive attributions", func(t *testing.T) { @@ -87,11 +87,11 @@ func TestManager_CreateDeleteTracker(t *testing.T) { assert.Equal(t, 1, len(manager.sampleTrackersByUserID)) expectedMetrics := ` - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total")) }) t.Run("Updating user cardinality and labels", func(t *testing.T) { @@ -116,11 +116,11 @@ func TestManager_CreateDeleteTracker(t *testing.T) { manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "baz", "feature", "baz"}, SamplesCount: 1}}), time.Unix(16, 0)) manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"team", "foo", "feature", "foo"}, SamplesCount: 1}}), time.Unix(17, 0)) expectedMetrics := ` - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="cost-attribution"} 2 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="cost-attribution"} 2 ` - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_distributor_received_attributed_samples_total")) }) } @@ -170,6 +170,6 @@ func TestManager_PurgeInactiveAttributionsUntil(t *testing.T) { assert.Equal(t, 1, len(manager.sampleTrackersByUserID), "Expected one active tracker after full purge") // No metrics should remain after all purged - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total")) }) } diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 69b30166afd..f1a4f648456 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -72,7 +72,7 @@ func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown variableLabels, prometheus.Labels{trackerLabel: defaultTrackerName}) - tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_received_attributed_samples_total", + tracker.receivedSamplesAttribution = prometheus.NewDesc("cortex_distributor_received_attributed_samples_total", "The total number of samples that were received per attribution.", variableLabels[:len(variableLabels)-1], prometheus.Labels{trackerLabel: defaultTrackerName}) diff --git a/pkg/costattribution/sample_tracker_test.go b/pkg/costattribution/sample_tracker_test.go index d32213ca718..68a2d5f19a5 
100644 --- a/pkg/costattribution/sample_tracker_test.go +++ b/pkg/costattribution/sample_tracker_test.go @@ -29,11 +29,11 @@ func TestSampleTracker_IncrementReceviedSamples(t *testing.T) { st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "service", "dodo"}, SamplesCount: 3}}), time.Unix(10, 0)) expectedMetrics := ` - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 3 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 3 ` - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_distributor_received_attributed_samples_total")) }) t.Run("Multiple Different Series in Request", func(t *testing.T) { st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{ @@ -42,12 +42,12 @@ func TestSampleTracker_IncrementReceviedSamples(t *testing.T) { }), time.Unix(20, 0)) expectedMetrics := ` - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. 
- # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 6 - cortex_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 6 + cortex_distributor_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 ` - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_distributor_received_attributed_samples_total")) }) t.Run("Multiple Series in Request with Same Labels", func(t *testing.T) { @@ -57,12 +57,12 @@ func TestSampleTracker_IncrementReceviedSamples(t *testing.T) { }), time.Unix(30, 0)) expectedMetrics := ` - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 14 - cortex_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 14 + cortex_distributor_received_attributed_samples_total{platform="bar",tenant="user4",tracker="cost-attribution"} 5 ` - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), "cortex_distributor_received_attributed_samples_total")) }) } @@ -144,12 +144,12 @@ func TestSampleTracker_Concurrency(t *testing.T) { # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter cortex_discarded_attributed_samples_total{reason="__overflow__",team="__overflow__",tenant="user1",tracker="cost-attribution"} 95 - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{team="__overflow__",tenant="user1",tracker="cost-attribution"} 95 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{team="__overflow__",tenant="user1",tracker="cost-attribution"} 95 ` - assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total", "cortex_discarded_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_distributor_received_attributed_samples_total", "cortex_discarded_attributed_samples_total")) } func TestTracker_CreateDelete(t *testing.T) { @@ -172,14 +172,14 @@ func TestTracker_CreateDelete(t *testing.T) { # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 - # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_received_attributed_samples_total counter - cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 + # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. 
+ # TYPE cortex_distributor_received_attributed_samples_total counter + cortex_distributor_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 ` metricNames := []string{ "cortex_discarded_attributed_samples_total", - "cortex_received_attributed_samples_total", + "cortex_distributor_received_attributed_samples_total", "cortex_ingester_attributed_active_series", } assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index b9583801fc6..639de965bbb 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -310,7 +310,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") - f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_received_attributed_samples_total{team='frontend', service='api'}.") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_distributor_received_attributed_samples_total. To disable, set to an empty string. 
For example, 'team,service' produces metrics such as cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}.") f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user, the value is capped at 4.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.") From f0b0b401aaae31c46c037bfd851e2a4fdb30c55f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 15 Jan 2025 10:32:46 +0100 Subject: [PATCH 081/105] fix lint --- .../mimir/configure/configuration-parameters/index.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 67b1fc874a0..fe84ce414c5 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3593,9 +3593,10 @@ The `limits` block configures default and per-tenant limits imposed by component [active_series_results_max_size_bytes: | default = 419430400] # (experimental) Defines labels for cost attribution. Applies to metrics like -# cortex_distributor_received_attributed_samples_total. To disable, set to an empty string. 
-# For example, 'team,service' produces metrics such as -# cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}. +# cortex_distributor_received_attributed_samples_total. To disable, set to an +# empty string. For example, 'team,service' produces metrics such as +# cortex_distributor_received_attributed_samples_total{team='frontend', +# service='api'}. # CLI flag: -validation.cost-attribution-labels [cost_attribution_labels: | default = ""] From fbb2fac8a1b1f52af2fabf1ccdda683592ed22f7 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Wed, 15 Jan 2025 16:31:55 +0100 Subject: [PATCH 082/105] update examples --- cmd/mimir/config-descriptor.json | 4 ++-- cmd/mimir/help-all.txt.tmpl | 4 ++-- .../mimir-microservices-mode/config/mimir.yaml | 2 +- .../configuration-parameters/index.md | 18 ++++++++---------- pkg/mimir/mimir.go | 2 +- pkg/util/validation/limits.go | 2 +- 6 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 289431fdb20..09223021c43 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4405,7 +4405,7 @@ "kind": "field", "name": "cost_attribution_cooldown", "required": false, - "desc": "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. 
A reset occurs when the cardinality falls below the limit.", + "desc": "Defines how long cost attribution stays in overflow before attempting a reset, with received/discarded samples extending the cooldown if overflow persists, while active series reset and restart tracking after the cooldown.", "fieldValue": null, "fieldDefaultValue": 0, "fieldFlag": "validation.cost-attribution-cooldown", @@ -19698,7 +19698,7 @@ "kind": "field", "name": "cost_attribution_eviction_interval", "required": false, - "desc": "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.", + "desc": "Specifies how often inactive cost attributions for received and discarded sample trackers are evicted from the counter, ensuring they do not contribute to the cost attribution cardinality per user limit. This setting does not apply to active series, which are managed separately.", "fieldValue": null, "fieldDefaultValue": 1200000000000, "fieldFlag": "cost-attribution.eviction-interval", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 548f39ad5ad..a18c0b6636b 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1286,7 +1286,7 @@ Usage of ./cmd/mimir/mimir: -cost-attribution.cleanup-interval duration [experimental] Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged. (default 3m0s) -cost-attribution.eviction-interval duration - [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. (default 20m0s) + [experimental] Specifies how often inactive cost attributions for received and discarded sample trackers are evicted from the counter, ensuring they do not contribute to the cost attribution cardinality per user limit. 
This setting does not apply to active series, which are managed separately. (default 20m0s) -cost-attribution.registry-path string [experimental] Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. If not specified, cost attribution metrics aren't exposed. -debug.block-profile-rate int @@ -3326,7 +3326,7 @@ Usage of ./cmd/mimir/mimir: -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") -validation.cost-attribution-cooldown duration - [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit. + [experimental] Defines how long cost attribution stays in overflow before attempting a reset, with received/discarded samples extending the cooldown if overflow persists, while active series reset and restart tracking after the cooldown. -validation.cost-attribution-labels comma-separated-list-of-strings [experimental] Defines labels for cost attribution. Applies to metrics like cortex_distributor_received_attributed_samples_total. To disable, set to an empty string. For example, 'team,service' produces metrics such as cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}. 
-validation.create-grace-period duration diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 84d5c219039..e23680989d8 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -186,7 +186,7 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 - cost_attribution_labels: "container" + cost_attribution_labels: "container,instance" max_cost_attribution_labels_per_user: 2 max_cost_attribution_cardinality_per_user: 100 cost_attribution_cooldown: 20m diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index fe84ce414c5..ba8264604a1 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -456,9 +456,10 @@ overrides_exporter: # CLI flag: -timeseries-unmarshal-caching-optimization-enabled [timeseries_unmarshal_caching_optimization_enabled: | default = true] -# (experimental) Time interval at which inactive cost attributions are evicted -# from the counter, ensuring they are not included in the cost attribution -# cardinality per user limit. +# (experimental) Specifies how often inactive cost attributions for received and +# discarded sample trackers are evicted from the counter, ensuring they do not +# contribute to the cost attribution cardinality per user limit. This setting +# does not apply to active series, which are managed separately. # CLI flag: -cost-attribution.eviction-interval [cost_attribution_eviction_interval: | default = 20m] @@ -3610,13 +3611,10 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -validation.max-cost-attribution-cardinality-per-user [max_cost_attribution_cardinality_per_user: | default = 10000] -# (experimental) Cooldown period for cost attribution labels. 
Specifies the -# duration the cost attribution remains in overflow before attempting a reset. -# If the cardinality remains above the limit after this period, the system stays -# in overflow mode and extends the cooldown. Setting this value to 0 disables -# the cooldown, causing the system to continuously check whether the cardinality -# has dropped below the limit. A reset occurs when the cardinality falls below -# the limit. +# (experimental) Defines how long cost attribution stays in overflow before +# attempting a reset, with received/discarded samples extending the cooldown if +# overflow persists, while active series reset and restart tracking after the +# cooldown. # CLI flag: -validation.cost-attribution-cooldown [cost_attribution_cooldown: | default = 0s] diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 540d0f7bbdd..dfec23cad87 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -179,7 +179,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir exposes cost attribution metrics through this custom path. 
If not specified, cost attribution metrics aren't exposed.") - f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Specifies how often inactive cost attributions for received and discarded sample trackers are evicted from the counter, ensuring they do not contribute to the cost attribution cardinality per user limit. This setting does not apply to active series, which are managed separately.") f.DurationVar(&c.CostAttributionCleanupInterval, "cost-attribution.cleanup-interval", 3*time.Minute, "Time interval at which the cost attribution cleanup process runs, ensuring inactive cost attribution entries are purged.") c.API.RegisterFlags(f) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 639de965bbb..f2c649d3af2 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -313,7 +313,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "Defines labels for cost attribution. Applies to metrics like cortex_distributor_received_attributed_samples_total. To disable, set to an empty string. 
For example, 'team,service' produces metrics such as cortex_distributor_received_attributed_samples_total{team='frontend', service='api'}.") f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user, the value is capped at 4.") f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") - f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system stays in overflow mode and extends the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset occurs when the cardinality falls below the limit.") + f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Defines how long cost attribution stays in overflow before attempting a reset, with received/discarded samples extending the cooldown if overflow persists, while active series reset and restart tracking after the cooldown.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. 
Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") From 70c1d9e0caba3e48620a1df000d61a44f2d313f0 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 11:51:56 +0100 Subject: [PATCH 083/105] remove test files --- development/mimir-microservices-mode/config/mimir.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index e23680989d8..5d245999115 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,6 +1,4 @@ multitenancy_enabled: false -cost_attribution_registry_path: "/usage-metrics" -cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -185,11 +183,6 @@ limits: ha_cluster_label: ha_cluster ha_replica_label: ha_replica ha_max_clusters: 10 - - cost_attribution_labels: "container,instance" - max_cost_attribution_labels_per_user: 2 - max_cost_attribution_cardinality_per_user: 100 - cost_attribution_cooldown: 20m runtime_config: file: ./config/runtime.yaml From 39b888f0490a5e0b496c85f617af54e8b2975a37 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 12:14:56 +0100 Subject: [PATCH 084/105] change tests --- pkg/costattribution/active_tracker_test.go | 6 ++- pkg/costattribution/manager_test.go | 18 +++++--- pkg/costattribution/sample_tracker_test.go | 49 ---------------------- 3 files changed, 18 insertions(+), 55 deletions(-) diff --git a/pkg/costattribution/active_tracker_test.go b/pkg/costattribution/active_tracker_test.go index 0d0984eee6f..355a3ef00a3 100644 --- a/pkg/costattribution/active_tracker_test.go +++ b/pkg/costattribution/active_tracker_test.go @@ -18,7 +18,7 @@ func 
TestActiveTracker_hasSameLabels(t *testing.T) { assert.True(t, ast.hasSameLabels([]string{"team"}), "Expected cost attribution labels mismatch") } -func TestActiveTracker_updateCounters(t *testing.T) { +func TestActiveTracker_IncrementDecrement(t *testing.T) { ast := newTestManager().ActiveSeriesTracker("user3") lbls1 := labels.FromStrings("department", "foo", "service", "bar") lbls2 := labels.FromStrings("department", "bar", "service", "baz") @@ -26,6 +26,7 @@ func TestActiveTracker_updateCounters(t *testing.T) { ast.Increment(lbls1, time.Unix(1, 0)) assert.Equal(t, int64(0), ast.overflowSince.Load(), "First observation, should not overflow") + assert.Equal(t, 1, len(ast.observed)) ast.Decrement(lbls1) assert.Equal(t, int64(0), ast.overflowSince.Load(), "First observation decremented, should not overflow") @@ -34,12 +35,15 @@ func TestActiveTracker_updateCounters(t *testing.T) { ast.Increment(lbls1, time.Unix(2, 0)) ast.Increment(lbls2, time.Unix(2, 0)) assert.Equal(t, int64(0), ast.overflowSince.Load(), "Second observation, should not overflow") + assert.Equal(t, 2, len(ast.observed)) ast.Increment(lbls3, time.Unix(3, 0)) assert.Equal(t, int64(3), ast.overflowSince.Load(), "Third observation, should overflow") + assert.Equal(t, 2, len(ast.observed)) ast.Increment(lbls3, time.Unix(4, 0)) assert.Equal(t, int64(3), ast.overflowSince.Load(), "Fourth observation, should stay overflow") + assert.Equal(t, 2, len(ast.observed)) } func TestActiveTracker_Concurrency(t *testing.T) { diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index a924c4acfff..550e1d67a77 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -10,6 +10,7 @@ import ( "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" 
"github.com/grafana/mimir/pkg/costattribution/testutils" @@ -55,7 +56,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { manager.SampleTracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "bar"}}, 1, "invalid-metrics-name", time.Unix(6, 0)) manager.SampleTracker("user1").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(12, 0)) manager.SampleTracker("user3").IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"department", "foo", "service", "dodo"}, SamplesCount: 1}}), time.Unix(20, 0)) - + manager.ActiveSeriesTracker("user1").Increment(labels.FromStrings("team", "bar"), time.Unix(10, 0)) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter @@ -64,19 +65,25 @@ func TestManager_CreateDeleteTracker(t *testing.T) { # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_distributor_received_attributed_samples_total counter cortex_distributor_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{team="bar",tenant="user1",tracker="cost-attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total", "cortex_ingester_attributed_active_series")) }) - t.Run("Purge inactive attributions", func(t *testing.T) { + t.Run("Purge inactive attributions, only received/discarded samples are purged", func(t *testing.T) { err := manager.purgeInactiveAttributionsUntil(time.Unix(10, 0)) assert.NoError(t, err) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{team="bar",tenant="user1",tracker="cost-attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_ingester_attributed_active_series")) }) t.Run("Disabling user cost attribution", func(t *testing.T) { @@ -91,7 +98,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { # TYPE cortex_distributor_received_attributed_samples_total counter cortex_distributor_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 ` - assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total")) + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_distributor_received_attributed_samples_total", "cortex_ingester_attributed_active_series")) }) t.Run("Updating user cardinality and labels", func(t *testing.T) { @@ -101,6 +108,7 @@ func TestManager_CreateDeleteTracker(t *testing.T) { assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0))) assert.Equal(t, 1, len(manager.sampleTrackersByUserID)) assert.True(t, manager.SampleTracker("user3").hasSameLabels([]string{"feature", "team"})) + assert.True(t, manager.ActiveSeriesTracker("user3").hasSameLabels([]string{"feature", "team"})) manager.SampleTracker("user3").IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "team", Value: "foo"}}, 1, "invalid-metrics-name", time.Unix(13, 0)) expectedMetrics := ` diff --git a/pkg/costattribution/sample_tracker_test.go b/pkg/costattribution/sample_tracker_test.go 
index 68a2d5f19a5..5f2e830b18f 100644 --- a/pkg/costattribution/sample_tracker_test.go +++ b/pkg/costattribution/sample_tracker_test.go @@ -9,7 +9,6 @@ import ( "time" "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -151,51 +150,3 @@ func TestSampleTracker_Concurrency(t *testing.T) { ` assert.NoError(t, testutil.GatherAndCompare(m.reg, strings.NewReader(expectedMetrics), "cortex_distributor_received_attributed_samples_total", "cortex_discarded_attributed_samples_total")) } - -func TestTracker_CreateDelete(t *testing.T) { - tManager := newTestManager() - st := tManager.SampleTracker("user4") - ast := tManager.ActiveSeriesTracker("user4") - - ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) - ast.Increment(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) - ast.Decrement(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) - st.IncrementReceivedSamples(testutils.CreateRequest([]testutils.Series{{LabelValues: []string{"platform", "foo", "team", "1"}, SamplesCount: 5}}), time.Unix(4, 0)) - st.IncrementDiscardedSamples([]mimirpb.LabelAdapter{{Name: "platform", Value: "foo"}, {Name: "team", Value: "1"}}, 2, "sample-out-of-order", time.Unix(4, 0)) - ast.Increment(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) - - expectedMetrics := ` - # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. - # TYPE cortex_discarded_attributed_samples_total counter - cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2 - # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
- # TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 - cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 - # HELP cortex_distributor_received_attributed_samples_total The total number of samples that were received per attribution. - # TYPE cortex_distributor_received_attributed_samples_total counter - cortex_distributor_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 - ` - - metricNames := []string{ - "cortex_discarded_attributed_samples_total", - "cortex_distributor_received_attributed_samples_total", - "cortex_ingester_attributed_active_series", - } - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) - - // The purge only apply to the sample tracker. - assert.Equal(t, []string{"foo"}, st.inactiveObservations(time.Unix(5, 0))) - assert.NoError(t, tManager.purgeInactiveAttributionsUntil(time.Unix(5, 0))) - - expectedMetrics = ` - # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
- # TYPE cortex_ingester_attributed_active_series gauge - cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 - cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 - ` - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(expectedMetrics), metricNames...)) - tManager.deleteSampleTracker("user4") - tManager.deleteActiveTracker("user4") - assert.NoError(t, testutil.GatherAndCompare(tManager.reg, strings.NewReader(""), metricNames...)) -} From 23ca840cdf10f035d89f3267dedb132c575fd159 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 12:18:54 +0100 Subject: [PATCH 085/105] rename cat to cast --- pkg/distributor/validate.go | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 9f42acf1eba..34c2ab838e2 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -239,17 +239,17 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.SampleTracker) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cast *costattribution.SampleTracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -260,23 +260,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.SampleTracker) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cast *costattribution.SampleTracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { - cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { - cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { - cat.IncrementDiscardedSamples(ls, 1, reasonInvalidNativeHistogramSchema, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -290,7 +290,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { - 
cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -298,7 +298,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { - cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) + cast.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -400,16 +400,16 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. 
-func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.SampleTracker, ts time.Time) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cast *costattribution.SampleTracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { - cat.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { - cat.IncrementDiscardedSamples(ls, 1, reasonInvalidMetricName, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } @@ -418,13 +418,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerInfoSeries, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerSeries, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return 
fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -436,22 +436,22 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() - cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabel, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { - cat.IncrementDiscardedSamples(ls, 1, reasonLabelNameTooLong, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { - cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabelValue, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonInvalidLabelValue, ts) m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, validUTF8Message(l.Value), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { - cat.IncrementDiscardedSamples(ls, 1, reasonLabelValueTooLong, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { - cat.IncrementDiscardedSamples(ls, 1, reasonDuplicateLabelNames, ts) + cast.IncrementDiscardedSamples(ls, 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } From d7886f01735e6139f42737764cb0706bf1641623 Mon Sep 17 00:00:00 2001 From: 
Ying WANG Date: Thu, 16 Jan 2025 12:21:10 +0100 Subject: [PATCH 086/105] remove the unnecessary indent --- pkg/ingester/activeseries/active_series.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 602264c9ae0..3011a34ca92 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -67,7 +67,8 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 + oldestEntryTs atomic.Int64 + mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry activeSeriesAttributionFailureCounter atomic.Float64 From 5a7dbbbfa0903a0f9551c64dc9aeb3a1a63e4cc9 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 12:26:47 +0100 Subject: [PATCH 087/105] format --- pkg/ingester/activeseries/active_series.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 3011a34ca92..6165287eb40 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -69,17 +69,17 @@ type seriesStripe struct { // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - mu sync.RWMutex - refs map[storage.SeriesRef]seriesEntry + mu sync.RWMutex + refs map[storage.SeriesRef]seriesEntry + active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. + activeMatching []uint32 // Number of active entries in this stripe matching each matcher of the configured Matchers. + activeNativeHistograms uint32 // Number of active entries (only native histograms) in this stripe. Only decreased during purge or clear. 
+ activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. + activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. + activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + + caat *costattribution.ActiveSeriesTracker activeSeriesAttributionFailureCounter atomic.Float64 - active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. - activeMatching []uint32 // Number of active entries in this stripe matching each matcher of the configured Matchers. - activeNativeHistograms uint32 // Number of active entries (only native histograms) in this stripe. Only decreased during purge or clear. - activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. - activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. - activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - - caat *costattribution.ActiveSeriesTracker } // seriesEntry holds a timestamp for single series. 
From 5c26a0b4dcf3040586b33dafe91fa403c8be0245 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 13:51:46 +0100 Subject: [PATCH 088/105] move the order function to the caller --- pkg/costattribution/active_tracker.go | 15 ++++++--------- pkg/costattribution/manager.go | 8 ++++++-- pkg/costattribution/sample_tracker.go | 15 ++++++--------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 0dc796fbbf5..603c73eaa6d 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -31,21 +31,18 @@ type ActiveSeriesTracker struct { } func newActiveSeriesTracker(userID string, trackedLabels []string, limit int, cooldownDuration time.Duration, logger log.Logger) *ActiveSeriesTracker { - orderedLables := slices.Clone(trackedLabels) - slices.Sort(orderedLables) - // Create a map for overflow labels to export when overflow happens - overflowLabels := make([]string, len(orderedLables)+2) - for i := range orderedLables { + overflowLabels := make([]string, len(trackedLabels)+2) + for i := range trackedLabels { overflowLabels[i] = overflowValue } - overflowLabels[len(orderedLables)] = userID - overflowLabels[len(orderedLables)+1] = overflowValue + overflowLabels[len(trackedLabels)] = userID + overflowLabels[len(trackedLabels)+1] = overflowValue ast := &ActiveSeriesTracker{ userID: userID, - labels: orderedLables, + labels: trackedLabels, maxCardinality: limit, observed: make(map[string]*atomic.Int64), logger: logger, @@ -53,7 +50,7 @@ func newActiveSeriesTracker(userID string, trackedLabels []string, limit int, co cooldownDuration: cooldownDuration, } - variableLabels := slices.Clone(orderedLables) + variableLabels := slices.Clone(trackedLabels) variableLabels = append(variableLabels, tenantLabel, "reason") ast.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", diff --git 
a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 914698b316b..143a07ba886 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -94,7 +94,9 @@ func (m *Manager) SampleTracker(userID string) *SampleTracker { if tracker, exists = m.sampleTrackersByUserID[userID]; exists { return tracker } - tracker = newSampleTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) + orderedLables := slices.Clone(labels) + slices.Sort(orderedLables) + tracker = newSampleTracker(userID, orderedLables, maxCardinality, cooldownDuration, m.logger) m.sampleTrackersByUserID[userID] = tracker return tracker } @@ -123,7 +125,9 @@ func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { return tracker } - tracker = newActiveSeriesTracker(userID, labels, maxCardinality, cooldownDuration, m.logger) + orderedLables := slices.Clone(labels) + slices.Sort(orderedLables) + tracker = newActiveSeriesTracker(userID, orderedLables, maxCardinality, cooldownDuration, m.logger) m.activeTrackersByUserID[userID] = tracker return tracker } diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index f1a4f648456..3744cb74091 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -42,21 +42,18 @@ type SampleTracker struct { } func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *SampleTracker { - orderedLables := slices.Clone(trackedLabels) - slices.Sort(orderedLables) - // Create a map for overflow labels to export when overflow happens - overflowLabels := make([]string, len(orderedLables)+2) - for i := range orderedLables { + overflowLabels := make([]string, len(trackedLabels)+2) + for i := range trackedLabels { overflowLabels[i] = overflowValue } - overflowLabels[len(orderedLables)] = userID - overflowLabels[len(orderedLables)+1] = overflowValue + overflowLabels[len(trackedLabels)] = 
userID + overflowLabels[len(trackedLabels)+1] = overflowValue tracker := &SampleTracker{ userID: userID, - labels: orderedLables, + labels: trackedLabels, maxCardinality: limit, observed: make(map[string]*observation), cooldownDuration: cooldown, @@ -65,7 +62,7 @@ func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown overflowCounter: observation{}, } - variableLabels := slices.Clone(orderedLables) + variableLabels := slices.Clone(trackedLabels) variableLabels = append(variableLabels, tenantLabel, "reason") tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", "The total number of samples that were discarded per attribution.", From 56beeaa605db1afd61b2c9a2a8295105fd81ead6 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 13:55:21 +0100 Subject: [PATCH 089/105] add comments --- pkg/costattribution/manager.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 143a07ba886..69eda444c8f 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -94,8 +94,11 @@ func (m *Manager) SampleTracker(userID string) *SampleTracker { if tracker, exists = m.sampleTrackersByUserID[userID]; exists { return tracker } + + // sort the labels to ensure the order is consistent orderedLables := slices.Clone(labels) slices.Sort(orderedLables) + tracker = newSampleTracker(userID, orderedLables, maxCardinality, cooldownDuration, m.logger) m.sampleTrackersByUserID[userID] = tracker return tracker @@ -125,8 +128,10 @@ func (m *Manager) ActiveSeriesTracker(userID string) *ActiveSeriesTracker { return tracker } + // sort the labels to ensure the order is consistent orderedLables := slices.Clone(labels) slices.Sort(orderedLables) + tracker = newActiveSeriesTracker(userID, orderedLables, maxCardinality, cooldownDuration, m.logger) m.activeTrackersByUserID[userID] = tracker return tracker From 
21a6d3a32ba91e6fa0784ee2de5e5c263ce4ec88 Mon Sep 17 00:00:00 2001 From: Ying WANG <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:57:18 +0100 Subject: [PATCH 090/105] Update pkg/costattribution/sample_tracker.go Co-authored-by: Oleg Zaytsev --- pkg/costattribution/sample_tracker.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 3744cb74091..84cad718238 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -132,7 +132,8 @@ func (st *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now return } - // We precompute the cost attribution per request before update Observations and State to avoid frequently update the atomic counters + // We precompute the cost attribution per request before update Observations and State to avoid frequently update the atomic counters. + // This is based on the assumption that usually a single WriteRequest will have samples that belong to the same or few cost attribution groups. 
dict := make(map[string]int) buf := bufferPool.Get().(*bytes.Buffer) buf.Reset() From 73a988160668f510d1b2d89866427fc3fe47d5c8 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 14:02:51 +0100 Subject: [PATCH 091/105] remove useless function --- pkg/costattribution/sample_tracker.go | 16 ++++++---------- pkg/costattribution/sample_tracker_test.go | 14 +++++++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 84cad718238..d408c5d638c 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -120,11 +120,15 @@ func (st *SampleTracker) Collect(out chan<- prometheus.Metric) { } } -func (st *SampleTracker) IncrementDiscardedSamples(lbs []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { +func (st *SampleTracker) IncrementDiscardedSamples(lbls []mimirpb.LabelAdapter, value float64, reason string, now time.Time) { if st == nil { return } - st.updateCountersWithLabelAdapter(lbs, now, 0, value, &reason) + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + st.fillKeyFromLabelAdapters(lbls, buf) + st.updateObservations(buf.String(), now, 0, value, &reason) } func (st *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now time.Time) { @@ -153,14 +157,6 @@ func (st *SampleTracker) IncrementReceivedSamples(req *mimirpb.WriteRequest, now } } -func (st *SampleTracker) updateCountersWithLabelAdapter(lbls []mimirpb.LabelAdapter, ts time.Time, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { - buf := bufferPool.Get().(*bytes.Buffer) - buf.Reset() - defer bufferPool.Put(buf) - st.fillKeyFromLabelAdapters(lbls, buf) - st.updateObservations(buf.String(), ts, receivedSampleIncrement, discardedSampleIncrement, reason) -} - func (st *SampleTracker) fillKeyFromLabelAdapters(lbls []mimirpb.LabelAdapter, buf *bytes.Buffer) { buf.Reset() var 
exists bool diff --git a/pkg/costattribution/sample_tracker_test.go b/pkg/costattribution/sample_tracker_test.go index 5f2e830b18f..68f36254339 100644 --- a/pkg/costattribution/sample_tracker_test.go +++ b/pkg/costattribution/sample_tracker_test.go @@ -65,23 +65,27 @@ func TestSampleTracker_IncrementReceviedSamples(t *testing.T) { }) } -func TestSampleTracker_updateCounters(t *testing.T) { +func TestSampleTracker_IncrementDiscardedSamples(t *testing.T) { st := newTestManager().SampleTracker("user3") lbls1 := []mimirpb.LabelAdapter{{Name: "department", Value: "foo"}, {Name: "service", Value: "bar"}} lbls2 := []mimirpb.LabelAdapter{{Name: "department", Value: "bar"}, {Name: "service", Value: "baz"}} lbls3 := []mimirpb.LabelAdapter{{Name: "department", Value: "baz"}, {Name: "service", Value: "foo"}} - st.updateCountersWithLabelAdapter(lbls1, time.Unix(1, 0), 1, 0, nil) + st.IncrementDiscardedSamples(lbls1, 1, "", time.Unix(1, 0)) assert.Equal(t, int64(0), st.overflowSince.Load(), "First observation, should not overflow") + assert.Equal(t, 1, len(st.observed)) - st.updateCountersWithLabelAdapter(lbls2, time.Unix(2, 0), 1, 0, nil) + st.IncrementDiscardedSamples(lbls2, 1, "", time.Unix(2, 0)) assert.Equal(t, int64(0), st.overflowSince.Load(), "Second observation, should not overflow") + assert.Equal(t, 2, len(st.observed)) - st.updateCountersWithLabelAdapter(lbls3, time.Unix(3, 0), 1, 0, nil) + st.IncrementDiscardedSamples(lbls3, 1, "", time.Unix(3, 0)) assert.Equal(t, int64(3), st.overflowSince.Load(), "Third observation, should overflow") + assert.Equal(t, 2, len(st.observed)) - st.updateCountersWithLabelAdapter(lbls3, time.Unix(4, 0), 1, 0, nil) + st.IncrementDiscardedSamples(lbls3, 1, "", time.Unix(4, 0)) assert.Equal(t, int64(3), st.overflowSince.Load(), "Fourth observation, should stay overflow") + assert.Equal(t, 2, len(st.observed)) } func TestSampleTracker_inactiveObservations(t *testing.T) { From 6d27724c3424fdb1c226b1304d59743824216c8f Mon Sep 17 00:00:00 2001 
From: Ying WANG Date: Thu, 16 Jan 2025 14:16:45 +0100 Subject: [PATCH 092/105] change overflowSince to time.Time --- pkg/costattribution/sample_tracker.go | 33 ++++++++++++---------- pkg/costattribution/sample_tracker_test.go | 10 +++---- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index d408c5d638c..be41921b242 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -35,10 +35,11 @@ type SampleTracker struct { overflowLabels []string observed map[string]*observation observedMtx sync.RWMutex - overflowSince atomic.Int64 - overflowCounter observation - cooldownDuration time.Duration - logger log.Logger + // overflowSince is also protected by observedMtx, it is set when the max cardinality is exceeded + overflowSince time.Time + overflowCounter observation + cooldownDuration time.Duration + logger log.Logger } func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *SampleTracker { @@ -93,14 +94,16 @@ func (st *SampleTracker) cleanupTrackerAttribution(key string) { } func (st *SampleTracker) Collect(out chan<- prometheus.Metric) { - if st.overflowSince.Load() > 0 { + // We don't know the performance of out receiver, so we don't want to hold the lock for too long + var prometheusMetrics []prometheus.Metric + st.observedMtx.RLock() + + if !st.overflowSince.IsZero() { out <- prometheus.MustNewConstMetric(st.receivedSamplesAttribution, prometheus.CounterValue, st.overflowCounter.receivedSample.Load(), st.overflowLabels[:len(st.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(st.discardedSampleAttribution, prometheus.CounterValue, st.overflowCounter.totalDiscarded.Load(), st.overflowLabels...) 
return } - // We don't know the performance of out receiver, so we don't want to hold the lock for too long - var prometheusMetrics []prometheus.Metric - st.observedMtx.RLock() + for key, o := range st.observed { keys := strings.Split(key, string(sep)) keys = append(keys, st.userID) @@ -186,7 +189,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa // if overflowSince is set, we only update the overflow counter, this is after the read lock since overflowSince can only be set when holding observedMtx write lock // check it after read lock would make sure that we don't miss any updates - if st.overflowSince.Load() > 0 { + if !st.overflowSince.IsZero() { st.overflowCounter.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { st.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) @@ -196,7 +199,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa } o, known := st.observed[key] - if known && st.overflowSince.Load() == 0 { + if known { o.lastUpdate.Store(ts.Unix()) o.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { @@ -218,7 +221,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa defer st.observedMtx.Unlock() // If not in overflow, we update the observation if it exists, otherwise we check if create a new observation would exceed the max cardinality // if it does, we set the overflowSince - if st.overflowSince.Load() == 0 { + if st.overflowSince.IsZero() { o, known = st.observed[key] if known { o.lastUpdate.Store(ts.Unix()) @@ -236,12 +239,12 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa } // if it is not known, we need to check if the max cardinality is exceeded if len(st.observed) >= st.maxCardinality { - st.overflowSince.Store(ts.Unix()) + st.overflowSince = ts } } // if overflowSince is set, we only update the overflow counter - if 
st.overflowSince.Load() > 0 { + if !st.overflowSince.IsZero() { st.overflowCounter.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { st.overflowCounter.totalDiscarded.Add(discardedSampleIncrement) @@ -266,7 +269,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa func (st *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { st.observedMtx.RLock() - if st.overflowSince.Load() > 0 && time.Unix(st.overflowSince.Load(), 0).Add(st.cooldownDuration).Before(deadline) { + if !st.overflowSince.IsZero() && st.overflowSince.Add(st.cooldownDuration).Before(deadline) { if len(st.observed) <= st.maxCardinality { st.observedMtx.RUnlock() return true @@ -279,7 +282,7 @@ func (st *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { st.observedMtx.Unlock() return true } - st.overflowSince.Store(deadline.Unix()) + st.overflowSince = deadline st.observedMtx.Unlock() } else { st.observedMtx.RUnlock() diff --git a/pkg/costattribution/sample_tracker_test.go b/pkg/costattribution/sample_tracker_test.go index 68f36254339..967780a8401 100644 --- a/pkg/costattribution/sample_tracker_test.go +++ b/pkg/costattribution/sample_tracker_test.go @@ -72,19 +72,19 @@ func TestSampleTracker_IncrementDiscardedSamples(t *testing.T) { lbls3 := []mimirpb.LabelAdapter{{Name: "department", Value: "baz"}, {Name: "service", Value: "foo"}} st.IncrementDiscardedSamples(lbls1, 1, "", time.Unix(1, 0)) - assert.Equal(t, int64(0), st.overflowSince.Load(), "First observation, should not overflow") + assert.True(t, st.overflowSince.IsZero(), "First observation, should not overflow") assert.Equal(t, 1, len(st.observed)) st.IncrementDiscardedSamples(lbls2, 1, "", time.Unix(2, 0)) - assert.Equal(t, int64(0), st.overflowSince.Load(), "Second observation, should not overflow") + assert.True(t, st.overflowSince.IsZero(), "Second observation, should not overflow") assert.Equal(t, 2, len(st.observed)) 
st.IncrementDiscardedSamples(lbls3, 1, "", time.Unix(3, 0)) - assert.Equal(t, int64(3), st.overflowSince.Load(), "Third observation, should overflow") + assert.Equal(t, time.Unix(3, 0), st.overflowSince, "Third observation, should overflow") assert.Equal(t, 2, len(st.observed)) st.IncrementDiscardedSamples(lbls3, 1, "", time.Unix(4, 0)) - assert.Equal(t, int64(3), st.overflowSince.Load(), "Fourth observation, should stay overflow") + assert.Equal(t, time.Unix(3, 0), st.overflowSince, "Fourth observation, should stay overflow") assert.Equal(t, 2, len(st.observed)) } @@ -141,7 +141,7 @@ func TestSampleTracker_Concurrency(t *testing.T) { // Verify no data races or inconsistencies, since after 5 all the samples will be counted into the overflow, so the count should be 95 assert.True(t, len(st.observed) > 0, "Observed set should not be empty after concurrent updates") assert.LessOrEqual(t, len(st.observed), st.maxCardinality, "Observed count should not exceed max cardinality") - assert.NotEqual(t, 0, st.overflowSince.Load(), "Expected state to be Overflow") + assert.NotEqual(t, st.overflowSince.IsZero(), "Expected state to be Overflow") expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
From 5ac64f5df5a2f8442c7a93b3a124d854e8bebb1f Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 14:33:35 +0100 Subject: [PATCH 093/105] change the lock to RWMutex --- pkg/costattribution/sample_tracker.go | 42 ++++++++++++++++++--------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index be41921b242..ee69c5dbb0e 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -21,7 +21,7 @@ const sep = rune(0x80) type observation struct { lastUpdate atomic.Int64 receivedSample atomic.Float64 - discardedSampleMtx sync.Mutex + discardedSampleMtx sync.RWMutex discardedSample map[string]*atomic.Float64 totalDiscarded atomic.Float64 } @@ -110,11 +110,11 @@ func (st *SampleTracker) Collect(out chan<- prometheus.Metric) { if o.receivedSample.Load() > 0 { prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(st.receivedSamplesAttribution, prometheus.CounterValue, o.receivedSample.Load(), keys...)) } - o.discardedSampleMtx.Lock() + o.discardedSampleMtx.RLock() for reason, discarded := range o.discardedSample { prometheusMetrics = append(prometheusMetrics, prometheus.MustNewConstMetric(st.discardedSampleAttribution, prometheus.CounterValue, discarded.Load(), append(keys, reason)...)) } - o.discardedSampleMtx.Unlock() + o.discardedSampleMtx.RUnlock() } st.observedMtx.RUnlock() @@ -203,13 +203,20 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa o.lastUpdate.Store(ts.Unix()) o.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { - o.discardedSampleMtx.Lock() - if _, ok := o.discardedSample[*reason]; ok { - o.discardedSample[*reason].Add(discardedSampleIncrement) + o.discardedSampleMtx.RLock() + if r, ok := o.discardedSample[*reason]; ok { + r.Add(discardedSampleIncrement) + o.discardedSampleMtx.RUnlock() } else { - 
o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + o.discardedSampleMtx.RUnlock() + o.discardedSampleMtx.Lock() + if r, ok := o.discardedSample[*reason]; ok { + r.Add(discardedSampleIncrement) + } else { + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + } + o.discardedSampleMtx.Unlock() } - o.discardedSampleMtx.Unlock() } st.observedMtx.RUnlock() return @@ -227,13 +234,20 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa o.lastUpdate.Store(ts.Unix()) o.receivedSample.Add(receivedSampleIncrement) if discardedSampleIncrement > 0 && reason != nil { - o.discardedSampleMtx.Lock() - if _, ok := o.discardedSample[*reason]; ok { - o.discardedSample[*reason].Add(discardedSampleIncrement) + o.discardedSampleMtx.RLock() + if r, ok := o.discardedSample[*reason]; ok { + r.Add(discardedSampleIncrement) + o.discardedSampleMtx.RUnlock() } else { - o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + o.discardedSampleMtx.RUnlock() + o.discardedSampleMtx.Lock() + if r, ok := o.discardedSample[*reason]; ok { + r.Add(discardedSampleIncrement) + } else { + o.discardedSample[*reason] = atomic.NewFloat64(discardedSampleIncrement) + } + o.discardedSampleMtx.Unlock() } - o.discardedSampleMtx.Unlock() } return } @@ -257,7 +271,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa lastUpdate: *atomic.NewInt64(ts.Unix()), discardedSample: make(map[string]*atomic.Float64), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), - discardedSampleMtx: sync.Mutex{}, + discardedSampleMtx: sync.RWMutex{}, } if discardedSampleIncrement > 0 && reason != nil { From e5cda41a140083522dffcee0aad1cab3ff73d415 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 14:35:05 +0100 Subject: [PATCH 094/105] change the condition of recovered to less than maxcardinality --- pkg/costattribution/sample_tracker.go | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index ee69c5dbb0e..e90c52eacee 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -284,7 +284,7 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa func (st *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { st.observedMtx.RLock() if !st.overflowSince.IsZero() && st.overflowSince.Add(st.cooldownDuration).Before(deadline) { - if len(st.observed) <= st.maxCardinality { + if len(st.observed) < st.maxCardinality { st.observedMtx.RUnlock() return true } @@ -292,7 +292,7 @@ func (st *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { // Increase the cooldown duration if the number of observations is still above the max cardinality st.observedMtx.Lock() - if len(st.observed) <= st.maxCardinality { + if len(st.observed) < st.maxCardinality { st.observedMtx.Unlock() return true } From 41e9f477ba11a30db851e89fd8524b2dab0b39b4 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 14:50:51 +0100 Subject: [PATCH 095/105] remove useless function --- pkg/costattribution/manager.go | 5 +---- pkg/costattribution/sample_tracker.go | 16 +++++++--------- pkg/costattribution/sample_tracker_test.go | 17 ++++++++--------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 69eda444c8f..f4793257757 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -217,10 +217,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { continue } - invalidKeys := st.inactiveObservations(deadline) - for _, key := range invalidKeys { - st.cleanupTrackerAttribution(key) - } + st.cleanupInactiveObservations(deadline) // only sample tracker can recovered from overflow, the activeseries tracker after the cooldown would just be deleted and recreated if 
st.recoveredFromOverflow(deadline) { diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index e90c52eacee..2f62c03fc51 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -87,12 +87,6 @@ var bufferPool = sync.Pool{ }, } -func (st *SampleTracker) cleanupTrackerAttribution(key string) { - st.observedMtx.Lock() - defer st.observedMtx.Unlock() - delete(st.observed, key) -} - func (st *SampleTracker) Collect(out chan<- prometheus.Metric) { // We don't know the performance of out receiver, so we don't want to hold the lock for too long var prometheusMetrics []prometheus.Metric @@ -304,16 +298,20 @@ func (st *SampleTracker) recoveredFromOverflow(deadline time.Time) bool { return false } -func (st *SampleTracker) inactiveObservations(deadline time.Time) []string { +func (st *SampleTracker) cleanupInactiveObservations(deadline time.Time) { // otherwise, we need to check all observations and clean up the ones that are inactive var invalidKeys []string st.observedMtx.RLock() - defer st.observedMtx.RUnlock() for labkey, ob := range st.observed { if ob != nil && ob.lastUpdate.Load() <= deadline.Unix() { invalidKeys = append(invalidKeys, labkey) } } + st.observedMtx.RUnlock() - return invalidKeys + st.observedMtx.Lock() + for _, key := range invalidKeys { + delete(st.observed, key) + } + st.observedMtx.Unlock() } diff --git a/pkg/costattribution/sample_tracker_test.go b/pkg/costattribution/sample_tracker_test.go index 967780a8401..0ad22a1edf9 100644 --- a/pkg/costattribution/sample_tracker_test.go +++ b/pkg/costattribution/sample_tracker_test.go @@ -108,18 +108,17 @@ func TestSampleTracker_inactiveObservations(t *testing.T) { require.Len(t, st.observed, 3) // Purge observations that haven't been updated in the last 10 seconds. 
- purged := st.inactiveObservations(time.Unix(0, 0)) - require.Len(t, purged, 0) + st.cleanupInactiveObservations(time.Unix(0, 0)) + require.Len(t, st.observed, 3) - purged = st.inactiveObservations(time.Unix(10, 0)) - assert.ElementsMatch(t, []string{"foo"}, purged) + st.cleanupInactiveObservations(time.Unix(10, 0)) + assert.Len(t, st.observed, 2) - purged = st.inactiveObservations(time.Unix(15, 0)) - assert.ElementsMatch(t, []string{"foo", "bar"}, purged) + st.cleanupInactiveObservations(time.Unix(15, 0)) + assert.Len(t, st.observed, 1) - // Check that the purged observation matches the expected details. - purged = st.inactiveObservations(time.Unix(25, 0)) - assert.ElementsMatch(t, []string{"foo", "bar", "baz"}, purged) + st.cleanupInactiveObservations(time.Unix(25, 0)) + assert.Len(t, st.observed, 0) } func TestSampleTracker_Concurrency(t *testing.T) { From cb5512bd0f3e765422d0e19164de3c23369552f2 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 15:33:49 +0100 Subject: [PATCH 096/105] change overflowsince to time.time --- pkg/costattribution/active_tracker.go | 24 ++++++++++------------ pkg/costattribution/active_tracker_test.go | 14 ++++++------- pkg/costattribution/manager.go | 6 +++++- pkg/costattribution/sample_tracker.go | 5 +++-- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 603c73eaa6d..12a9be7f50b 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -22,12 +22,12 @@ type ActiveSeriesTracker struct { maxCardinality int activeSeriesPerUserAttribution *prometheus.Desc overflowLabels []string - observed map[string]*atomic.Int64 + logger log.Logger observedMtx sync.RWMutex - overflowSince atomic.Int64 + observed map[string]*atomic.Int64 + overflowSince time.Time overflowCounter atomic.Int64 cooldownDuration time.Duration - logger log.Logger } func newActiveSeriesTracker(userID string, trackedLabels 
[]string, limit int, cooldownDuration time.Duration, logger log.Logger) *ActiveSeriesTracker { @@ -80,12 +80,12 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { at.observedMtx.RUnlock() return } - at.observedMtx.RUnlock() - if at.overflowSince.Load() > 0 { + if !at.overflowSince.IsZero() { at.overflowCounter.Inc() return } + at.observedMtx.RUnlock() at.observedMtx.Lock() defer at.observedMtx.Unlock() @@ -95,13 +95,13 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { return } - if at.overflowSince.Load() > 0 { + if !at.overflowSince.IsZero() { at.overflowCounter.Inc() return } if len(at.observed) >= at.maxCardinality { - at.overflowSince.Store(now.Unix()) + at.overflowSince = now at.overflowCounter.Inc() return } @@ -138,20 +138,19 @@ func (at *ActiveSeriesTracker) Decrement(lbls labels.Labels) { } at.observedMtx.RUnlock() - if at.overflowSince.Load() > 0 { + at.observedMtx.RLock() + if !at.overflowSince.IsZero() { at.overflowCounter.Dec() return } - - at.observedMtx.RLock() defer at.observedMtx.RUnlock() panic(fmt.Errorf("decrementing non-existent active series: labels=%v, cost attribution keys: %v, the current observation map length: %d, the current cost attribution key: %s", lbls, at.labels, len(at.observed), buf.String())) } func (at *ActiveSeriesTracker) Collect(out chan<- prometheus.Metric) { - if at.overflowSince.Load() > 0 { + at.observedMtx.RLock() + if !at.overflowSince.IsZero() { var activeSeries int64 - at.observedMtx.RLock() for _, as := range at.observed { activeSeries += as.Load() } @@ -161,7 +160,6 @@ func (at *ActiveSeriesTracker) Collect(out chan<- prometheus.Metric) { } // We don't know the performance of out receiver, so we don't want to hold the lock for too long var prometheusMetrics []prometheus.Metric - at.observedMtx.RLock() for key, as := range at.observed { keys := strings.Split(key, string(sep)) keys = append(keys, at.userID) diff --git 
a/pkg/costattribution/active_tracker_test.go b/pkg/costattribution/active_tracker_test.go index 355a3ef00a3..ce99d8e4c4f 100644 --- a/pkg/costattribution/active_tracker_test.go +++ b/pkg/costattribution/active_tracker_test.go @@ -25,24 +25,24 @@ func TestActiveTracker_IncrementDecrement(t *testing.T) { lbls3 := labels.FromStrings("department", "baz", "service", "foo") ast.Increment(lbls1, time.Unix(1, 0)) - assert.Equal(t, int64(0), ast.overflowSince.Load(), "First observation, should not overflow") + assert.Equal(t, time.Unix(0, 0), ast.overflowSince, "First observation, should not overflow") assert.Equal(t, 1, len(ast.observed)) ast.Decrement(lbls1) - assert.Equal(t, int64(0), ast.overflowSince.Load(), "First observation decremented, should not overflow") + assert.Equal(t, time.Unix(0, 0), ast.overflowSince, "First observation decremented, should not overflow") assert.Equal(t, 0, len(ast.observed), "First observation decremented, should be removed since it reached 0") ast.Increment(lbls1, time.Unix(2, 0)) ast.Increment(lbls2, time.Unix(2, 0)) - assert.Equal(t, int64(0), ast.overflowSince.Load(), "Second observation, should not overflow") + assert.Equal(t, time.Unix(0, 0), ast.overflowSince, "Second observation, should not overflow") assert.Equal(t, 2, len(ast.observed)) ast.Increment(lbls3, time.Unix(3, 0)) - assert.Equal(t, int64(3), ast.overflowSince.Load(), "Third observation, should overflow") + assert.Equal(t, time.Unix(3, 0), ast.overflowSince, "Third observation, should overflow") assert.Equal(t, 2, len(ast.observed)) ast.Increment(lbls3, time.Unix(4, 0)) - assert.Equal(t, int64(3), ast.overflowSince.Load(), "Fourth observation, should stay overflow") + assert.Equal(t, time.Unix(3, 0), ast.overflowSince, "Fourth observation, should stay overflow") assert.Equal(t, 2, len(ast.observed)) } @@ -65,7 +65,7 @@ func TestActiveTracker_Concurrency(t *testing.T) { // Verify no data races or inconsistencies assert.True(t, len(ast.observed) > 0, "Observed set should 
not be empty after concurrent updates") assert.LessOrEqual(t, len(ast.observed), ast.maxCardinality, "Observed count should not exceed max cardinality") - assert.NotEqual(t, 0, ast.overflowSince.Load(), "Expected state to be Overflow") + assert.False(t, ast.overflowSince.IsZero(), "Expected state to be Overflow") expectedMetrics := ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. @@ -85,5 +85,5 @@ func TestActiveTracker_Concurrency(t *testing.T) { wg.Wait() assert.Equal(t, 0, len(ast.observed), "Observed set should be empty after all decrements") - assert.NotEqual(t, 0, ast.overflowSince.Load(), "Expected state still to be Overflow") + assert.False(t, ast.overflowSince.IsZero(), "Expected state still to be Overflow") } diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index f4793257757..59ff71fc919 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -224,9 +224,13 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { m.deleteSampleTracker(userID) } + at.observedMtx.RLock() // if the activeseries tracker has been in overflow for more than the cooldown duration, delete it - if at.overflowSince.Load() > 0 && time.Unix(at.overflowSince.Load(), 0).Add(at.cooldownDuration).Before(deadline) { + if !at.overflowSince.IsZero() && at.overflowSince.Add(at.cooldownDuration).Before(deadline) { + at.observedMtx.RUnlock() m.deleteActiveTracker(userID) + } else { + at.observedMtx.RUnlock() } } return nil diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 2f62c03fc51..efc253cc120 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -33,13 +33,13 @@ type SampleTracker struct { receivedSamplesAttribution *prometheus.Desc discardedSampleAttribution *prometheus.Desc overflowLabels []string - observed map[string]*observation + logger log.Logger 
observedMtx sync.RWMutex + observed map[string]*observation // overflowSince is also protected by observedMtx, it is set when the max cardinality is exceeded overflowSince time.Time overflowCounter observation cooldownDuration time.Duration - logger log.Logger } func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *SampleTracker { @@ -93,6 +93,7 @@ func (st *SampleTracker) Collect(out chan<- prometheus.Metric) { st.observedMtx.RLock() if !st.overflowSince.IsZero() { + st.observedMtx.RUnlock() out <- prometheus.MustNewConstMetric(st.receivedSamplesAttribution, prometheus.CounterValue, st.overflowCounter.receivedSample.Load(), st.overflowLabels[:len(st.overflowLabels)-1]...) out <- prometheus.MustNewConstMetric(st.discardedSampleAttribution, prometheus.CounterValue, st.overflowCounter.totalDiscarded.Load(), st.overflowLabels...) return From c968b95c0827fa8da940828475964a6f73b8ccb1 Mon Sep 17 00:00:00 2001 From: Ying WANG <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:34:20 +0100 Subject: [PATCH 097/105] Update pkg/costattribution/manager.go Co-authored-by: Oleg Zaytsev --- pkg/costattribution/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 59ff71fc919..5f7884f2b6d 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -213,7 +213,7 @@ func (m *Manager) purgeInactiveAttributionsUntil(deadline time.Time) error { for _, userID := range userIDs { st, at := m.updateTracker(userID) - if st == nil && at == nil { + if st == nil || at == nil { continue } From 573395529dc2afa36d1f3f2c75fd78fe6d86a62d Mon Sep 17 00:00:00 2001 From: Ying WANG <74549700+ying-jeanne@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:34:34 +0100 Subject: [PATCH 098/105] Update pkg/costattribution/sample_tracker.go Co-authored-by: Oleg Zaytsev --- pkg/costattribution/sample_tracker.go 
| 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index efc253cc120..4b70f001655 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -266,7 +266,6 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa lastUpdate: *atomic.NewInt64(ts.Unix()), discardedSample: make(map[string]*atomic.Float64), receivedSample: *atomic.NewFloat64(receivedSampleIncrement), - discardedSampleMtx: sync.RWMutex{}, } if discardedSampleIncrement > 0 && reason != nil { From 80b64dc69133148805220b4d7638daaf95389676 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 16:35:03 +0100 Subject: [PATCH 099/105] fix test --- pkg/costattribution/active_tracker.go | 24 +++++++++----- pkg/costattribution/active_tracker_test.go | 6 ++-- pkg/costattribution/sample_tracker.go | 19 ++++++----- pkg/distributor/distributor.go | 16 ++++----- pkg/distributor/validate.go | 38 +++++++++++----------- pkg/ingester/activeseries/active_series.go | 26 +++++++-------- 6 files changed, 69 insertions(+), 60 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 12a9be7f50b..a105ebe8ac1 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -18,16 +18,20 @@ import ( type ActiveSeriesTracker struct { userID string - labels []string - maxCardinality int activeSeriesPerUserAttribution *prometheus.Desc - overflowLabels []string logger log.Logger - observedMtx sync.RWMutex - observed map[string]*atomic.Int64 - overflowSince time.Time - overflowCounter atomic.Int64 - cooldownDuration time.Duration + + labels []string + overflowLabels []string + + maxCardinality int + cooldownDuration time.Duration + + observedMtx sync.RWMutex + observed map[string]*atomic.Int64 + overflowSince time.Time + + overflowCounter atomic.Int64 } func newActiveSeriesTracker(userID string, 
trackedLabels []string, limit int, cooldownDuration time.Duration, logger log.Logger) *ActiveSeriesTracker { @@ -82,6 +86,7 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { } if !at.overflowSince.IsZero() { + at.observedMtx.RUnlock() at.overflowCounter.Inc() return } @@ -139,11 +144,12 @@ func (at *ActiveSeriesTracker) Decrement(lbls labels.Labels) { at.observedMtx.RUnlock() at.observedMtx.RLock() + defer at.observedMtx.RUnlock() + if !at.overflowSince.IsZero() { at.overflowCounter.Dec() return } - defer at.observedMtx.RUnlock() panic(fmt.Errorf("decrementing non-existent active series: labels=%v, cost attribution keys: %v, the current observation map length: %d, the current cost attribution key: %s", lbls, at.labels, len(at.observed), buf.String())) } diff --git a/pkg/costattribution/active_tracker_test.go b/pkg/costattribution/active_tracker_test.go index ce99d8e4c4f..68793c390f7 100644 --- a/pkg/costattribution/active_tracker_test.go +++ b/pkg/costattribution/active_tracker_test.go @@ -25,16 +25,16 @@ func TestActiveTracker_IncrementDecrement(t *testing.T) { lbls3 := labels.FromStrings("department", "baz", "service", "foo") ast.Increment(lbls1, time.Unix(1, 0)) - assert.Equal(t, time.Unix(0, 0), ast.overflowSince, "First observation, should not overflow") + assert.True(t, ast.overflowSince.IsZero(), "First observation, should not overflow") assert.Equal(t, 1, len(ast.observed)) ast.Decrement(lbls1) - assert.Equal(t, time.Unix(0, 0), ast.overflowSince, "First observation decremented, should not overflow") + assert.True(t, ast.overflowSince.IsZero(), "First observation decremented, should not overflow") assert.Equal(t, 0, len(ast.observed), "First observation decremented, should be removed since it reached 0") ast.Increment(lbls1, time.Unix(2, 0)) ast.Increment(lbls2, time.Unix(2, 0)) - assert.Equal(t, time.Unix(0, 0), ast.overflowSince, "Second observation, should not overflow") + assert.True(t, ast.overflowSince.IsZero(), "Second 
observation, should not overflow") assert.Equal(t, 2, len(ast.observed)) ast.Increment(lbls3, time.Unix(3, 0)) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 4b70f001655..4b0c90a1216 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -28,18 +28,21 @@ type observation struct { type SampleTracker struct { userID string - labels []string - maxCardinality int receivedSamplesAttribution *prometheus.Desc discardedSampleAttribution *prometheus.Desc - overflowLabels []string logger log.Logger - observedMtx sync.RWMutex - observed map[string]*observation - // overflowSince is also protected by observedMtx, it is set when the max cardinality is exceeded - overflowSince time.Time - overflowCounter observation + + labels []string + overflowLabels []string + + maxCardinality int cooldownDuration time.Duration + + observedMtx sync.RWMutex + observed map[string]*observation + overflowSince time.Time + + overflowCounter observation } func newSampleTracker(userID string, trackedLabels []string, limit int, cooldown time.Duration, logger log.Logger) *SampleTracker { diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 2123a4cad54..0f09aa49254 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -754,9 +754,9 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese return nil } - cast := d.costAttributionMgr.SampleTracker(userID) + cat := d.costAttributionMgr.SampleTracker(userID) if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cast) + return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -770,7 +770,7 @@ func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimese } 
timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cast); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } @@ -795,9 +795,9 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim return nil } - cast := d.costAttributionMgr.SampleTracker(userID) + cat := d.costAttributionMgr.SampleTracker(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cast) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { return err } @@ -818,7 +818,7 @@ func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTim } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cast) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) if err != nil { return err } @@ -882,8 +882,8 @@ func (d *Distributor) validateExemplars(ts *mimirpb.PreallocTimeseries, userID s // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. 
func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) (bool, error) { - cast := d.costAttributionMgr.SampleTracker(userID) - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cast, nowt); err != nil { + cat := d.costAttributionMgr.SampleTracker(userID) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 34c2ab838e2..9f42acf1eba 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -239,17 +239,17 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cast *costattribution.SampleTracker) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.SampleTracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() - cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() - cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -260,23 +260,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
-func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cast *costattribution.SampleTracker) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.SampleTracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { - cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { - cast.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { - cast.IncrementDiscardedSamples(ls, 1, reasonInvalidNativeHistogramSchema, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -290,7 +290,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { - 
cast.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -298,7 +298,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { - cast.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -400,16 +400,16 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. 
-func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cast *costattribution.SampleTracker, ts time.Time) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.SampleTracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { - cast.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { - cast.IncrementDiscardedSamples(ls, 1, reasonInvalidMetricName, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } @@ -418,13 +418,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() - cast.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerInfoSeries, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() - cast.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerSeries, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return 
fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -436,22 +436,22 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() - cast.IncrementDiscardedSamples(ls, 1, reasonInvalidLabel, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { - cast.IncrementDiscardedSamples(ls, 1, reasonLabelNameTooLong, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { - cast.IncrementDiscardedSamples(ls, 1, reasonInvalidLabelValue, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonInvalidLabelValue, ts) m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, validUTF8Message(l.Value), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { - cast.IncrementDiscardedSamples(ls, 1, reasonLabelValueTooLong, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { - cast.IncrementDiscardedSamples(ls, 1, reasonDuplicateLabelNames, ts) + cat.IncrementDiscardedSamples(ls, 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/ingester/activeseries/active_series.go 
b/pkg/ingester/activeseries/active_series.go index 6165287eb40..59d6701c3ed 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -51,7 +51,7 @@ type ActiveSeries struct { matchers *asmodel.Matchers lastConfigUpdate time.Time - caat *costattribution.ActiveSeriesTracker + cat *costattribution.ActiveSeriesTracker // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -78,7 +78,7 @@ type seriesStripe struct { activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. - caat *costattribution.ActiveSeriesTracker + cat *costattribution.ActiveSeriesTracker activeSeriesAttributionFailureCounter atomic.Float64 } @@ -91,12 +91,12 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, caat *costattribution.ActiveSeriesTracker) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout, caat: caat} +func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.ActiveSeriesTracker) *ActiveSeries { + c := &ActiveSeries{matchers: asm, timeout: timeout, cat: cat} // Stripes are pre-allocated so that we only read on them and no lock is required. 
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, caat) + c.stripes[i].reinitialize(asm, &c.deleted, cat) } return c @@ -111,7 +111,7 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.ActiveSeriesTracker) bool { c.configMutex.RLock() defer c.configMutex.RUnlock() - return ctCfg.String() != c.matchers.Config().String() || caCfg != c.caat + return ctCfg.String() != c.matchers.Config().String() || caCfg != c.cat } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { @@ -119,7 +119,7 @@ func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted, c.caat) + c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm c.lastConfigUpdate = now @@ -409,7 +409,7 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } - s.caat.Increment(series, time.Unix(0, nowNanos)) + s.cat.Increment(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -444,7 +444,7 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) - s.caat = cat + s.cat = cat } func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { @@ -469,11 +469,11 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { - if s.caat != nil { + if s.cat != nil { if err := idx.Series(ref, &buf, 
nil); err != nil { s.activeSeriesAttributionFailureCounter.Add(1) } else { - s.caat.Decrement(buf.Labels()) + s.cat.Decrement(buf.Labels()) } } if entry.deleted { @@ -533,12 +533,12 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { } s.active-- - if s.caat != nil { + if s.cat != nil { buf := labels.NewScratchBuilder(128) if err := idx.Series(ref, &buf, nil); err != nil { s.activeSeriesAttributionFailureCounter.Add(1) } else { - s.caat.Decrement(buf.Labels()) + s.cat.Decrement(buf.Labels()) } } if entry.numNativeHistogramBuckets >= 0 { From 34fd8b3855dbae11396c97dc111ebc0e87ad2187 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 16:37:06 +0100 Subject: [PATCH 100/105] formatting --- pkg/costattribution/sample_tracker.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/costattribution/sample_tracker.go b/pkg/costattribution/sample_tracker.go index 4b0c90a1216..bab830f8c32 100644 --- a/pkg/costattribution/sample_tracker.go +++ b/pkg/costattribution/sample_tracker.go @@ -266,9 +266,9 @@ func (st *SampleTracker) updateObservations(key string, ts time.Time, receivedSa // create a new observation st.observed[key] = &observation{ - lastUpdate: *atomic.NewInt64(ts.Unix()), - discardedSample: make(map[string]*atomic.Float64), - receivedSample: *atomic.NewFloat64(receivedSampleIncrement), + lastUpdate: *atomic.NewInt64(ts.Unix()), + discardedSample: make(map[string]*atomic.Float64), + receivedSample: *atomic.NewFloat64(receivedSampleIncrement), } if discardedSampleIncrement > 0 && reason != nil { From 72c16eac9c0b51eea98bcbc50c992ac33d3b7e8a Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 16:43:10 +0100 Subject: [PATCH 101/105] fix dum dum --- pkg/costattribution/active_tracker.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index a105ebe8ac1..0c0a558d563 100644 --- 
a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -90,10 +90,8 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { at.overflowCounter.Inc() return } - at.observedMtx.RUnlock() - at.observedMtx.Lock() - defer at.observedMtx.Unlock() + defer at.observedMtx.RUnlock() as, ok = at.observed[string(buf.Bytes())] if ok { as.Inc() From f1da05424b559ccdd63884e58f75bda3877f7783 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 16:59:42 +0100 Subject: [PATCH 102/105] just defer --- pkg/costattribution/active_tracker.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 0c0a558d563..926c1f7d154 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -78,20 +78,18 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { at.fillKeyFromLabels(lbls, buf) at.observedMtx.RLock() + defer at.observedMtx.RUnlock() as, ok := at.observed[string(buf.Bytes())] if ok { as.Inc() - at.observedMtx.RUnlock() return } if !at.overflowSince.IsZero() { - at.observedMtx.RUnlock() at.overflowCounter.Inc() return } - defer at.observedMtx.RUnlock() as, ok = at.observed[string(buf.Bytes())] if ok { as.Inc() @@ -110,7 +108,6 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { } at.observed[string(buf.Bytes())] = atomic.NewInt64(1) - } func (at *ActiveSeriesTracker) Decrement(lbls labels.Labels) { From 673d6074efb6168672e6c44406af5634cdd559de Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 17:23:17 +0100 Subject: [PATCH 103/105] use write lock to write, sounds reasonable hum? 
--- pkg/costattribution/active_tracker.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 926c1f7d154..29ae1faf0e1 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -78,14 +78,15 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { at.fillKeyFromLabels(lbls, buf) at.observedMtx.RLock() - defer at.observedMtx.RUnlock() as, ok := at.observed[string(buf.Bytes())] if ok { as.Inc() + at.observedMtx.RUnlock() return } if !at.overflowSince.IsZero() { + at.observedMtx.RUnlock() at.overflowCounter.Inc() return } @@ -93,20 +94,26 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { as, ok = at.observed[string(buf.Bytes())] if ok { as.Inc() + at.observedMtx.RUnlock() return } if !at.overflowSince.IsZero() { + at.observedMtx.RUnlock() at.overflowCounter.Inc() return } if len(at.observed) >= at.maxCardinality { at.overflowSince = now + at.observedMtx.RUnlock() at.overflowCounter.Inc() return } + at.observedMtx.RUnlock() + at.observedMtx.Lock() + defer at.observedMtx.Unlock() at.observed[string(buf.Bytes())] = atomic.NewInt64(1) } From aa57e2f7e671a7fd17de16a54e7894707bbc1502 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Thu, 16 Jan 2025 17:49:32 +0100 Subject: [PATCH 104/105] update lock --- pkg/costattribution/active_tracker.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pkg/costattribution/active_tracker.go b/pkg/costattribution/active_tracker.go index 29ae1faf0e1..0ecd3e3c537 100644 --- a/pkg/costattribution/active_tracker.go +++ b/pkg/costattribution/active_tracker.go @@ -103,17 +103,27 @@ func (at *ActiveSeriesTracker) Increment(lbls labels.Labels, now time.Time) { at.overflowCounter.Inc() return } + at.observedMtx.RUnlock() + + at.observedMtx.Lock() + defer at.observedMtx.Unlock() + + as, ok = 
at.observed[string(buf.Bytes())] + if ok { + as.Inc() + return + } + + if !at.overflowSince.IsZero() { + at.overflowCounter.Inc() + return + } if len(at.observed) >= at.maxCardinality { at.overflowSince = now - at.observedMtx.RUnlock() at.overflowCounter.Inc() return } - at.observedMtx.RUnlock() - - at.observedMtx.Lock() - defer at.observedMtx.Unlock() at.observed[string(buf.Bytes())] = atomic.NewInt64(1) } From c5eb8c260de66a742b15c1cbb0ea2d3df2d6bf69 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Fri, 17 Jan 2025 10:05:59 +0100 Subject: [PATCH 105/105] changelog update --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6322f46112e..72675d35574 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,12 @@ ### Grafana Mimir +* [FEATURE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_distributor_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 * [CHANGE] Querier: pass context to queryable `IsApplicable` hook. #10451 * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 * [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. 
#10375 #10403 -* [CHANGE] Ingester/Distributor: Add support for exporting cost attribution metrics (`cortex_ingester_attributed_active_series`, `cortex_distributor_received_attributed_samples_total`, and `cortex_discarded_attributed_samples_total`) with labels specified by customers to a custom Prometheus registry. This feature enables more flexible billing data tracking. #10269 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_reader_last_produced_offset_requests_total`, `cortex_ingest_storage_reader_last_produced_offset_failures_total`, `cortex_ingest_storage_reader_last_produced_offset_request_duration_seconds`, `cortex_ingest_storage_reader_partition_start_offset_requests_total`, `cortex_ingest_storage_reader_partition_start_offset_failures_total`, `cortex_ingest_storage_reader_partition_start_offset_request_duration_seconds` metrics. #10462 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168