From f04c28f6eb68c74044618dd98d96a97c90e2b009 Mon Sep 17 00:00:00 2001 From: Ying WANG Date: Tue, 17 Dec 2024 21:58:35 +0100 Subject: [PATCH] refectory --- .../config/mimir.yaml | 9 +-- .../configuration-parameters/index.md | 39 ++++++++++++ pkg/costattribution/manager.go | 17 +++-- pkg/costattribution/manager_test.go | 21 ++++--- pkg/costattribution/tracker.go | 46 +++++++++----- pkg/costattribution/tracker_test.go | 13 +++- pkg/distributor/distributor.go | 47 +++++++------- pkg/distributor/validate.go | 1 + pkg/ingester/activeseries/active_series.go | 63 +++++++++---------- pkg/ingester/ingester.go | 30 ++++----- pkg/ingester/user_tsdb.go | 7 ++- 11 files changed, 178 insertions(+), 115 deletions(-) diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 31702611891..5d245999115 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,6 +1,4 @@ multitenancy_enabled: false -cost_attribution_registry_path: "/usage-metrics" -cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -186,10 +184,5 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 - cost_attribution_labels: "container" - max_cost_attribution_labels_per_user: 2 - max_cost_attribution_cardinality_per_user: 100 - cost_attribution_cooldown: 20m - runtime_config: - file: ./config/runtime.yaml \ No newline at end of file + file: ./config/runtime.yaml diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index f9a147e7bca..137e88d3fc1 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -455,6 +455,18 @@ overrides_exporter: # (experimental) Enables optimized marshaling of timeseries. # CLI flag: -timeseries-unmarshal-caching-optimization-enabled [timeseries_unmarshal_caching_optimization_enabled: | default = true] + +# (experimental) Time interval at which inactive cost attributions are evicted +# from the counter, ensuring they are not included in the cost attribution +# cardinality per user limit. +# CLI flag: -cost-attribution.eviction-interval +[cost_attribution_eviction_interval: | default = 20m] + +# (experimental) Defines a custom path for the registry. When specified, Mimir +# will expose cost attribution metrics through this custom path, if not +# specified, cost attribution metrics won't be exposed. +# CLI flag: -cost-attribution.registry-path +[cost_attribution_registry_path: | default = ""] ``` ### common @@ -3569,6 +3581,33 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -querier.active-series-results-max-size-bytes [active_series_results_max_size_bytes: | default = 419430400] +# (experimental) Defines labels for cost attribution, applied to metrics like +# cortex_distributor_attributed_received_samples_total. Set to an empty string +# to disable. Example: 'team,service' will produce metrics such as +# cortex_distributor_attributed_received_samples_total{team='frontend', +# service='api'}. +# CLI flag: -validation.cost-attribution-labels +[cost_attribution_labels: | default = ""] + +# (experimental) Maximum number of cost attribution labels allowed per user. +# CLI flag: -validation.max-cost-attribution-labels-per-user +[max_cost_attribution_labels_per_user: | default = 2] + +# (experimental) Maximum cardinality of cost attribution labels allowed per +# user. +# CLI flag: -validation.max-cost-attribution-cardinality-per-user +[max_cost_attribution_cardinality_per_user: | default = 10000] + +# (experimental) Cooldown period for cost attribution labels. Specifies the +# duration the cost attribution remains in overflow before attempting a reset. +# If the cardinality remains above the limit after this period, the system will +# stay in overflow mode and extend the cooldown. Setting this value to 0 +# disables the cooldown, causing the system to continuously check whether the +# cardinality has dropped below the limit. A reset will occur once the +# cardinality falls below the limit. +# CLI flag: -validation.cost-attribution-cooldown +[cost_attribution_cooldown: | default = 0s] + # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. # CLI flag: -ruler.evaluation-delay-duration diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go index 0c60ed54505..b39da15689b 100644 --- a/pkg/costattribution/manager.go +++ b/pkg/costattribution/manager.go @@ -157,16 +157,15 @@ func (m *Manager) inactiveObservationsForUser(userID string, deadline int64) []s m.trackersByUserID[userID] = cat m.mtx.Unlock() return nil - } else { - maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) - if cat.MaxCardinality() != maxCardinality { - cat.UpdateMaxCardinality(maxCardinality) - } + } + maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID) + if cat.MaxCardinality() != maxCardinality { + cat.UpdateMaxCardinality(maxCardinality) + } - cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) - if cooldown != cat.CooldownDuration() { - cat.UpdateCooldownDuration(cooldown) - } + cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds()) + if cooldown != cat.CooldownDuration() { + cat.UpdateCooldownDuration(cooldown) } return cat.InactiveObservations(deadline) diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go index 1e67704b287..4fbf42fe35a 100644 --- a/pkg/costattribution/manager_test.go +++ b/pkg/costattribution/manager_test.go @@ -93,7 +93,8 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Purge inactive attributions", func(t *testing.T) { - manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + err := manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + assert.NoError(t, err) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. # TYPE cortex_discarded_attributed_samples_total counter @@ -103,8 +104,10 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Disabling user cost attribution", func(t *testing.T) { - manager.limits, _ = getMockLimits(1) - manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + var err error + manager.limits, err = getMockLimits(1) + assert.NoError(t, err) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) expectedMetrics := ` @@ -116,8 +119,10 @@ func Test_CreateDeleteTracker(t *testing.T) { }) t.Run("Updating user cardinality and labels", func(t *testing.T) { - manager.limits, _ = getMockLimits(2) - manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix()) + var err error + manager.limits, err = getMockLimits(2) + assert.NoError(t, err) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix())) assert.Equal(t, 1, len(manager.trackersByUserID)) assert.True(t, manager.TrackerForUser("user3").CompareCALabels([]string{"feature", "team"})) @@ -151,7 +156,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) t.Run("Purge before inactive timeout", func(t *testing.T) { - manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix()) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix())) assert.Equal(t, 2, len(manager.trackersByUserID)) expectedMetrics := ` @@ -166,7 +171,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge after inactive timeout", func(t *testing.T) { // disable cost attribution for user1 to test purging manager.limits, _ = getMockLimits(1) - manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix()) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix())) // User3's tracker should remain since it's active, user1's tracker should be removed assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") @@ -182,7 +187,7 @@ func Test_PurgeInactiveAttributionsUntil(t *testing.T) { t.Run("Purge all trackers", func(t *testing.T) { // Trigger a purge that should remove all inactive trackers - manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix()) + assert.NoError(t, manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix())) // Tracker would stay at 1 since user1's tracker is disabled assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go index 0a232195848..8f245c1a6e1 100644 --- a/pkg/costattribution/tracker.go +++ b/pkg/costattribution/tracker.go @@ -42,6 +42,7 @@ type Tracker struct { activeSeriesPerUserAttribution *prometheus.Desc receivedSamplesAttribution *prometheus.Desc discardedSampleAttribution *prometheus.Desc + failedActiveSeriesDecrement *prometheus.Desc overflowLabels []string obseveredMtx sync.RWMutex observed map[string]*Observation @@ -49,6 +50,7 @@ type Tracker struct { state TrackerState overflowCounter *Observation cooldownUntil *atomic.Int64 + totalFailedActiveSeries *atomic.Float64 cooldownDuration int64 logger log.Logger } @@ -70,15 +72,16 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. overflowLabels[len(trackedLabels)+1] = overflowValue tracker := &Tracker{ - userID: userID, - caLabels: trackedLabels, - caLabelMap: caLabelMap, - maxCardinality: limit, - observed: make(map[string]*Observation), - hashBuffer: make([]byte, 0, 1024), - cooldownDuration: int64(cooldown.Seconds()), - logger: logger, - overflowLabels: overflowLabels, + userID: userID, + caLabels: trackedLabels, + caLabelMap: caLabelMap, + maxCardinality: limit, + observed: make(map[string]*Observation), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64(cooldown.Seconds()), + logger: logger, + overflowLabels: overflowLabels, + totalFailedActiveSeries: atomic.NewFloat64(0), } tracker.discardedSampleAttribution = prometheus.NewDesc("cortex_discarded_attributed_samples_total", @@ -94,7 +97,9 @@ func newTracker(userID string, trackedLabels []string, limit int, cooldown time. tracker.activeSeriesPerUserAttribution = prometheus.NewDesc("cortex_ingester_attributed_active_series", "The total number of active series per user and attribution.", append(trackedLabels, TenantLabel), prometheus.Labels{TrackerLabel: defaultTrackerName}) - + tracker.failedActiveSeriesDecrement = prometheus.NewDesc("cortex_ingester_attributed_active_series_failure", + "The total number of failed active series decrement per user and tracker.", []string{TenantLabel}, + prometheus.Labels{TrackerLabel: defaultTrackerName}) return tracker } @@ -149,11 +154,11 @@ func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil) } -func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels) { if t == nil { return } - t.updateCounters(lbs, now.Unix(), -1, 0, 0, nil) + t.updateCounters(lbs, -1, -1, 0, 0, nil) } func (t *Tracker) Collect(out chan<- prometheus.Metric) { @@ -182,6 +187,9 @@ func (t *Tracker) Collect(out chan<- prometheus.Metric) { o.discardSamplemtx.Unlock() } } + if t.totalFailedActiveSeries.Load() > 0 { + out <- prometheus.MustNewConstMetric(t.failedActiveSeriesDecrement, prometheus.CounterValue, t.totalFailedActiveSeries.Load(), t.userID) + } } func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { @@ -198,6 +206,13 @@ func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now t.updateCounters(lbs, now.Unix(), 0, value, 0, nil) } +func (t *Tracker) IncrementActiveSeriesFailure(value float64) { + if t == nil { + return + } + t.totalFailedActiveSeries.Add(value) +} + func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement float64, reason *string) { labelValues := make([]string, len(t.caLabels)) lbls.Range(func(l labels.Label) { @@ -248,8 +263,11 @@ func (t *Tracker) handleObservation(stream string, ts int64, activeSeriesIncreme o.discardSamplemtx.Unlock() } } else if len(t.observed) < t.maxCardinality*2 { - // Create a new observation for the stream - t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + // If the ts is negative, it means that the method is called from DecrementActiveSeries, when key doesn't exist we should ignore the call + // Otherwise create a new observation for the stream + if ts >= 0 { + t.createNewObservation(stream, ts, activeSeriesIncrement, receivedSampleIncrement, discardedSampleIncrement, reason) + } } } diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go index 82de4e8b64c..bc08b5ccb27 100644 --- a/pkg/costattribution/tracker_test.go +++ b/pkg/costattribution/tracker_test.go @@ -35,11 +35,11 @@ func Test_CreateCleanupTracker(t *testing.T) { cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) - cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3"), time.Unix(3, 0)) + cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3")) cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) - cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + cat.IncrementActiveSeriesFailure(1) expectedMetrics := ` # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. @@ -49,6 +49,9 @@ func Test_CreateCleanupTracker(t *testing.T) { # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_ingester_attributed_active_series_failure The total number of failed active series decrement per user and tracker. + # TYPE cortex_ingester_attributed_active_series_failure counter + cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. # TYPE cortex_received_attributed_samples_total counter cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 @@ -58,15 +61,19 @@ func Test_CreateCleanupTracker(t *testing.T) { "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total", "cortex_ingester_attributed_active_series", + "cortex_ingester_attributed_active_series_failure", } assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) assert.Equal(t, []string{"foo"}, cat.InactiveObservations(5)) - tManager.purgeInactiveAttributionsUntil(5) + assert.NoError(t, tManager.purgeInactiveAttributionsUntil(5)) expectedMetrics = ` # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. # TYPE cortex_ingester_attributed_active_series gauge cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_ingester_attributed_active_series_failure The total number of failed active series decrement per user and tracker. + # TYPE cortex_ingester_attributed_active_series_failure counter + cortex_ingester_attributed_active_series_failure{tenant="user4",tracker="cost-attribution"} 1 ` assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) tManager.deleteUserTracker("user4") diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 3594123435d..a14bf4b52f4 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -34,6 +34,19 @@ import ( "github.com/grafana/dskit/services" "github.com/grafana/dskit/tenant" "github.com/grafana/dskit/user" + "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go/ext" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "github.com/prometheus/prometheus/scrape" + "go.uber.org/atomic" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/cardinality" "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" @@ -48,18 +61,6 @@ import ( "github.com/grafana/mimir/pkg/util/pool" "github.com/grafana/mimir/pkg/util/spanlogger" "github.com/grafana/mimir/pkg/util/validation" - "github.com/opentracing/opentracing-go" - "github.com/opentracing/opentracing-go/ext" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/model/relabel" - "github.com/prometheus/prometheus/scrape" - "go.uber.org/atomic" - "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" ) func init() { @@ -745,15 +746,14 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // May alter timeseries data in-place. // The returned error may retain the series labels. -func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { +func (d *Distributor) validateSamples(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Samples) == 0 { return nil } cat := d.costAttributionMgr.TrackerForUser(userID) - if len(ts.Samples) == 1 { - return validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) + return validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, ts.Samples[0], cat) } timestamps := make(map[int64]struct{}, min(len(ts.Samples), 100)) @@ -767,7 +767,7 @@ func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimes } timestamps[s.TimestampMs] = struct{}{} - if err := validateSample(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, s, cat); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } @@ -787,14 +787,14 @@ func (d *Distributor) validateSamples(tnow model.Time, ts *mimirpb.PreallocTimes // Returns an error explaining the first validation finding. // May alter timeseries data in-place. // The returned error may retain the series labels. -func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { +func (d *Distributor) validateHistograms(now model.Time, ts *mimirpb.PreallocTimeseries, userID, group string) error { if len(ts.Histograms) == 0 { return nil } cat := d.costAttributionMgr.TrackerForUser(userID) if len(ts.Histograms) == 1 { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[0], cat) if err != nil { return err } @@ -807,7 +807,6 @@ func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTi timestamps := make(map[int64]struct{}, min(len(ts.Histograms), 100)) currPos := 0 histogramsUpdated := false - for idx := range ts.Histograms { if _, ok := timestamps[ts.Histograms[idx].Timestamp]; ok { // A sample with the same timestamp has already been validated, so we skip it. @@ -816,7 +815,7 @@ func (d *Distributor) validateHistograms(tnow model.Time, ts *mimirpb.PreallocTi } timestamps[ts.Histograms[idx].Timestamp] = struct{}{} - updated, err := validateSampleHistogram(d.sampleValidationMetrics, tnow, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[idx], cat) if err != nil { return err } @@ -884,6 +883,7 @@ func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeser if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return true, err } + now := model.TimeFromUnixNano(nowt.UnixNano()) totalSamplesAndHistograms := len(ts.Samples) + len(ts.Histograms) @@ -973,8 +973,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - tnow := time.Now() - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), tnow) + now := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), now) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -988,7 +988,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) - d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, tnow) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1829,7 +1829,6 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int - for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index 8b9849ba730..5b6775cdf9f 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -443,6 +443,7 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabelValue, ts) m.invalidLabelValue.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, strings.ToValidUTF8(l.Value, ""), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index e7895404a22..6c06a62e162 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -49,9 +49,10 @@ type ActiveSeries struct { // configMutex protects matchers and lastMatchersUpdate. it used by both matchers and cat configMutex sync.RWMutex matchers *asmodel.Matchers - cat *costattribution.Tracker lastConfigUpdate time.Time + cat *costattribution.Tracker + // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. timeout time.Duration @@ -67,7 +68,6 @@ type seriesStripe struct { // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). oldestEntryTs atomic.Int64 - cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -76,6 +76,8 @@ type seriesStripe struct { activeMatchingNativeHistograms []uint32 // Number of active entries (only native histograms) in this stripe matching each matcher of the configured Matchers. activeNativeHistogramBuckets uint32 // Number of buckets in active native histogram entries in this stripe. Only decreased during purge or clear. activeMatchingNativeHistogramBuckets []uint32 // Number of buckets in active native histogram entries in this stripe matching each matcher of the configured Matchers. + + cat *costattribution.Tracker } // seriesEntry holds a timestamp for single series. @@ -87,14 +89,8 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries( - asm *asmodel.Matchers, - timeout time.Duration, - cat *costattribution.Tracker, -) *ActiveSeries { - c := &ActiveSeries{ - matchers: asm, timeout: timeout, cat: cat, - } +func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration, cat *costattribution.Tracker) *ActiveSeries { + c := &ActiveSeries{matchers: asm, timeout: timeout, cat: cat} // Stripes are pre-allocated so that we only read on them and no lock is required. for i := 0; i < numStripes; i++ { @@ -112,8 +108,7 @@ func (c *ActiveSeries) CurrentMatcherNames() []string { func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { currentCTC, currentCAT := c.CurrentConfig() - // TODO: I think here to check the pointer is not equal is already enough, if we recreate tracker, it is for a good reason, otherwise, nothing changed - return ctCfg.String() != currentCTC.String() || caCfg != currentCAT //|| !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) + return ctCfg.String() != currentCTC.String() || caCfg != currentCAT } func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { @@ -137,6 +132,7 @@ func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattri // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes + created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { @@ -408,8 +404,6 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } - // here if we have a cost attribution label, we can split the serie count based on the value of the label - // we also set the reference to the value of the label in the entry, so when remove, we can decrease the counter accordingly s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true @@ -432,11 +426,7 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. -func (s *seriesStripe) reinitialize( - asm *asmodel.Matchers, - deleted *deletedSeries, - cat *costattribution.Tracker, -) { +func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries, cat *costattribution.Tracker) { s.mu.Lock() defer s.mu.Unlock() s.deleted = deleted @@ -474,17 +464,20 @@ func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { + // cost attribution is enabled, if it's not nil, we need to decrement the active series count, otherwise means received error when get idx, + // we need to increment the active series failure count. + if s.cat != nil { + if idx == nil { + s.cat.IncrementActiveSeriesFailure(1) + } else if err := idx.Series(ref, &buf, nil); err != nil { + s.cat.IncrementActiveSeriesFailure(1) + } else { + s.cat.DecrementActiveSeries(buf.Labels()) + } + } if entry.deleted { s.deleted.purge(ref) } - - if idx != nil { - if err := idx.Series(ref, &buf, nil); err != nil { - //TODO: think about what to do here - _ = err - } - s.cat.DecrementActiveSeries(buf.Labels(), keepUntil) - } delete(s.refs, ref) continue } @@ -532,13 +525,17 @@ func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { } s.active-- - if idx != nil { - buf := labels.NewScratchBuilder(10) - if err := idx.Series(ref, &buf, nil); err != nil { - //TODO: think about what to do here - _ = err + if s.cat != nil { + if idx == nil { + s.cat.IncrementActiveSeriesFailure(1) + } else { + buf := labels.NewScratchBuilder(128) + if err := idx.Series(ref, &buf, nil); err != nil { + s.cat.IncrementActiveSeriesFailure(1) + } else { + s.cat.DecrementActiveSeries(buf.Labels()) + } } - s.cat.DecrementActiveSeries(buf.Labels(), time.Now()) } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 2b3561a3530..763ce527c5c 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -371,9 +371,8 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), - + tsdbs: make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), @@ -793,7 +792,12 @@ func (i *Ingester) updateActiveSeries(now time.Time) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - idx, _ := userDB.Head().Index() + // If the userDB idx is unavailable, pass nil pointer to Purge methode, and record it as a failure in metrics when decrementing active series. + idx, err := userDB.Head().Index() + if err != nil { + level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) + idx = nil + } valid := userDB.activeSeries.Purge(now, idx) if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. @@ -1167,6 +1171,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Note that we don't .Finish() the span in this method on purpose spanlog := spanlogger.FromContext(ctx, i.logger) spanlog.DebugLog("event", "acquired append lock") + var ( startAppend = time.Now() @@ -1411,8 +1416,10 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var nonCopiedLabels labels.Labels // idx is used to decrease active series count in case of error for cost attribution. - idx, _ := i.getTSDB(userID).Head().Index() - // TODO: deal with the error here + idx, err := i.getTSDB(userID).Head().Index() + if err != nil { + idx = nil + } for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request @@ -1429,7 +1436,6 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre allOutOfBoundsHistograms(ts.Histograms, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) - stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 @@ -2666,12 +2672,8 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD } userDB := &userTSDB{ - userID: userID, - activeSeries: activeseries.NewActiveSeries( - asmodel.NewMatchers(matchersConfig), - i.cfg.ActiveSeriesMetrics.IdleTimeout, - i.costAttributionMgr.TrackerForUser(userID), - ), + userID: userID, + activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout, i.costAttributionMgr.TrackerForUser(userID)), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3274,7 +3276,7 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now idx, err := db.Head().Index() if err != nil { level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) - continue + idx = nil } db.activeSeries.Purge(now, idx) diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 2f31f41892e..61c1aa244ee 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,8 +619,11 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 - idx, _ := u.Head().Index() - // TODO: deal with the err here + idx, err := u.Head().Index() + if err != nil { + idx = nil + } + u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) {