From fb55d8edddc8d68166dd0734660c18e335f0d87a Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Mon, 16 Dec 2024 15:02:45 -0500 Subject: [PATCH 1/4] feat(ruler): add flags for the query frontend retries Currently, Mimir is hardcoded at 3 retries and 100ms as a minimum retry time. Could be useful to configure this --- CHANGELOG.md | 1 + pkg/mimir/modules.go | 8 +++++++- pkg/ruler/remotequerier.go | 20 ++++++++++++-------- pkg/ruler/remotequerier_test.go | 33 ++++++++++++++++++++------------- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e83f0e4fdd..35e12a0eaf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 +* [ENHANCEMENT] Ruler: Add flags to configure the retry behavior when forwarding queries to the query frontend: `-ruler.query-frontend.max-retries`, `-ruler.query-frontend.min-retry-backoff` and `-ruler.query-frontend.max-retry-backoff`. #10259 * [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185 * [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154 diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 697501af98f..3a9588cf6b5 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -15,6 +15,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/backoff" "github.com/grafana/dskit/dns" httpgrpc_server "github.com/grafana/dskit/httpgrpc/server" "github.com/grafana/dskit/kv" @@ -852,7 +853,12 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { if err != nil { return nil, err } - remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) + retryConfig := backoff.Config{ + MinBackoff: t.Cfg.Ruler.QueryFrontend.MinRetryBackoff, + MaxBackoff: t.Cfg.Ruler.QueryFrontend.MaxRetryBackoff, + MaxRetries: t.Cfg.Ruler.QueryFrontend.MaxRetries, + } + remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, retryConfig, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) embeddedQueryable = prom_remote.NewSampleAndChunkQueryableClient( remoteQuerier, diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go index 57a13f7ef6e..1ec5497940a 100644 --- a/pkg/ruler/remotequerier.go +++ b/pkg/ruler/remotequerier.go @@ -50,8 +50,6 @@ const ( statusError = "error" - maxRequestRetries = 3 - formatJSON = "json" formatProtobuf = "protobuf" ) @@ -63,6 +61,11 @@ type QueryFrontendConfig struct { // Address is the address of the query-frontend to connect to. Address string `yaml:"address"` + // Retry configuration. + MaxRetries int `yaml:"max_retries"` + MinRetryBackoff time.Duration `yaml:"min_retry_backoff"` + MaxRetryBackoff time.Duration `yaml:"max_retry_backoff"` + // GRPCClientConfig contains gRPC specific config options. GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."` @@ -75,6 +78,9 @@ func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { "", "GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) "+ "to enable client side load balancing.") + f.IntVar(&c.MaxRetries, "ruler.query-frontend.max-retries", 3, "Maximum number of retries for a single request.") + f.DurationVar(&c.MinRetryBackoff, "ruler.query-frontend.min-retry-backoff", 100*time.Millisecond, "Minimum backoff duration for retries.") + f.DurationVar(&c.MaxRetryBackoff, "ruler.query-frontend.max-retry-backoff", 2*time.Second, "Maximum backoff duration for retries.") c.GRPCClientConfig.CustomCompressors = []string{s2.Name} c.GRPCClientConfig.RegisterFlagsWithPrefix("ruler.query-frontend.grpc-client-config", f) @@ -115,6 +121,7 @@ type Middleware func(ctx context.Context, req *httpgrpc.HTTPRequest) error // RemoteQuerier executes read operations against a httpgrpc.HTTPClient. type RemoteQuerier struct { client httpgrpc.HTTPClient + retryConfig backoff.Config timeout time.Duration middlewares []Middleware promHTTPPrefix string @@ -129,6 +136,7 @@ var protobufDecoderInstance = protobufDecoder{} // NewRemoteQuerier creates and initializes a new RemoteQuerier instance. func NewRemoteQuerier( client httpgrpc.HTTPClient, + retryConfig backoff.Config, timeout time.Duration, preferredQueryResultResponseFormat string, prometheusHTTPPrefix string, @@ -137,6 +145,7 @@ func NewRemoteQuerier( ) *RemoteQuerier { return &RemoteQuerier{ client: client, + retryConfig: retryConfig, timeout: timeout, middlewares: middlewares, promHTTPPrefix: prometheusHTTPPrefix, @@ -309,12 +318,7 @@ func (q *RemoteQuerier) createRequest(ctx context.Context, query string, ts time func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPRequest, logger log.Logger) (*httpgrpc.HTTPResponse, error) { // Ongoing request may be cancelled during evaluation due to some transient error or server shutdown, // so we'll keep retrying until we get a successful response or backoff is terminated. - retryConfig := backoff.Config{ - MinBackoff: 100 * time.Millisecond, - MaxBackoff: 2 * time.Second, - MaxRetries: maxRequestRetries, - } - retry := backoff.New(ctx, retryConfig) + retry := backoff.New(ctx, q.retryConfig) for { resp, err := q.client.Handle(ctx, req) diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go index 1105bb77093..fdfea416b52 100644 --- a/pkg/ruler/remotequerier_test.go +++ b/pkg/ruler/remotequerier_test.go @@ -15,6 +15,7 @@ import ( "github.com/gogo/protobuf/proto" "github.com/gogo/status" "github.com/golang/snappy" + "github.com/grafana/dskit/backoff" "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/httpgrpc" "github.com/prometheus/prometheus/model/histogram" @@ -30,6 +31,12 @@ import ( "github.com/grafana/mimir/pkg/querier/api" ) +var retryConfig = backoff.Config{ + MinBackoff: 100 * time.Millisecond, + MaxBackoff: 1 * time.Second, + MaxRetries: 3, +} + type mockHTTPGRPCClient func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) func (c mockHTTPGRPCClient) Handle(ctx context.Context, req *httpgrpc.HTTPRequest, opts ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) { @@ -64,7 +71,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should issue a remote read request", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.NoError(t, err) @@ -76,7 +83,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.NoError(t, err) @@ -86,7 +93,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong) _, err := q.Read(ctx, &prompb.Query{}, false) @@ -101,7 +108,7 @@ func TestRemoteQuerier_ReadReqTimeout(t *testing.T) { <-ctx.Done() return nil, ctx.Err() } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Second, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.Error(t, err) @@ -139,7 +146,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run(fmt.Sprintf("format = %s", format), func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, format, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, retryConfig, time.Minute, format, "/prometheus", log.NewNopLogger()) _, err := q.Query(context.Background(), "qs", tm) require.NoError(t, err) @@ -165,7 +172,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Query(context.Background(), "qs", tm) require.NoError(t, err) @@ -175,7 +182,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong) _, err := q.Query(ctx, "qs", tm) @@ -276,7 +283,7 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) { } return testCase.response, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) require.Equal(t, int64(0), count.Load()) _, err := q.Query(ctx, "qs", time.Now()) if testCase.err == nil { @@ -405,7 +412,7 @@ func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) { Body: []byte(scenario.body), }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) actual, err := q.Query(context.Background(), "qs", tm) @@ -678,7 +685,7 @@ func TestRemoteQuerier_QueryProtobufDecoding(t *testing.T) { Body: b, }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) actual, err := q.Query(context.Background(), "qs", tm) @@ -701,7 +708,7 @@ func TestRemoteQuerier_QueryUnknownResponseContentType(t *testing.T) { Body: []byte("some body content"), }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) _, err := q.Query(context.Background(), "qs", tm) @@ -713,7 +720,7 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) { <-ctx.Done() return nil, ctx.Err() } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Second, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) _, err := q.Query(context.Background(), "qs", tm) @@ -771,7 +778,7 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) { return testCase.resp, testCase.err } logger := newLoggerWithCounter() - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", logger) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), retryConfig, time.Minute, formatJSON, "/prometheus", logger) tm := time.Unix(1649092025, 515834) From 74056e44b1099260ef01e5aeb1dd7cd79e21aa3b Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Mon, 16 Dec 2024 15:12:59 -0500 Subject: [PATCH 2/4] make doc --- .../configure/configuration-parameters/index.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index f9a147e7bca..20c5532ebbb 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2096,6 +2096,18 @@ query_frontend: # CLI flag: -ruler.query-frontend.address [address: | default = ""] + # Maximum number of retries for a single request. + # CLI flag: -ruler.query-frontend.max-retries + [max_retries: | default = 3] + + # Minimum backoff duration for retries. + # CLI flag: -ruler.query-frontend.min-retry-backoff + [min_retry_backoff: | default = 100ms] + + # Maximum backoff duration for retries. + # CLI flag: -ruler.query-frontend.max-retry-backoff + [max_retry_backoff: | default = 2s] + # Configures the gRPC client used to communicate between the rulers and # query-frontends. # The CLI flags prefix for this block configuration is: From 597ddcc29675a97ff5ebc66b2e58ed97c6bcb2c4 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Mon, 16 Dec 2024 15:26:40 -0500 Subject: [PATCH 3/4] make reference-help --- cmd/mimir/config-descriptor.json | 30 ++++++++++++++++++++++++++++++ cmd/mimir/help-all.txt.tmpl | 6 ++++++ cmd/mimir/help.txt.tmpl | 6 ++++++ 3 files changed, 42 insertions(+) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 3ac7f5c294b..85e674ed904 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -12956,6 +12956,36 @@ "fieldFlag": "ruler.query-frontend.address", "fieldType": "string" }, + { + "kind": "field", + "name": "max_retries", + "required": false, + "desc": "Maximum number of retries for a single request.", + "fieldValue": null, + "fieldDefaultValue": 3, + "fieldFlag": "ruler.query-frontend.max-retries", + "fieldType": "int" + }, + { + "kind": "field", + "name": "min_retry_backoff", + "required": false, + "desc": "Minimum backoff duration for retries.", + "fieldValue": null, + "fieldDefaultValue": 100000000, + "fieldFlag": "ruler.query-frontend.min-retry-backoff", + "fieldType": "duration" + }, + { + "kind": "field", + "name": "max_retry_backoff", + "required": false, + "desc": "Maximum backoff duration for retries.", + "fieldValue": null, + "fieldDefaultValue": 2000000000, + "fieldFlag": "ruler.query-frontend.max-retry-backoff", + "fieldType": "duration" + }, { "kind": "block", "name": "grpc_client_config", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 08bc71314d3..80bbf5a9853 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2987,6 +2987,12 @@ Usage of ./cmd/mimir/mimir: Override the default minimum TLS version. Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13 -ruler.query-frontend.grpc-client-config.tls-server-name string Override the expected name on the server certificate. + -ruler.query-frontend.max-retries int + Maximum number of retries for a single request. (default 3) + -ruler.query-frontend.max-retry-backoff duration + Maximum backoff duration for retries. (default 2s) + -ruler.query-frontend.min-retry-backoff duration + Minimum backoff duration for retries. (default 100ms) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.query-stats-enabled diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index f25ae81834b..5d1f58386bf 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -737,6 +737,12 @@ Usage of ./cmd/mimir/mimir: Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {}) -ruler.query-frontend.address string GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing. + -ruler.query-frontend.max-retries int + Maximum number of retries for a single request. (default 3) + -ruler.query-frontend.max-retry-backoff duration + Maximum backoff duration for retries. (default 2s) + -ruler.query-frontend.min-retry-backoff duration + Minimum backoff duration for retries. (default 100ms) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.recording-rules-evaluation-enabled From cc39266965401ecce23bbddbb0dd7a129d32d793 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Tue, 17 Dec 2024 09:27:10 -0500 Subject: [PATCH 4/4] Experimental --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 9 ++++++--- cmd/mimir/help-all.txt.tmpl | 6 +++--- cmd/mimir/help.txt.tmpl | 6 ------ .../mimir/configure/configuration-parameters/index.md | 6 +++--- pkg/ruler/remotequerier.go | 7 ++++--- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35e12a0eaf5..6f038a9cc31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 -* [ENHANCEMENT] Ruler: Add flags to configure the retry behavior when forwarding queries to the query frontend: `-ruler.query-frontend.max-retries`, `-ruler.query-frontend.min-retry-backoff` and `-ruler.query-frontend.max-retry-backoff`. #10259 +* [ENHANCEMENT] Ruler: Add experimental flags to configure the retry behavior when forwarding queries to the query frontend: `-ruler.query-frontend.max-retries`, `-ruler.query-frontend.min-retry-backoff` and `-ruler.query-frontend.max-retry-backoff`. #10259 * [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185 * [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 85e674ed904..6f6c7e63f7a 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -12964,7 +12964,8 @@ "fieldValue": null, "fieldDefaultValue": 3, "fieldFlag": "ruler.query-frontend.max-retries", - "fieldType": "int" + "fieldType": "int", + "fieldCategory": "experimental" }, { "kind": "field", @@ -12974,7 +12975,8 @@ "fieldValue": null, "fieldDefaultValue": 100000000, "fieldFlag": "ruler.query-frontend.min-retry-backoff", - "fieldType": "duration" + "fieldType": "duration", + "fieldCategory": "experimental" }, { "kind": "field", @@ -12984,7 +12986,8 @@ "fieldValue": null, "fieldDefaultValue": 2000000000, "fieldFlag": "ruler.query-frontend.max-retry-backoff", - "fieldType": "duration" + "fieldType": "duration", + "fieldCategory": "experimental" }, { "kind": "block", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 80bbf5a9853..37d32ed020b 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2988,11 +2988,11 @@ Usage of ./cmd/mimir/mimir: -ruler.query-frontend.grpc-client-config.tls-server-name string Override the expected name on the server certificate. -ruler.query-frontend.max-retries int - Maximum number of retries for a single request. (default 3) + [experimental] Maximum number of retries for a single request. (default 3) -ruler.query-frontend.max-retry-backoff duration - Maximum backoff duration for retries. (default 2s) + [experimental] Maximum backoff duration for retries. (default 2s) -ruler.query-frontend.min-retry-backoff duration - Minimum backoff duration for retries. (default 100ms) + [experimental] Minimum backoff duration for retries. (default 100ms) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.query-stats-enabled diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index 5d1f58386bf..f25ae81834b 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -737,12 +737,6 @@ Usage of ./cmd/mimir/mimir: Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {}) -ruler.query-frontend.address string GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing. - -ruler.query-frontend.max-retries int - Maximum number of retries for a single request. (default 3) - -ruler.query-frontend.max-retry-backoff duration - Maximum backoff duration for retries. (default 2s) - -ruler.query-frontend.min-retry-backoff duration - Minimum backoff duration for retries. (default 100ms) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.recording-rules-evaluation-enabled diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 20c5532ebbb..26c7e3f94fb 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2096,15 +2096,15 @@ query_frontend: # CLI flag: -ruler.query-frontend.address [address: | default = ""] - # Maximum number of retries for a single request. + # (experimental) Maximum number of retries for a single request. # CLI flag: -ruler.query-frontend.max-retries [max_retries: | default = 3] - # Minimum backoff duration for retries. + # (experimental) Minimum backoff duration for retries. # CLI flag: -ruler.query-frontend.min-retry-backoff [min_retry_backoff: | default = 100ms] - # Maximum backoff duration for retries. + # (experimental) Maximum backoff duration for retries. # CLI flag: -ruler.query-frontend.max-retry-backoff [max_retry_backoff: | default = 2s] diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go index 1ec5497940a..22ac240facc 100644 --- a/pkg/ruler/remotequerier.go +++ b/pkg/ruler/remotequerier.go @@ -62,9 +62,9 @@ type QueryFrontendConfig struct { Address string `yaml:"address"` // Retry configuration. - MaxRetries int `yaml:"max_retries"` - MinRetryBackoff time.Duration `yaml:"min_retry_backoff"` - MaxRetryBackoff time.Duration `yaml:"max_retry_backoff"` + MaxRetries int `yaml:"max_retries" category:"experimental"` + MinRetryBackoff time.Duration `yaml:"min_retry_backoff" category:"experimental"` + MaxRetryBackoff time.Duration `yaml:"max_retry_backoff" category:"experimental"` // GRPCClientConfig contains gRPC specific config options. GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."` @@ -322,6 +322,7 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque for { resp, err := q.client.Handle(ctx, req) + if err == nil { // Responses with status codes 4xx should always be considered erroneous. // These errors shouldn't be retried because it is expected that