Skip to content

Commit

Permalink
Merge branch 'main' into julienduchesne/ruler-retries
Browse files Browse the repository at this point in the history
  • Loading branch information
julienduchesne authored Dec 17, 2024
2 parents cc39266 + 3dc5104 commit 21edd1f
Show file tree
Hide file tree
Showing 40 changed files with 637 additions and 470 deletions.
26 changes: 0 additions & 26 deletions .github/workflows/allowlist.json

This file was deleted.

83 changes: 0 additions & 83 deletions .github/workflows/dependabot_reviewer.yml

This file was deleted.

36 changes: 36 additions & 0 deletions .github/workflows/grafanabot_reviewer.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: Auto-review Grafanabot PRs
on: pull_request_target

permissions:
pull-requests: write
contents: write

jobs:
dependabot-reviewer:
runs-on: ubuntu-latest

if: ${{ github.event.pull_request.user.login == 'grafanabot' }}

steps:
- name: Checkout Repository
uses: actions/checkout@v4

- name: Approve and auto-merge
id: auto-merge
if: contains(github.event.pull_request.head.ref, 'helm-chart-weekly-')
run: |
gh pr merge --auto --squash "$PR_URL"
gh pr review $PR_URL \
--approve -b "**I'm approving** this pull request, since it is a helm release."
env:
PR_URL: ${{github.event.pull_request.html_url}}
GITHUB_TOKEN: ${{secrets.GH_BOT_ACCESS_TOKEN}}

- name: Manual review is required
if: steps.auto-merge.conclusion != 'success'
run: |
gh pr comment $PR_URL --body "**This PR from grafanabot requires manual review.**"
env:
PR_URL: ${{github.event.pull_request.html_url}}
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
* [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256
* [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168
* [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145
* [ENHANCEMENT] Ruler: Add `cortex_prometheus_rule_group_last_rule_duration_sum_seconds` metric to track the total evaluation duration of a rule group regardless of concurrency #10189
* [ENHANCEMENT] Ruler: Add experimental flags to configure the retry behavior when forwarding queries to the query frontend: `-ruler.query-frontend.max-retries`, `-ruler.query-frontend.min-retry-backoff` and `-ruler.query-frontend.max-retry-backoff`. #10259
* [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185
* [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154
* [BUGFIX] Ruler: fix indeterminate rules being always run concurrently (instead of never) when `-ruler.max-independent-rule-evaluation-concurrency` is set. https://github.com/prometheus/prometheus/pull/15560 #10258
* [BUGFIX] PromQL: Fix various UTF-8 bugs related to quoting. https://github.com/prometheus/prometheus/pull/15531 #10258
* [BUGFIX] Ruler: Fixed an issue when using the experimental `-ruler.max-independent-rule-evaluation-concurrency` feature, where if a rule group was eligible for concurrency, it would flap between running concurrently or not based on the time it took after running concurrently. #9726 #10189

### Mixin

Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ require (
github.com/prometheus/alertmanager v0.27.0
github.com/prometheus/client_golang v1.20.5
github.com/prometheus/client_model v0.6.1
github.com/prometheus/common v0.60.1
github.com/prometheus/common v0.61.0
github.com/prometheus/prometheus v1.99.0
github.com/segmentio/fasthash v1.0.3
github.com/sirupsen/logrus v1.9.3
Expand All @@ -49,7 +49,7 @@ require (
golang.org/x/net v0.32.0
golang.org/x/sync v0.10.0
golang.org/x/time v0.8.0
google.golang.org/grpc v1.67.1
google.golang.org/grpc v1.68.1
gopkg.in/yaml.v2 v2.4.0
gopkg.in/yaml.v3 v3.0.1
)
Expand All @@ -75,7 +75,7 @@ require (
github.com/pierrec/lz4/v4 v4.1.22
github.com/prometheus/procfs v0.15.1
github.com/shirou/gopsutil/v4 v4.24.11
github.com/thanos-io/objstore v0.0.0-20241128114755-8d266b990716
github.com/thanos-io/objstore v0.0.0-20241216182809-925be8281e0c
github.com/tjhop/slog-gokit v0.1.2
github.com/twmb/franz-go v1.18.0
github.com/twmb/franz-go/pkg/kadm v1.14.0
Expand Down Expand Up @@ -285,7 +285,7 @@ require (
)

// Using a fork of Prometheus with Mimir-specific changes.
replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520
replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20241216154007-6ce3249dcbb8

// Replace memberlist with our fork which includes some fixes that haven't been
// merged upstream yet:
Expand Down
20 changes: 10 additions & 10 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -931,8 +931,8 @@ github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWH
github.com/cncf/xds/go v0.0.0-20230310173818-32f1caf87195/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs=
github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs=
github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
github.com/cncf/xds/go v0.0.0-20240723142845-024c85f92f20 h1:N+3sFI5GUjRKBi+i0TxYVST9h4Ie192jJWpHvthBBgg=
github.com/cncf/xds/go v0.0.0-20240723142845-024c85f92f20/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI=
github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 h1:FpZSn61BWXbtyH68+uSv416veEswX1M2HRyQfdHnOyQ=
github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM=
Expand Down Expand Up @@ -1279,8 +1279,8 @@ github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40 h1:1TeKhyS+pvzO
github.com/grafana/gomemcache v0.0.0-20241016125027-0a5bcc5aef40/go.mod h1:IGRj8oOoxwJbHBYl1+OhS9UjQR0dv6SQOep7HqmtyFU=
github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe h1:yIXAAbLswn7VNWBIvM71O2QsgfgW9fRXZNR0DXe6pDU=
github.com/grafana/memberlist v0.3.1-0.20220714140823-09ffed8adbbe/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE=
github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520 h1:FADazl5oVYBARbfVMtLkPQ9IfIwhiE9lrPrKNPOHBV4=
github.com/grafana/mimir-prometheus v0.0.0-20241210170917-0a0a41616520/go.mod h1:NpYc1U0eC7m6xUh3t3Pq565KxaIc08Oaquiu71dEMi8=
github.com/grafana/mimir-prometheus v0.0.0-20241216154007-6ce3249dcbb8 h1:wTPjkFjHVU0weI66T4qnVkS6241zgIZuLu5Nasg0wAY=
github.com/grafana/mimir-prometheus v0.0.0-20241216154007-6ce3249dcbb8/go.mod h1:a5LEa2Vy87wOp0Vu6sLmEIR1V59fqH3QosOSiErAr30=
github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0=
github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU=
github.com/grafana/prometheus-alertmanager v0.25.1-0.20240930132144-b5e64e81e8d3 h1:6D2gGAwyQBElSrp3E+9lSr7k8gLuP3Aiy20rweLWeBw=
Expand Down Expand Up @@ -1390,8 +1390,8 @@ github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/ionos-cloud/sdk-go/v6 v6.2.1 h1:mxxN+frNVmbFrmmFfXnBC3g2USYJrl6mc1LW2iNYbFY=
github.com/ionos-cloud/sdk-go/v6 v6.2.1/go.mod h1:SXrO9OGyWjd2rZhAhEpdYN6VUAODzzqRdqA9BCviQtI=
github.com/ionos-cloud/sdk-go/v6 v6.3.0 h1:/lTieTH9Mo/CWm3cTlFLnK10jgxjUGkAqRffGqvPteY=
github.com/ionos-cloud/sdk-go/v6 v6.3.0/go.mod h1:SXrO9OGyWjd2rZhAhEpdYN6VUAODzzqRdqA9BCviQtI=
github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc=
github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
Expand Down Expand Up @@ -1611,8 +1611,8 @@ github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8b
github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc=
github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls=
github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPAaSc=
github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw=
github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ=
github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s=
github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4=
github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI=
github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04=
Expand Down Expand Up @@ -1717,8 +1717,8 @@ github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf
github.com/subosito/gotenv v1.4.1/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0=
github.com/tencentyun/cos-go-sdk-v5 v0.7.40 h1:W6vDGKCHe4wBACI1d2UgE6+50sJFhRWU4O8IB2ozzxM=
github.com/tencentyun/cos-go-sdk-v5 v0.7.40/go.mod h1:4dCEtLHGh8QPxHEkgq+nFaky7yZxQuYwgSJM87icDaw=
github.com/thanos-io/objstore v0.0.0-20241128114755-8d266b990716 h1:yPe6qACqOTU3s4dYSXx/YjnI2C1v1IWbdCTQ8KjuXkc=
github.com/thanos-io/objstore v0.0.0-20241128114755-8d266b990716/go.mod h1:vyzFrBXgP+fGNG2FopEGWOO/zrIuoy7zt3LpLeezRsw=
github.com/thanos-io/objstore v0.0.0-20241216182809-925be8281e0c h1:RtWSdFUT6yNvmeKXSVCLXfgX4LZh0A+x/LqyyqgldX8=
github.com/thanos-io/objstore v0.0.0-20241216182809-925be8281e0c/go.mod h1:vyzFrBXgP+fGNG2FopEGWOO/zrIuoy7zt3LpLeezRsw=
github.com/tjhop/slog-gokit v0.1.2 h1:pmQI4SvU9h4gA0vIQsdhJQSqQg4mOmsPykG2/PM3j1I=
github.com/tjhop/slog-gokit v0.1.2/go.mod h1:8fhlcp8C8ELbg3GCyKv06tgt4B5sDq2P1r2DQAu1HuM=
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
Expand Down
6 changes: 6 additions & 0 deletions pkg/frontend/querymiddleware/request_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,14 @@ import (
"net/http"

"github.com/grafana/dskit/cancellation"
"github.com/prometheus/common/model"
)

func init() {
// Mimir doesn't support Prometheus' UTF-8 metric/label name scheme yet.
model.NameValidationScheme = model.LegacyValidation
}

const requestValidationFailedFmt = "request validation failed for "

var errMetricsQueryRequestValidationFailed = cancellation.NewErrorf(
Expand Down
2 changes: 1 addition & 1 deletion pkg/querier/distributor_queryable_streaming_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ func TestStreamingChunkSeries_CreateIteratorTwice(t *testing.T) {

iterator = series.Iterator(iterator)
require.NotNil(t, iterator)
require.EqualError(t, iterator.Err(), `can't create iterator multiple times for the one streaming series ({the-name="the-value"})`)
require.EqualError(t, iterator.Err(), `can't create iterator multiple times for the one streaming series ({"the-name"="the-value"})`)
}

func createTestChunk(t *testing.T, time int64, value float64) client.Chunk {
Expand Down
9 changes: 9 additions & 0 deletions pkg/ruler/manager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ type ManagerMetrics struct {
GroupInterval *prometheus.Desc
GroupLastEvalTime *prometheus.Desc
GroupLastDuration *prometheus.Desc
GroupLastRuleDurationSum *prometheus.Desc
GroupLastRestoreDuration *prometheus.Desc
GroupRules *prometheus.Desc
GroupLastEvalSamples *prometheus.Desc
Expand Down Expand Up @@ -89,6 +90,12 @@ func NewManagerMetrics(logger log.Logger) *ManagerMetrics {
[]string{"user", "rule_group"},
nil,
),
GroupLastRuleDurationSum: prometheus.NewDesc(
"cortex_prometheus_rule_group_last_rule_duration_sum_seconds",
"The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
[]string{"user", "rule_group"},
nil,
),
GroupLastRestoreDuration: prometheus.NewDesc(
"cortex_prometheus_rule_group_last_restore_duration_seconds",
"The duration of the last alert rules alerts restoration using the `ALERTS_FOR_STATE` series across all rule groups.",
Expand Down Expand Up @@ -131,6 +138,7 @@ func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.GroupInterval
out <- m.GroupLastEvalTime
out <- m.GroupLastDuration
out <- m.GroupLastRuleDurationSum
out <- m.GroupLastRestoreDuration
out <- m.GroupRules
out <- m.GroupLastEvalSamples
Expand All @@ -156,6 +164,7 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfGaugesPerTenant(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastRuleDurationSum, "prometheus_rule_group_last_rule_duration_sum_seconds", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupRules, "prometheus_rule_group_rules", dskit_metrics.WithLabels("rule_group"))
data.SendSumOfGaugesPerTenant(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", dskit_metrics.WithLabels("rule_group"))
}
14 changes: 12 additions & 2 deletions pkg/ruler/rule_concurrency.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,19 @@ func (c *TenantConcurrencyController) Allow(_ context.Context, group *rules.Grou
// isGroupAtRisk checks if the rule group's last evaluation time is within the risk threshold.
func (c *TenantConcurrencyController) isGroupAtRisk(group *rules.Group) bool {
interval := group.Interval().Seconds()
lastEvaluation := group.GetEvaluationTime().Seconds()
runtimeThreshold := interval * c.thresholdRuleConcurrency / 100

return lastEvaluation >= interval*c.thresholdRuleConcurrency/100
// If the group evaluation time is greater than the threshold, the group is at risk.
if group.GetEvaluationTime().Seconds() >= runtimeThreshold {
return true
}

// If the total rule evaluation time is greater than the threshold, the group is at risk.
if group.GetRuleEvaluationTimeSum().Seconds() >= runtimeThreshold {
return true
}

return false
}

// isRuleIndependent checks if the rule is independent of other rules.
Expand Down
Loading

0 comments on commit 21edd1f

Please sign in to comment.