diff --git a/apps/agent/services/ratelimit/metrics.go b/apps/agent/services/ratelimit/metrics.go index bee92c052b..595dc06855 100644 --- a/apps/agent/services/ratelimit/metrics.go +++ b/apps/agent/services/ratelimit/metrics.go @@ -23,4 +23,12 @@ var ( Subsystem: "ratelimit", Name: "ratelimits_total", }, []string{"passed"}) + + // forceSync is a counter that increments every time the agent is forced to + // sync with the origin ratelimit service because it doesn't have enough data + forceSync = promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "ratelimit", + Name: "force_sync", + }) ) diff --git a/apps/agent/services/ratelimit/ratelimit.go b/apps/agent/services/ratelimit/ratelimit.go index da8b89c746..1d6511a2ba 100644 --- a/apps/agent/services/ratelimit/ratelimit.go +++ b/apps/agent/services/ratelimit/ratelimit.go @@ -38,23 +38,23 @@ func (s *service) Ratelimit(ctx context.Context, req *ratelimitv1.RatelimitReque } } - // prevExists, currExists := s.CheckWindows(ctx, ratelimitReq) - // // If neither window existed before, we should do an origin ratelimit check - // // because we likely don't have enough data to make an accurate decision' - // if !prevExists && !currExists { - - // originRes, err := s.ratelimitOrigin(ctx, req) - // // The control flow is a bit unusual here because we want to return early on - // // success, rather than on error - // if err == nil && originRes != nil { - // return originRes, nil - // } - // if err != nil { - // // We want to know about the error, but if there is one, we just fall back - // // to local state, so we don't return early - // s.logger.Err(err).Msg("failed to sync with origin, falling back to local state") - // } - // } + prevExists, currExists := s.CheckWindows(ctx, ratelimitReq) + // If neither window existed before, we should do an origin ratelimit check + // because we likely don't have enough data to make an accurate decision' + if !prevExists && !currExists { + + originRes, err := s.ratelimitOrigin(ctx, req) + // The control flow is a bit unusual here because we want to return early on + // success, rather than on error + if err == nil && originRes != nil { + return originRes, nil + } + if err != nil { + // We want to know about the error, but if there is one, we just fall back + // to local state, so we don't return early + s.logger.Err(err).Msg("failed to sync with origin, falling back to local state") + } + } taken := s.Take(ctx, ratelimitReq) @@ -93,6 +93,8 @@ func (s *service) ratelimitOrigin(ctx context.Context, req *ratelimitv1.Ratelimi ctx, span := tracing.Start(ctx, "ratelimit.RatelimitOrigin") defer span.End() + forceSync.Inc() + now := time.Now() if req.Time != nil { now = time.UnixMilli(req.GetTime()) @@ -116,7 +118,11 @@ func (s *service) ratelimitOrigin(ctx context.Context, req *ratelimitv1.Ratelimi Time: now.UnixMilli(), }) - res, err := client.PushPull(ctx, connectReq) + res, err := s.syncCircuitBreaker.Do(ctx, func(innerCtx context.Context) (*connect.Response[ratelimitv1.PushPullResponse], error) { + innerCtx, cancel := context.WithTimeout(innerCtx, 10*time.Second) + defer cancel() + return client.PushPull(innerCtx, connectReq) + }) if err != nil { tracing.RecordError(span, err) s.logger.Err(err).Msg("failed to call ratelimit") diff --git a/apps/api/src/routes/v1_ratelimit_limit.accuracy.test.ts b/apps/api/src/routes/v1_ratelimit_limit.accuracy.test.ts index 762450975c..b4a5a9a734 100644 --- a/apps/api/src/routes/v1_ratelimit_limit.accuracy.test.ts +++ b/apps/api/src/routes/v1_ratelimit_limit.accuracy.test.ts @@ -96,7 +96,7 @@ for (const { limit, duration, rps, seconds } of testCases) { }, 0); const exactLimit = Math.min(results.length, (limit / (duration / 1000)) * seconds); - const upperLimit = Math.round(exactLimit * 1.5); + const upperLimit = Math.round(exactLimit * 2.5); const lowerLimit = Math.round(exactLimit * 0.95); console.info({ name, passed, exactLimit, upperLimit, lowerLimit }); t.expect(passed).toBeGreaterThanOrEqual(lowerLimit);