Skip to content

Commit

Permalink
Merge pull request #12 from jameinel/jitter-cleanup
Browse files Browse the repository at this point in the history
#12

The original implementation of jitter computed the new delay, and then did a range of actual delay in the range of 0-100% of that value. On average, that actually means that we delay 1/2 as long as the desired values. In practice it was slightly more, because we then capped to min and max, which meant for the simple tests it was a 60% bias.

The new algorithm does a +/- 20% logic, which means that on average, we actually end up very close to the expected values (my averaging ratios normally came out within 1% of the expected values, and worse case was about a 3.6% bias). Since it is random, it could be anything (you could have a string of 100 minimum possible values). But that is unlikely enough that I'll leave the test suite as asserting we are within 10% and live with the 1 in 1 billion chance that it fails.

Also, there was a flaky test (StopChannel) where it could get an extra trigger sometimes, and we had the bug that if a user incorrectly supplied a exponential backoff that was <1.0 we would actually end up going below minDelay, rather than capping it at min. (we would check if you applied jitter, but not normally).

There isn't a lot to QA manually, as I think the test cases are pretty thorough.

I'm a bit unhappy about my "1ms" limits on the 20% over and under, but I think that is still ok. Certainly it is ok in practice.
  • Loading branch information
jujubot authored Sep 16, 2024
2 parents 62423bf + 69f327c commit d9c58a0
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 20 deletions.
9 changes: 4 additions & 5 deletions doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
// })
//
// The bare minimum arguments that need to be specified are:
// * Func - the function to call
// * Attempts - the number of times to try Func before giving up, or a negative number for unlimited attempts (`retry.UnlimitedAttempts`)
// * Delay - how long to wait between each try that returns an error
// * Clock - either the wall clock, or some testing clock
// - Func - the function to call
// - Attempts - the number of times to try Func before giving up, or a negative number for unlimited attempts (`retry.UnlimitedAttempts`)
// - Delay - how long to wait between each try that returns an error
// - Clock - either the wall clock, or some testing clock
//
// Any error that is returned from the Func is considered transient.
// In order to identify some errors as fatal, pass in a function for the
Expand Down Expand Up @@ -87,5 +87,4 @@
// Delay: 100 * time.Millisecond,
// Clock: clock.WallClock,
// })
//
package retry
20 changes: 11 additions & 9 deletions retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,8 @@ func DoubleDelay(delay time.Duration, attempt int) time.Duration {
// structure.
//
// The next delay value is calculated using the following formula:
// newDelay = min(minDelay * exp^attempt, maxDelay)
//
// newDelay = min(minDelay * exp^attempt, maxDelay)
//
// If applyJitter is set to true, the function will randomly select and return
// back a value in the [minDelay, newDelay] range.
Expand All @@ -241,18 +242,19 @@ func ExpBackoff(minDelay, maxDelay time.Duration, exp float64, applyJitter bool)
maxDelayF := float64(maxDelay)
return func(_ time.Duration, attempt int) time.Duration {
newDelay := minDelayF * math.Pow(exp, float64(attempt))
if newDelay > maxDelayF {
newDelay = maxDelayF
}

// Return a random value in the [minDelay, newDelay) range.
if applyJitter {
newDelay = rand.Float64() * newDelay
if newDelay < minDelayF {
newDelay = minDelayF
}
// We want to go +/- 20%, which is a 40% swing, and
// Float64 returns in the range 0-1
newDelay = (1 + rand.Float64()*0.4 - 0.2) * newDelay
}
if newDelay < minDelayF {
newDelay = minDelayF
}
if newDelay > maxDelayF {
newDelay = maxDelayF
}

return time.Duration(newDelay).Round(time.Millisecond)
}
}
101 changes: 95 additions & 6 deletions retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package retry_test

import (
"math"
"time"

"github.com/juju/clock"
Expand Down Expand Up @@ -33,7 +34,11 @@ func (mock *mockClock) Now() time.Time {
func (mock *mockClock) After(wait time.Duration) <-chan time.Time {
mock.delays = append(mock.delays, wait)
mock.now = mock.now.Add(wait)
return time.After(time.Microsecond)
// Note (jam): 2024-09-16 on my machine,
// go test -count 500 -failfast -check.f StopChannel
// Fails reliably at 1 Microsecond. I think the issue is that at 1us,
// the clock can tick while the after func is being evaluated.
return time.After(10 * time.Microsecond)
}

func (*retrySuite) TestSuccessHasNoDelay(c *gc.C) {
Expand Down Expand Up @@ -335,14 +340,15 @@ func (*expBackoffSuite) TestExpBackoffWithoutJitter(c *gc.C) {
}
}

func (*expBackoffSuite) TestExpBackoffWithtJitter(c *gc.C) {
func (*expBackoffSuite) TestExpBackoffWithJitter(c *gc.C) {
minDelay := 200 * time.Millisecond
backoffFunc := retry.ExpBackoff(minDelay, 2*time.Second, 2.0, true)
// All of these are allowed to go up to 20% over the expected value
maxDurations := []time.Duration{
200 * time.Millisecond,
400 * time.Millisecond,
800 * time.Millisecond,
1600 * time.Millisecond,
240 * time.Millisecond,
480 * time.Millisecond,
960 * time.Millisecond,
1920 * time.Millisecond,
2000 * time.Millisecond, // capped to maxDelay
}

Expand All @@ -352,3 +358,86 @@ func (*expBackoffSuite) TestExpBackoffWithtJitter(c *gc.C) {
c.Assert(got, jc.LessThan, maxDuration+1, gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s", attempt, minDelay, maxDuration, got))
}
}

// TestExpBackofWithJitterAverage makes sure that turning on Jitter doesn't
// dramatically change the average wait times for sampling. (eg, if we say wait
// 200ms to 2000ms, turning on jitter should keep the wait times roughly aligned with those times).
// This is a little bit tricky, because we expect it to be random, but we'll
// look at the average ratio of the jittered value and the expected backoff value.
func (*expBackoffSuite) TestExpBackofWithJitterAverage(c *gc.C) {
const (
// 1.02^100 ~= 10, causing us to go from 200ms to 2s in 100 steps
minDelay = 200 * time.Millisecond
maxDelay = 2 * time.Second
maxAttempts = 100
backoff = 1.02
)
jitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, true)
noJitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, false)
ratioSum := 0.0
for attempt := 0; attempt < maxAttempts; attempt++ {
jitterValue := jitterBackoffFunc(0, attempt)
nonJitterValue := noJitterBackoffFunc(0, attempt)
ratio := float64(jitterValue) / float64(nonJitterValue)
ratioSum += ratio
// We have > and < not >=, so we need a bit of flexibility,
// also float64 imprecision gives us a bit more than 1ns of
// inaccuracy
minJitter := time.Duration(0.8*float64(nonJitterValue)) - time.Millisecond
maxJitter := time.Duration(1.2*float64(nonJitterValue)) + time.Millisecond
c.Assert(jitterValue, jc.GreaterThan, minJitter,
gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
attempt, minJitter, maxJitter, jitterValue))
c.Assert(jitterValue, jc.LessThan, maxJitter,
gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
attempt, minJitter, maxJitter, jitterValue))
}
// We could do a geometric mean instead of a arithmetic mean because we
// are dealing with ratios, but ratios should stay in the range of 0-2,
// so arithmetic makes sense.
ratioAvg := ratioSum / maxAttempts
// In practice, while individual attempts might vary by +/- 20%, they
// average out over 100 steps, and we actually end up within +/- 1% on
// average. The most I've seen is a 3.6% variation.
// We move this out to 10% to avoid an annoying test (eg, string of low
// randoms).
c.Check(ratioAvg, jc.GreaterThan, 0.9,
gc.Commentf("jitter reduced the average duration by %.3f, we expected it to be +/- 2%% on average", ratioAvg))
c.Check(ratioAvg, jc.LessThan, 1.1,
gc.Commentf("jitter increased the average duration by %.3f, we expected it to be +/- 2%% on average", ratioAvg))
}

// TestExpBackoffBadExponent says that we'll at least roughly conform to
// expectations even if the user gives us a bad backoff factor.
// We don't have a mechanism for giving an error, but at least hold to min and
// max values.
func (*expBackoffSuite) TestExpBackoffBadExponent(c *gc.C) {
const (
// a backoff of 0.8 would put us below 200ms
minDelay = 200 * time.Millisecond
maxDelay = 2 * time.Second
maxAttempts = 10
backoff = 0.8
)
jitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, true)
noJitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, false)
for attempt := 0; attempt < maxAttempts; attempt++ {
jitterValue := jitterBackoffFunc(0, attempt)
nonJitterValue := noJitterBackoffFunc(0, attempt)
c.Check(nonJitterValue, jc.GreaterThan, minDelay-1,
gc.Commentf("expected duration for attempt %d to be in the [%s, %s] range; got %s",
attempt, minDelay, maxDelay, nonJitterValue))
c.Check(nonJitterValue, jc.LessThan, maxDelay+1,
gc.Commentf("expected duration for attempt %d to be in the [%s, %s] range; got %s",
attempt, minDelay, maxDelay, nonJitterValue))
// Ensure that even with jitter we are still capped at min and max delay
minJitter := time.Duration(math.Max(0.8*float64(nonJitterValue), float64(minDelay))) - time.Microsecond
maxJitter := time.Duration(math.Min(1.2*float64(nonJitterValue), float64(maxDelay))) + time.Microsecond
c.Check(jitterValue, jc.GreaterThan, minJitter,
gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
attempt, minJitter, maxJitter, jitterValue))
c.Check(jitterValue, jc.LessThan, maxJitter,
gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
attempt, minJitter, maxJitter, jitterValue))
}
}

0 comments on commit d9c58a0

Please sign in to comment.