Merge pull request #12 from jameinel/jitter-cleanup

#12 The original implementation of jitter computed the new delay, and then did a range of actual delay in the range of 0-100% of that value. On average, that actually means that we delay 1/2 as long as the desired values. In practice it was slightly more, because we then capped to min and max, which meant for the simple tests it was a 60% bias. The new algorithm does a +/- 20% logic, which means that on average, we actually end up very close to the expected values (my averaging ratios normally came out within 1% of the expected values, and worse case was about a 3.6% bias). Since it is random, it could be anything (you could have a string of 100 minimum possible values). But that is unlikely enough that I'll leave the test suite as asserting we are within 10% and live with the 1 in 1 billion chance that it fails. Also, there was a flaky test (StopChannel) where it could get an extra trigger sometimes, and we had the bug that if a user incorrectly supplied a exponential backoff that was <1.0 we would actually end up going below minDelay, rather than capping it at min. (we would check if you applied jitter, but not normally). There isn't a lot to QA manually, as I think the test cases are pretty thorough. I'm a bit unhappy about my "1ms" limits on the 20% over and under, but I think that is still ok. Certainly it is ok in practice.
juju · Sep 16, 2024 · d9c58a0 · d9c58a0
2 parents 62423bf + 69f327c
commit d9c58a0
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 20 deletions.
diff --git a/doc.go b/doc.go
@@ -13,10 +13,10 @@
 //	})
 //
 // The bare minimum arguments that need to be specified are:
-//  * Func - the function to call
-//  * Attempts - the number of times to try Func before giving up, or a negative number for unlimited attempts (`retry.UnlimitedAttempts`)
-//  * Delay - how long to wait between each try that returns an error
-//  * Clock - either the wall clock, or some testing clock
+//   - Func - the function to call
+//   - Attempts - the number of times to try Func before giving up, or a negative number for unlimited attempts (`retry.UnlimitedAttempts`)
+//   - Delay - how long to wait between each try that returns an error
+//   - Clock - either the wall clock, or some testing clock
 //
 // Any error that is returned from the Func is considered transient.
 // In order to identify some errors as fatal, pass in a function for the
@@ -87,5 +87,4 @@
 //		Delay:       100 * time.Millisecond,
 //		Clock:       clock.WallClock,
 //	})
-//
 package retry
diff --git a/retry.go b/retry.go
@@ -232,7 +232,8 @@ func DoubleDelay(delay time.Duration, attempt int) time.Duration {
 // structure.
 //
 // The next delay value is calculated using the following formula:
-//   newDelay = min(minDelay * exp^attempt, maxDelay)
+//
+//	newDelay = min(minDelay * exp^attempt, maxDelay)
 //
 // If applyJitter is set to true, the function will randomly select and return
 // back a value in the [minDelay, newDelay] range.
@@ -241,18 +242,19 @@ func ExpBackoff(minDelay, maxDelay time.Duration, exp float64, applyJitter bool)
 	maxDelayF := float64(maxDelay)
 	return func(_ time.Duration, attempt int) time.Duration {
 		newDelay := minDelayF * math.Pow(exp, float64(attempt))
-		if newDelay > maxDelayF {
-			newDelay = maxDelayF
-		}
 
 		// Return a random value in the [minDelay, newDelay) range.
 		if applyJitter {
-			newDelay = rand.Float64() * newDelay
-			if newDelay < minDelayF {
-				newDelay = minDelayF
-			}
+			// We want to go +/- 20%, which is a 40% swing, and
+			// Float64 returns in the range 0-1
+			newDelay = (1 + rand.Float64()*0.4 - 0.2) * newDelay
+		}
+		if newDelay < minDelayF {
+			newDelay = minDelayF
+		}
+		if newDelay > maxDelayF {
+			newDelay = maxDelayF
 		}
-
 		return time.Duration(newDelay).Round(time.Millisecond)
 	}
 }
diff --git a/retry_test.go b/retry_test.go
@@ -4,6 +4,7 @@
 package retry_test
 
 import (
+	"math"
 	"time"
 
 	"github.com/juju/clock"
@@ -33,7 +34,11 @@ func (mock *mockClock) Now() time.Time {
 func (mock *mockClock) After(wait time.Duration) <-chan time.Time {
 	mock.delays = append(mock.delays, wait)
 	mock.now = mock.now.Add(wait)
-	return time.After(time.Microsecond)
+	// Note (jam): 2024-09-16 on my machine,
+	// 	go test -count 500 -failfast -check.f StopChannel
+	// Fails reliably at 1 Microsecond. I think the issue is that at 1us,
+	// the clock can tick while the after func is being evaluated.
+	return time.After(10 * time.Microsecond)
 }
 
 func (*retrySuite) TestSuccessHasNoDelay(c *gc.C) {
@@ -335,14 +340,15 @@ func (*expBackoffSuite) TestExpBackoffWithoutJitter(c *gc.C) {
 	}
 }
 
-func (*expBackoffSuite) TestExpBackoffWithtJitter(c *gc.C) {
+func (*expBackoffSuite) TestExpBackoffWithJitter(c *gc.C) {
 	minDelay := 200 * time.Millisecond
 	backoffFunc := retry.ExpBackoff(minDelay, 2*time.Second, 2.0, true)
+	// All of these are allowed to go up to 20% over the expected value
 	maxDurations := []time.Duration{
-		200 * time.Millisecond,
-		400 * time.Millisecond,
-		800 * time.Millisecond,
-		1600 * time.Millisecond,
+		240 * time.Millisecond,
+		480 * time.Millisecond,
+		960 * time.Millisecond,
+		1920 * time.Millisecond,
 		2000 * time.Millisecond, // capped to maxDelay
 	}
 
@@ -352,3 +358,86 @@ func (*expBackoffSuite) TestExpBackoffWithtJitter(c *gc.C) {
 		c.Assert(got, jc.LessThan, maxDuration+1, gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s", attempt, minDelay, maxDuration, got))
 	}
 }
+
+// TestExpBackofWithJitterAverage makes sure that turning on Jitter doesn't
+// dramatically change the average wait times for sampling. (eg, if we say wait
+// 200ms to 2000ms, turning on jitter should keep the wait times roughly aligned with those times).
+// This is a little bit tricky, because we expect it to be random, but we'll
+// look at the average ratio of the jittered value and the expected backoff value.
+func (*expBackoffSuite) TestExpBackofWithJitterAverage(c *gc.C) {
+	const (
+		// 1.02^100 ~= 10, causing us to go from 200ms to 2s in 100 steps
+		minDelay    = 200 * time.Millisecond
+		maxDelay    = 2 * time.Second
+		maxAttempts = 100
+		backoff     = 1.02
+	)
+	jitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, true)
+	noJitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, false)
+	ratioSum := 0.0
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		jitterValue := jitterBackoffFunc(0, attempt)
+		nonJitterValue := noJitterBackoffFunc(0, attempt)
+		ratio := float64(jitterValue) / float64(nonJitterValue)
+		ratioSum += ratio
+		// We have > and < not >=, so we need a bit of flexibility,
+		// also float64 imprecision gives us a bit more than 1ns of
+		// inaccuracy
+		minJitter := time.Duration(0.8*float64(nonJitterValue)) - time.Millisecond
+		maxJitter := time.Duration(1.2*float64(nonJitterValue)) + time.Millisecond
+		c.Assert(jitterValue, jc.GreaterThan, minJitter,
+			gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
+				attempt, minJitter, maxJitter, jitterValue))
+		c.Assert(jitterValue, jc.LessThan, maxJitter,
+			gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
+				attempt, minJitter, maxJitter, jitterValue))
+	}
+	// We could do a geometric mean instead of a arithmetic mean because we
+	// are dealing with ratios, but ratios should stay in the range of 0-2,
+	// so arithmetic makes sense.
+	ratioAvg := ratioSum / maxAttempts
+	// In practice, while individual attempts might vary by +/- 20%, they
+	// average out over 100 steps, and we actually end up within +/- 1% on
+	// average. The most I've seen is a 3.6% variation.
+	// We move this out to 10% to avoid an annoying test (eg, string of low
+	// randoms).
+	c.Check(ratioAvg, jc.GreaterThan, 0.9,
+		gc.Commentf("jitter reduced the average duration by %.3f, we expected it to be +/- 2%% on average", ratioAvg))
+	c.Check(ratioAvg, jc.LessThan, 1.1,
+		gc.Commentf("jitter increased the average duration by %.3f, we expected it to be +/- 2%% on average", ratioAvg))
+}
+
+// TestExpBackoffBadExponent says that we'll at least roughly conform to
+// expectations even if the user gives us a bad backoff factor.
+// We don't have a mechanism for giving an error, but at least hold to min and
+// max values.
+func (*expBackoffSuite) TestExpBackoffBadExponent(c *gc.C) {
+	const (
+		// a backoff of 0.8 would put us below 200ms
+		minDelay    = 200 * time.Millisecond
+		maxDelay    = 2 * time.Second
+		maxAttempts = 10
+		backoff     = 0.8
+	)
+	jitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, true)
+	noJitterBackoffFunc := retry.ExpBackoff(minDelay, maxDelay, backoff, false)
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		jitterValue := jitterBackoffFunc(0, attempt)
+		nonJitterValue := noJitterBackoffFunc(0, attempt)
+		c.Check(nonJitterValue, jc.GreaterThan, minDelay-1,
+			gc.Commentf("expected duration for attempt %d to be in the [%s, %s] range; got %s",
+				attempt, minDelay, maxDelay, nonJitterValue))
+		c.Check(nonJitterValue, jc.LessThan, maxDelay+1,
+			gc.Commentf("expected duration for attempt %d to be in the [%s, %s] range; got %s",
+				attempt, minDelay, maxDelay, nonJitterValue))
+		// Ensure that even with jitter we are still capped at min and max delay
+		minJitter := time.Duration(math.Max(0.8*float64(nonJitterValue), float64(minDelay))) - time.Microsecond
+		maxJitter := time.Duration(math.Min(1.2*float64(nonJitterValue), float64(maxDelay))) + time.Microsecond
+		c.Check(jitterValue, jc.GreaterThan, minJitter,
+			gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
+				attempt, minJitter, maxJitter, jitterValue))
+		c.Check(jitterValue, jc.LessThan, maxJitter,
+			gc.Commentf("expected jittered duration for attempt %d to be in the [%s, %s] range; got %s",
+				attempt, minJitter, maxJitter, jitterValue))
+	}
+}