Skip to content

Commit

Permalink
Fix computation of shuffle shard when shard size is <= 0, or higher t…
Browse files Browse the repository at this point in the history
…han number of ring instances. (grafana#566)

* Fix computation of shuffle shard when shard size is <= 0, or higher than number of ring instances.

* Remove check for added instances, it was supposed to make shuffleShard faster, but didn't.

* Test for shard size == num instances.

* CHANGELOG.md

* Remove unnecessary changes.

* Benchmark shardSize=0.

* Add bugfix entry.

* Update comment.
  • Loading branch information
pstibrany authored Aug 16, 2024
1 parent 9b4a342 commit aa1c6eb
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 8 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@
* [ENHANCEMENT] memberlist: Added `-<prefix>memberlist.broadcast-timeout-for-local-updates-on-shutdown` option to set timeout for sending locally-generated updates on shutdown, instead of previously hardcoded 10s (which is still the default). #539
* [ENHANCEMENT] tracing: add ExtractTraceSpanID function.
* [EHNANCEMENT] crypto/tls: Support reloading client certificates #537 #552
* [ENHANCEMENT] Add read only support for ingesters in the ring and lifecycler. #553 #554
* [ENHANCEMENT] Add read only support for ingesters in the ring and lifecycler. #553 #554 #556
* [ENHANCEMENT] Added new ring methods to expose number of writable instances with tokens per zone, and overall. #560 #562
* [ENHANCEMENT] `services.FailureWatcher` can now be closed, which unregisters all service and manager listeners, and closes channel used to receive errors. #564
* [CHANGE] Backoff: added `Backoff.ErrCause()` which is like `Backoff.Err()` but returns the context cause if backoff is terminated because the context has been canceled. #538
Expand Down Expand Up @@ -258,3 +258,4 @@
* [BUGFIX] Memcached: Don't truncate sub-second TTLs to 0 which results in them being cached forever. #530
* [BUGFIX] Cache: initialise the `operation_failures_total{reason="connect-timeout"}` metric to 0 for each cache operation type on startup. #545
* [BUGFIX] spanlogger: include correct caller information in log messages logged through a `SpanLogger`. #547
* [BUGFIX] Ring: shuffle shard without lookback no longer returns entire ring when shard size >= number of instances. Instead proper subring is computed, with correct number of instances in each zone. Returning entire ring was a bug, and such ring can contain instances that were not supposed to be used, if zones are not balanced. #554 #556
10 changes: 5 additions & 5 deletions ring/ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -697,11 +697,11 @@ func (r *Ring) updateRingMetrics(compareResult CompareResult) {
//
// Subring returned by this method does not contain instances that have read-only field set.
func (r *Ring) ShuffleShard(identifier string, size int) ReadRing {
// Use all instances if shuffle sharding is disabled, or it covers all instances anyway.
// Use all possible instances if shuffle sharding is disabled. We don't set size to r.InstancesCount(), because
// that could lead to not all instances being returned when ring zones are unbalanced.
// Reason for not returning entire ring directly is that we need to filter out read-only instances.
instances := r.InstancesCount()
if size <= 0 || instances <= size {
size = instances
if size <= 0 {
size = math.MaxInt
}

if cached := r.getCachedShuffledSubring(identifier, size); cached != nil {
Expand Down Expand Up @@ -771,7 +771,7 @@ func (r *Ring) shuffleShard(identifier string, size int, lookbackPeriod time.Dur
actualZones = []string{""}
}

shard := make(map[string]InstanceDesc, size)
shard := make(map[string]InstanceDesc, min(len(r.ringDesc.Ingesters), size))

// We need to iterate zones always in the same order to guarantee stability.
for _, zone := range actualZones {
Expand Down
93 changes: 91 additions & 2 deletions ring/ring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1462,6 +1462,83 @@ func TestRing_ShuffleShard(t *testing.T) {
expectedZoneCount: 2,
expectedInstancesInZoneCount: map[string]int{"zone-a": 1, "zone-b": 1, "zone-c": 0},
},
"multiple zones, shard size == num instances, balanced zones": {
ringInstances: map[string]InstanceDesc{
"instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: gen.GenerateTokens(128, nil)},
"instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: gen.GenerateTokens(128, nil)},
},
shardSize: 6,
zoneAwarenessEnabled: true,
expectedSize: 6,
expectedDistribution: []int{2, 2, 2},
expectedZoneCount: 3,
expectedInstancesInZoneCount: map[string]int{"zone-a": 2, "zone-b": 2, "zone-c": 2},
},
"multiple zones, shard size == num instances, unbalanced zones": {
ringInstances: map[string]InstanceDesc{
"instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-3": {Addr: "127.0.0.3", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-5": {Addr: "127.0.0.5", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: gen.GenerateTokens(128, nil)},
},
shardSize: 6,
zoneAwarenessEnabled: true,
expectedSize: 5,
expectedDistribution: []int{2, 2, 1},
expectedZoneCount: 3,
expectedInstancesInZoneCount: map[string]int{"zone-a": 2, "zone-b": 2, "zone-c": 1},
},
"multiple zones, shard size > num instances, balanced zones": {
ringInstances: map[string]InstanceDesc{
"instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: gen.GenerateTokens(128, nil)},
},
shardSize: 3,
zoneAwarenessEnabled: true,
expectedSize: 3,
expectedDistribution: []int{1, 1, 1},
expectedZoneCount: 3,
expectedInstancesInZoneCount: map[string]int{"zone-a": 1, "zone-b": 1, "zone-c": 1},
},
"multiple zones, shard size > num instances, unbalanced zones": {
ringInstances: map[string]InstanceDesc{
"instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-3": {Addr: "127.0.0.3", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-5": {Addr: "127.0.0.5", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: gen.GenerateTokens(128, nil)},
},
shardSize: 9,
zoneAwarenessEnabled: true,
expectedSize: 6,
expectedDistribution: []int{3, 2, 1},
expectedZoneCount: 3,
expectedInstancesInZoneCount: map[string]int{"zone-a": 3, "zone-b": 2, "zone-c": 1},
},
"multiple zones, shard size = 0, unbalanced zones": {
ringInstances: map[string]InstanceDesc{
"instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-3": {Addr: "127.0.0.3", Zone: "zone-a", Tokens: gen.GenerateTokens(128, nil)},
"instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-5": {Addr: "127.0.0.5", Zone: "zone-b", Tokens: gen.GenerateTokens(128, nil)},
"instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: gen.GenerateTokens(128, nil)},
},
shardSize: 0,
zoneAwarenessEnabled: true,
expectedSize: 6,
expectedDistribution: []int{3, 2, 1},
expectedZoneCount: 3,
expectedInstancesInZoneCount: map[string]int{"zone-a": 3, "zone-b": 2, "zone-c": 1},
},
}

for testName, testData := range tests {
Expand Down Expand Up @@ -3117,7 +3194,7 @@ func TestRing_ShuffleShardWithLookback_CachingConcurrency(t *testing.T) {
func BenchmarkRing_ShuffleShard(b *testing.B) {
for _, numInstances := range []int{50, 100, 1000} {
for _, numZones := range []int{1, 3} {
for _, shardSize := range []int{3, 10, 30} {
for _, shardSize := range []int{0, 3, 10, 30} {
b.Run(fmt.Sprintf("num instances = %d, num zones = %d, shard size = %d", numInstances, numZones, shardSize), func(b *testing.B) {
benchmarkShuffleSharding(b, numInstances, numZones, 128, shardSize, false)
})
Expand All @@ -3129,7 +3206,7 @@ func BenchmarkRing_ShuffleShard(b *testing.B) {
func BenchmarkRing_ShuffleShardCached(b *testing.B) {
for _, numInstances := range []int{50, 100, 1000} {
for _, numZones := range []int{1, 3} {
for _, shardSize := range []int{3, 10, 30} {
for _, shardSize := range []int{0, 3, 10, 30} {
b.Run(fmt.Sprintf("num instances = %d, num zones = %d, shard size = %d", numInstances, numZones, shardSize), func(b *testing.B) {
benchmarkShuffleSharding(b, numInstances, numZones, 128, shardSize, true)
})
Expand Down Expand Up @@ -3162,6 +3239,18 @@ func BenchmarkRing_ShuffleShard_LargeShardSize(b *testing.B) {
benchmarkShuffleSharding(b, numInstances, numZones, numTokens, shardSize, cacheEnabled)
}

func BenchmarkRing_ShuffleShard_ShardSize_0(b *testing.B) {
const (
numInstances = 90
numZones = 3
numTokens = 512
shardSize = 0
cacheEnabled = false
)

benchmarkShuffleSharding(b, numInstances, numZones, numTokens, shardSize, cacheEnabled)
}

func benchmarkShuffleSharding(b *testing.B, numInstances, numZones, numTokens, shardSize int, cache bool) {
// Initialise the ring.
ringDesc := &Desc{Ingesters: generateRingInstances(initTokenGenerator(b), numInstances, numZones, numTokens)}
Expand Down

0 comments on commit aa1c6eb

Please sign in to comment.