Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions binaryfusefilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ type BinaryFuseBuilder struct {
//
// The function may return an error if the set is empty.
func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFuse[T], error) {
	// The internal builder additionally returns its iteration count; that
	// value is dropped here and only the filter and error are passed on.
	filter, _, err := buildBinaryFuse[T](b, keys)
	return filter, err
}

func buildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (_ BinaryFuse[T], iterations int, _ error) {
size := uint32(len(keys))
var filter BinaryFuse[T]
filter.initializeParameters(b, size)
Expand All @@ -78,13 +83,32 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus
var h012 [6]uint32
// this could be used to compute the mod3
// tabmod3 := [5]uint8{0,1,2,0,1}
iterations := 0
for {
iterations += 1
if iterations > MaxIterations {
// The probability of this happening is lower than the cosmic-ray
// probability (i.e., a cosmic ray corrupts your system).
return BinaryFuse[T]{}, errors.New("too many iterations")
return BinaryFuse[T]{}, iterations, errors.New("too many iterations")
}
if size > 4 && size < 1_000_000 {
// The segment length is calculated using an empirical formula. For some
// sizes, the segment length is too large and leads to many iterations.
// Once every four iterations, use the previous segment length while
// keeping the same capacity. See TestBinaryFuseBoundarySizes.
switch iterations % 4 {
case 2:
// Switch to smaller segment size.
filter.SegmentLength /= 2
filter.SegmentLengthMask = filter.SegmentLength - 1
filter.SegmentCount = filter.SegmentCount*2 + 2
filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength
case 3:
// Restore the calculated segment size.
filter.SegmentLength *= 2
filter.SegmentLengthMask = filter.SegmentLength - 1
filter.SegmentCount = filter.SegmentCount/2 - 1
filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength
}
}

blockBits := 1
Expand Down Expand Up @@ -228,7 +252,7 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus
filter.Seed = splitmix64(&rngcounter)
}
if size == 0 {
return filter, nil
return filter, iterations, nil
}

for i := int(size - 1); i >= 0; i-- {
Expand All @@ -245,7 +269,7 @@ func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFus
filter.Fingerprints[h012[found]] = xor2 ^ filter.Fingerprints[h012[found+1]] ^ filter.Fingerprints[h012[found+2]]
}

return filter, nil
return filter, iterations, nil
}

func (filter *BinaryFuse[T]) initializeParameters(b *BinaryFuseBuilder, size uint32) {
Expand Down
132 changes: 132 additions & 0 deletions binaryfusefilter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
"fmt"
"math/rand/v2"
"slices"
"sort"
"strings"
"sync"
"testing"

"github.com/cespare/xxhash/v2"
Expand Down Expand Up @@ -377,3 +380,132 @@ func crossCheckFuseBuilder[T Unsigned](t *testing.T, bld *BinaryFuseBuilder, key
_ = expected
require.Equal(t, *expected, filter)
}

// segmentLengthSizes describes one contiguous range of filter sizes,
// [startSize, endSize], over which the computed segment length stays the same.
// The segment counts computed for the first and the last size of the range are
// recorded alongside it.
type segmentLengthSizes struct {
	// segmentLength is the segment length shared by every size in the range.
	segmentLength uint32

	// startSize and endSize are the inclusive bounds of the range.
	startSize, endSize uint32

	// startSegmentCount and endSegmentCount are the segment counts computed
	// for startSize and endSize respectively.
	startSegmentCount, endSegmentCount uint32
}

// binaryFuseParamStableOnce memoizes the table produced by
// binaryFuseParamsTable so that it is computed at most once per test binary.
var binaryFuseParamStableOnce struct {
// once guards the single computation of result.
once sync.Once
// result holds the computed rows, one per segment length.
result []segmentLengthSizes
}

// binaryFuseParamTableMaxSegmentSize is the largest segment length that
// binaryFuseParamsTable includes; the scan over sizes stops at the first size
// whose segment length exceeds it.
const binaryFuseParamTableMaxSegmentSize = 16384

// binaryFuseSegLenAndCnt reports the segment length and segment count that
// initializeParameters assigns to a uint8 filter of the given size when a
// zero-value builder is used.
func binaryFuseSegLenAndCnt(size uint32) (segLen uint32, segCnt uint32) {
	var (
		filter  BinaryFuse[uint8]
		builder BinaryFuseBuilder
	)
	filter.initializeParameters(&builder, size)
	segLen = filter.SegmentLength
	segCnt = filter.SegmentCount
	return segLen, segCnt
}

// binaryFuseParamsTable returns one row per segment length, covering sizes
// from 1 upward until the segment length would exceed
// binaryFuseParamTableMaxSegmentSize. Each row records the inclusive range of
// sizes sharing that segment length and the segment counts at both ends of the
// range. The table is computed on first use and cached afterwards.
func binaryFuseParamsTable() []segmentLengthSizes {
	compute := func() {
		var rows []segmentLengthSizes
		start := uint32(1)
		for {
			curLen, startCnt := binaryFuseSegLenAndCnt(start)
			if curLen > binaryFuseParamTableMaxSegmentSize {
				break
			}
			// Binary-search the offset of the first size whose segment length
			// differs from the current one; the search window is four times
			// the starting size.
			span := sort.Search(int(start*4), func(offset int) bool {
				otherLen, _ := binaryFuseSegLenAndCnt(start + uint32(offset))
				return otherLen != curLen
			})
			end := start + uint32(span) - 1
			_, endCnt := binaryFuseSegLenAndCnt(end)
			rows = append(rows, segmentLengthSizes{
				segmentLength:     curLen,
				startSize:         start,
				startSegmentCount: startCnt,
				endSize:           end,
				endSegmentCount:   endCnt,
			})
			// The next range begins right after the end of this one.
			start = end + 1
		}
		binaryFuseParamStableOnce.result = rows
	}
	binaryFuseParamStableOnce.once.Do(compute)
	return binaryFuseParamStableOnce.result
}

// TestBinaryFuseParams renders the segment count range and size range for each
// segment length and compares the rendering against a golden table. It exists
// to make any change in the parameter calculation visible.
func TestBinaryFuseParams(t *testing.T) {
	// The data rows below must match the row format used further down
	// ("| %6d | %4d - %-5d | %7d - %-7d |") character for character, including
	// the padding produced by the width and '-' flags. The literal's lines
	// start at column zero on purpose: any indentation would become part of
	// the string. Leading and trailing whitespace of the whole table is
	// trimmed before comparison.
	expected := `
| SegLen | SegCnt range | Size range |
|--------|--------------|-------------------|
|      4 |    1 - 1     |       1 - 2       |
|      8 |    1 - 1     |       3 - 8       |
|     16 |    1 - 2     |       9 - 27      |
|     32 |    1 - 3     |      28 - 91      |
|     64 |    1 - 5     |      92 - 303     |
|    128 |    2 - 9     |     304 - 1009    |
|    256 |    4 - 16    |    1010 - 3361    |
|    512 |    7 - 26    |    3362 - 11192   |
|   1024 |   12 - 42    |   11193 - 37272   |
|   2048 |   20 - 69    |   37273 - 124117  |
|   4096 |   34 - 114   |  124118 - 413309  |
|   8192 |   56 - 188   |  413310 - 1376321 |
|  16384 |   93 - 313   | 1376322 - 4583149 |
`

	var out strings.Builder
	fmt.Fprintf(&out, "| SegLen | SegCnt range | Size range |\n")
	fmt.Fprintf(&out, "|--------|--------------|-------------------|\n")
	for _, row := range binaryFuseParamsTable() {
		fmt.Fprintf(&out, "| %6d | %4d - %-5d | %7d - %-7d |\n",
			row.segmentLength,
			row.startSegmentCount, row.endSegmentCount,
			row.startSize, row.endSize,
		)
	}
	str := out.String()
	require.Equal(t, strings.TrimSpace(expected), strings.TrimSpace(str))
}

func checkNumIterations(t *testing.T, size uint32) {
const numTrials = 20

keys := make([]uint64, size)
var totalIterations, maxIterations int
for range numTrials {
for i := range keys {
keys[i] = rand.Uint64()
}
var b BinaryFuseBuilder
filter, iterations, err := buildBinaryFuse[uint8](&b, keys)
require.NoError(t, err)
for range 100 {
require.True(t, filter.Contains(keys[rand.IntN(len(keys))]))
}
totalIterations += iterations
maxIterations = max(maxIterations, iterations)
}
t.Logf("size: %d iterations: %.2f avg (%d max)", size, float64(totalIterations)/numTrials, maxIterations)
}

// TestBinaryFuseBoundarySizes exercises filter construction at the boundary
// sizes of every segment length: the largest size that still uses the smallest
// segment count for that length, and the largest size of the whole range. For
// a given segment count, the largest size is the one with the least slack
// space.
func TestBinaryFuseBoundarySizes(t *testing.T) {
	for _, row := range binaryFuseParamsTable() {
		if row.startSize > 1_000_000 {
			// Larger sizes take too long to test.
			break
		}
		if row.startSegmentCount == row.endSegmentCount {
			// A single segment count covers the whole range, so only the end
			// of the range is of interest.
			checkNumIterations(t, row.endSize)
			continue
		}
		// Binary-search the offset of the first size that no longer uses the
		// starting segment count; the size just before it is the largest one
		// for that count.
		width := int(row.endSize - row.startSize + 1)
		firstOther := sort.Search(width, func(offset int) bool {
			segLen, segCnt := binaryFuseSegLenAndCnt(row.startSize + uint32(offset))
			return !(segLen == row.segmentLength && segCnt == row.startSegmentCount)
		})
		checkNumIterations(t, row.startSize+uint32(firstOther)-1)
		checkNumIterations(t, row.endSize)
	}
}
Loading