forked from axiomhq/hyperloglog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sparse.go
119 lines (100 loc) · 2.38 KB
/
sparse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package hyperloglog
import (
"math/bits"
"github.com/kamstrup/intmap"
)
// getIndex returns the p-bit field of the sparse key k that holds the
// register index.
//
// The lowest bit of k selects where the field starts: when it is set the
// field begins at bit 32-p, otherwise it begins at bit pp-p+1. The field is
// read with bextr32, which is defined elsewhere in the package (presumably a
// "value, start bit, length" bit-field extract; confirm against its
// definition).
func getIndex(k uint32, p, pp uint8) uint32 {
	start := pp - p + 1
	if k&1 != 0 {
		start = 32 - p
	}
	return bextr32(k, start, p)
}
// encodeHash packs the 64-bit hash x into a 32-bit key for the sparse
// representation.
//
// idx is the pp-bit field of x starting at bit 64-pp. Two layouts are
// produced, distinguished by the lowest bit of the result:
//
//   - If the pp-p bit field of x starting at bit 64-pp is non-zero, the key is
//     idx<<1 (lowest bit clear).
//   - Otherwise the key is idx<<7 | rank<<1 | 1, where rank is one more than
//     the number of leading zeros found in the remaining low 64-pp bits of x
//     after they are shifted up by pp and padded below with pp one-bits.
//
// bextr is defined elsewhere in the package (presumably a "value, start bit,
// length" bit-field extract; confirm against its definition).
func encodeHash(x uint64, p, pp uint8) uint32 {
	idx := uint32(bextr(x, 64-pp, pp))
	if bextr(x, 64-pp, pp-p) != 0 {
		return idx << 1
	}

	// Move the low 64-pp bits to the top of the word and fill the vacated
	// low bits with ones so the leading-zero count is bounded by 64-pp.
	tail := bextr(x, 0, 64-pp) << pp
	fill := uint64(1)<<pp - 1
	rank := bits.LeadingZeros64(tail|fill) + 1
	return idx<<7 | uint32(rank<<1) | 1
}
// Decode a hash from the sparse representation.
func decodeHash(k uint32, p, pp uint8) (uint32, uint8) {
var r uint8
if k&1 == 1 {
r = uint8(bextr32(k, 1, 6)) + pp - p
} else {
// We can use the 64bit clz implementation and reduce the result
// by 32 to get a clz for a 32bit word.
r = uint8(bits.LeadingZeros64(uint64(k<<(32-pp+p-1))) - 31) // -32 + 1
}
return getIndex(k, p, pp), r
}
// set is a collection of uint32 values backed by an intmap.Set.
type set struct {
	// m holds the elements. The methods on set dereference it without a nil
	// check, so it must be non-nil before any of them is called.
	m *intmap.Set[uint32]
}
// newSet returns an empty set whose backing intmap.Set is created with the
// given size argument.
func newSet(size int) *set {
	s := new(set)
	s.m = intmap.NewSet[uint32](size)
	return s
}
// ForEach calls fn once for every element the backing set yields. The
// iteration is never stopped early: the adapter always reports true to the
// underlying ForEach.
func (s *set) ForEach(fn func(v uint32)) {
	visit := func(elem uint32) bool {
		fn(elem)
		return true
	}
	s.m.ForEach(visit)
}
// Merge adds every element of other to s. other is only iterated here; no
// mutating method is called on it.
func (s *set) Merge(other *set) {
	insert := func(elem uint32) bool {
		s.m.Add(elem)
		return true
	}
	other.m.ForEach(insert)
}
// Len reports the element count of the backing set.
func (s *set) Len() int {
	count := s.m.Len()
	return count
}
// add inserts v into the set. It reports true when v was not already present
// (and has now been added) and false when v was already a member.
func (s *set) add(v uint32) bool {
	present := s.m.Has(v)
	if !present {
		s.m.Add(v)
	}
	return !present
}
// Clone returns a new set holding the same elements as s, backed by its own
// intmap.Set. A nil receiver yields nil.
func (s *set) Clone() *set {
	if s == nil {
		return nil
	}

	// Size the copy from the current element count, then copy element by
	// element.
	dup := intmap.NewSet[uint32](s.m.Len())
	copyElem := func(elem uint32) bool {
		dup.Add(elem)
		return true
	}
	s.m.ForEach(copyElem)

	return &set{m: dup}
}
// MarshalBinary serialises the set as a 4-byte big-endian element count
// followed by each element as a 4-byte big-endian value, in the order the
// backing set's ForEach yields them. The returned error is always nil.
func (s *set) MarshalBinary() (data []byte, err error) {
	n := s.m.Len()

	// Reserve the exact output size up front: a 4-byte header plus 4 bytes
	// per element.
	data = make([]byte, 0, 4+(4*n))

	// Header: only the low 32 bits of the element count are written.
	data = append(data, byte(n>>24), byte(n>>16), byte(n>>8), byte(n))

	// Payload: one big-endian word per element.
	s.m.ForEach(func(k uint32) bool {
		data = append(data, byte(k>>24), byte(k>>16), byte(k>>8), byte(k))
		return true
	})

	return data, nil
}
// uint64Slice is a slice of uint32 values ordered ascending by its Less
// method. Despite the name, the element type is uint32. Its Len, Less and
// Swap methods have the shape required by sort.Interface.
type uint64Slice []uint32

// Len reports the number of elements in the slice.
func (s uint64Slice) Len() int {
	return len(s)
}

// Less reports whether the element at index i is strictly smaller than the
// element at index j.
func (s uint64Slice) Less(i, j int) bool {
	return s[j] > s[i]
}

// Swap exchanges the elements at indices i and j.
func (s uint64Slice) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}