Skip to content

Commit a853ed1

Browse files
committed
chore(core): Refactor SmallString
Signed-off-by: Vladislav Oleshko <[email protected]>
1 parent 54e2de9 commit a853ed1

File tree

3 files changed, with 57 additions and 124 deletions.

src/core/compact_object.cc

Lines changed: 21 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,6 @@ struct TL {
380380

381381
thread_local TL tl;
382382

383-
constexpr bool kUseSmallStrings = true;
384383
constexpr bool kUseAsciiEncoding = true;
385384

386385
} // namespace
@@ -1153,31 +1152,24 @@ void CompactObj::GetString(char* dest) const {
11531152
return;
11541153
} else {
11551154
CHECK_EQ(SMALL_TAG, taglen_);
1156-
string_view slices[2];
1157-
unsigned num = u_.small_str.GetV(slices);
1158-
DCHECK_EQ(2u, num);
1159-
size_t decoded_len = GetStrEncoding().DecodedSize(u_.small_str.size(), slices[0][0]);
1155+
auto& ss = u_.small_str;
1156+
1157+
size_t decoded_len = GetStrEncoding().DecodedSize(ss.size(), ss.first_byte());
11601158

11611159
if (mask_bits_.encoding == HUFFMAN_ENC) {
1162-
tl.tmp_buf.resize(slices[0].size() + slices[1].size() - 1);
1163-
uint8_t* next = tl.tmp_buf.data();
1164-
memcpy(next, slices[0].data() + 1, slices[0].size() - 1);
1165-
next += slices[0].size() - 1;
1166-
memcpy(next, slices[1].data(), slices[1].size());
1167-
string_view src(reinterpret_cast<const char*>(tl.tmp_buf.data()), tl.tmp_buf.size());
1160+
tl.tmp_buf.resize(ss.size());
1161+
auto* base = reinterpret_cast<char*>(tl.tmp_buf.data());
1162+
ss.Get(base);
1163+
11681164
const auto& decoder = tl.GetHuffmanDecoder(huffman_domain_);
1169-
CHECK(decoder.Decode(src, decoded_len, dest));
1165+
CHECK(decoder.Decode({base + 1, ss.size() - 1}, decoded_len, dest)); // skip first char
11701166
return;
11711167
}
11721168

11731169
// we left some space on the left to allow inplace ascii unpacking.
1174-
size_t space_left = decoded_len - u_.small_str.size();
1175-
1176-
char* next = dest + space_left;
1177-
memcpy(next, slices[0].data(), slices[0].size());
1178-
next += slices[0].size();
1179-
memcpy(next, slices[1].data(), slices[1].size());
1180-
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
1170+
char* offset_dest = dest + (decoded_len - u_.small_str.size());
1171+
ss.Get(offset_dest);
1172+
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(offset_dest), decoded_len, dest);
11811173
}
11821174
return;
11831175
}
@@ -1190,15 +1182,8 @@ void CompactObj::GetString(char* dest) const {
11901182
return;
11911183
}
11921184

1193-
if (taglen_ == SMALL_TAG) {
1194-
string_view slices[2];
1195-
unsigned num = u_.small_str.GetV(slices);
1196-
DCHECK_EQ(2u, num);
1197-
memcpy(dest, slices[0].data(), slices[0].size());
1198-
dest += slices[0].size();
1199-
memcpy(dest, slices[1].data(), slices[1].size());
1200-
return;
1201-
}
1185+
if (taglen_ == SMALL_TAG)
1186+
return u_.small_str.Get(dest);
12021187

12031188
LOG(FATAL) << "Bad tag " << int(taglen_);
12041189
}
@@ -1258,7 +1243,7 @@ void CompactObj::Materialize(std::string_view blob, bool is_raw) {
12581243
DCHECK_GT(blob.size(), kInlineLen); // There are no mutable commands that shrink strings
12591244

12601245
if (is_raw) {
1261-
if (kUseSmallStrings && SmallString::CanAllocate(blob.size())) {
1246+
if (SmallString::CanAllocate(blob.size())) {
12621247
SetMeta(SMALL_TAG, mask_);
12631248
tl.small_str_bytes += u_.small_str.Assign(blob);
12641249
} else {
@@ -1471,8 +1456,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
14711456
DCHECK_GT(sv.size(), 16u); // we would not be in SMALL_TAG, otherwise.
14721457

14731458
string_view slice[2];
1474-
unsigned num = u_.small_str.GetV(slice);
1475-
DCHECK_EQ(2u, num);
1459+
u_.small_str.Get(slice);
14761460
DCHECK_LT(slice[0].size(), 14u);
14771461

14781462
uint8_t tmpbuf[14];
@@ -1581,18 +1565,14 @@ void CompactObj::EncodeString(string_view str, bool is_key) {
15811565

15821566
DCHECK_GT(encoded.size(), kInlineLen);
15831567

1584-
if (kUseSmallStrings && SmallString::CanAllocate(encoded.size())) {
1585-
if (taglen_ == 0) {
1568+
if (SmallString::CanAllocate(encoded.size())) {
1569+
if (taglen_ == SMALL_TAG)
1570+
tl.small_str_bytes -= u_.small_str.MallocUsed();
1571+
else
15861572
SetMeta(SMALL_TAG, mask_);
1587-
tl.small_str_bytes += u_.small_str.Assign(encoded);
1588-
return;
1589-
}
15901573

1591-
if (taglen_ == SMALL_TAG && encoded.size() <= u_.small_str.size()) {
1592-
tl.small_str_bytes -= u_.small_str.MallocUsed();
1593-
tl.small_str_bytes += u_.small_str.Assign(encoded);
1594-
return;
1595-
}
1574+
tl.small_str_bytes += u_.small_str.Assign(encoded);
1575+
return;
15961576
}
15971577

15981578
SetMeta(ROBJ_TAG, mask_);

src/core/small_string.cc

Lines changed: 27 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -54,55 +54,33 @@ size_t SmallString::UsedThreadLocal() {
5454

5555
static_assert(sizeof(SmallString) == 16);
5656

57-
// we should use only for sizes greater than kPrefLen
5857
size_t SmallString::Assign(std::string_view s) {
5958
DCHECK_GT(s.size(), kPrefLen);
60-
59+
DCHECK(CanAllocate(s.size()));
6160
uint8_t* realptr = nullptr;
6261

63-
if (size_ == 0) {
64-
// packed structs can not be tied here.
62+
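// Allocate a new block if the string does not shrink or the current block is more than twice the needed size; otherwise the existing block is reused. NOTE(review): on the reuse path realptr is still nullptr in the lines shown here - confirm it gets set via tl.seg_alloc->Translate(small_ptr_) before the memcpy below.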
63+
if (s.size() >= size_ || s.size() * 2 < MallocUsed()) {
64+
if (size_)
65+
Free();
6566
auto [sp, rp] = tl.seg_alloc->Allocate(s.size() - kPrefLen);
6667
small_ptr_ = sp;
6768
realptr = rp;
68-
size_ = s.size();
69-
} else if (s.size() <= size_) {
70-
realptr = tl.seg_alloc->Translate(small_ptr_);
71-
72-
if (s.size() < size_) {
73-
size_t capacity = mi_usable_size(realptr);
74-
if (s.size() * 2 < capacity) {
75-
tl.seg_alloc->Free(small_ptr_);
76-
auto [sp, rp] = tl.seg_alloc->Allocate(s.size() - kPrefLen);
77-
small_ptr_ = sp;
78-
realptr = rp;
79-
}
80-
size_ = s.size();
81-
}
82-
} else {
83-
LOG(FATAL) << "TBD: Bad usage";
8469
}
8570

71+
size_ = s.size();
8672
memcpy(prefix_, s.data(), kPrefLen);
8773
memcpy(realptr, s.data() + kPrefLen, s.size() - kPrefLen);
88-
8974
return mi_malloc_usable_size(realptr);
9075
}
9176

9277
void SmallString::Free() {
93-
if (size_ <= kPrefLen)
94-
return;
95-
9678
tl.seg_alloc->Free(small_ptr_);
9779
size_ = 0;
9880
}
9981

10082
uint16_t SmallString::MallocUsed() const {
101-
if (size_ <= kPrefLen)
102-
return 0;
103-
auto* realptr = tl.seg_alloc->Translate(small_ptr_);
104-
105-
return mi_malloc_usable_size(realptr);
83+
return mi_malloc_usable_size(tl.seg_alloc->Translate(small_ptr_));
10684
}
10785

10886
bool SmallString::Equal(std::string_view o) const {
@@ -112,13 +90,10 @@ bool SmallString::Equal(std::string_view o) const {
11290
if (size_ == 0)
11391
return true;
11492

115-
DCHECK_GT(size_, kPrefLen);
116-
11793
if (memcmp(prefix_, o.data(), kPrefLen) != 0)
11894
return false;
11995

12096
uint8_t* realp = tl.seg_alloc->Translate(small_ptr_);
121-
12297
return memcmp(realp, o.data() + kPrefLen, size_ - kPrefLen) == 0;
12398
}
12499

@@ -127,21 +102,16 @@ bool SmallString::Equal(const SmallString& os) const {
127102
return false;
128103

129104
string_view me[2], other[2];
130-
unsigned n1 = GetV(me);
131-
unsigned n2 = os.GetV(other);
132-
133-
if (n1 != n2)
134-
return false;
105+
Get(me);
106+
os.Get(other);
135107

136108
return me[0] == other[0] && me[1] == other[1];
137109
}
138110

139111
uint64_t SmallString::HashCode() const {
140-
DCHECK_GT(size_, kPrefLen);
141-
142112
string_view slice[2];
113+
Get(slice);
143114

144-
GetV(slice);
145115
XXH3_state_t* state = tl.xxh_state.get();
146116
XXH3_64bits_reset_withSeed(state, kHashSeed);
147117
XXH3_64bits_update(state, slice[0].data(), slice[0].size());
@@ -150,41 +120,35 @@ uint64_t SmallString::HashCode() const {
150120
return XXH3_64bits_digest(state);
151121
}
152122

153-
void SmallString::Get(std::string* dest) const {
154-
dest->resize(size_);
155-
if (size_) {
156-
DCHECK_GT(size_, kPrefLen);
157-
memcpy(dest->data(), prefix_, kPrefLen);
158-
uint8_t* ptr = tl.seg_alloc->Translate(small_ptr_);
159-
memcpy(dest->data() + kPrefLen, ptr, size_ - kPrefLen);
160-
}
161-
}
162-
163-
unsigned SmallString::GetV(string_view dest[2]) const {
164-
DCHECK_GT(size_, kPrefLen);
165-
if (size_ <= kPrefLen) {
166-
dest[0] = string_view{prefix_, size_};
167-
return 1;
168-
}
123+
void SmallString::Get(string_view dest[2]) const {
124+
DCHECK(size_);
169125

170126
dest[0] = string_view{prefix_, kPrefLen};
171127
uint8_t* ptr = tl.seg_alloc->Translate(small_ptr_);
172128
dest[1] = string_view{reinterpret_cast<char*>(ptr), size_ - kPrefLen};
173-
return 2;
174129
}
175130

176-
bool SmallString::DefragIfNeeded(PageUsage* page_usage) {
177-
DCHECK_GT(size_, kPrefLen);
178-
if (size_ <= kPrefLen) {
179-
return false;
180-
}
131+
void SmallString::Get(char* out) const {
132+
string_view strs[2];
133+
Get(strs);
134+
memcpy(out, strs[0].data(), strs[0].size());
135+
memcpy(out + strs[0].size(), strs[1].data(), strs[1].size());
136+
}
137+
138+
void SmallString::Get(std::string* dest) const {
139+
dest->resize(size_);
140+
Get(dest->data());
141+
}
181142

143+
bool SmallString::DefragIfNeeded(PageUsage* page_usage) {
182144
uint8_t* cur_real_ptr = tl.seg_alloc->Translate(small_ptr_);
183145
if (!page_usage->IsPageForObjectUnderUtilized(tl.seg_alloc->heap(), cur_real_ptr))
184146
return false;
185147

186-
auto [sp, rp] = tl.seg_alloc->Allocate(size_ - kPrefLen);
148+
if (!CanAllocate(size_ - kPrefLen)) // Forced
149+
return false;
187150

151+
auto [sp, rp] = tl.seg_alloc->Allocate(size_ - kPrefLen);
188152
memcpy(rp, cur_real_ptr, size_ - kPrefLen);
189153
tl.seg_alloc->Free(small_ptr_);
190154
small_ptr_ = sp;

src/core/small_string.h

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,8 @@ namespace dfly {
1010

1111
class PageUsage;
1212

13-
// blob strings of upto ~256B. Small sizes are probably predominant
14-
// for in-memory workloads, especially for keys.
15-
// Please note that this class does not have automatic constructors and destructors, therefore
16-
// it requires explicit management.
13+
// Efficient storage of strings longer than 10 bytes and up to 256 bytes.
14+
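// Requires explicit memory management: the class has no automatic constructors or destructors, so the allocation must be released with Free().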
1715
class SmallString {
1816
static constexpr unsigned kPrefLen = 10;
1917
static constexpr unsigned kMaxSize = (1 << 8) - 1;
@@ -23,41 +21,32 @@ class SmallString {
2321
static size_t UsedThreadLocal();
2422
static bool CanAllocate(size_t size);
2523

26-
void Reset() {
27-
size_ = 0;
28-
}
29-
3024
// Copies s into this string and returns the usable size of the backing allocation.
3125
size_t Assign(std::string_view s);
3226
void Free();
3327

3428
bool Equal(std::string_view o) const;
3529
bool Equal(const SmallString& mps) const;
3630

37-
uint16_t size() const {
38-
return size_;
39-
}
40-
4131
uint64_t HashCode() const;
42-
43-
// I am lying here. we should use mi_malloc_usable size really.
4432
uint16_t MallocUsed() const;
4533

34+
void Get(std::string_view dest[2]) const;
35+
void Get(char* out) const;
4636
void Get(std::string* dest) const;
4737

48-
// returns 1 or 2 slices representing this small string.
49-
// Guarantees zero copy, i.e. dest will not point to any of external buffers.
50-
// With current implementation, it will return 2 slices for a non-empty string.
51-
unsigned GetV(std::string_view dest[2]) const;
52-
5338
bool DefragIfNeeded(PageUsage* page_usage);
5439

40+
size_t size() const {
41+
return size_;
42+
}
43+
5544
uint8_t first_byte() const {
5645
return prefix_[0];
5746
}
5847

5948
private:
60-
// prefix of the string that is broken down into 2 parts.
49+
// The string is stored in two parts: the first kPrefLen bytes live in this array, the rest in the allocation referenced by small_ptr_.
6150
char prefix_[kPrefLen];
6251

6352
uint32_t small_ptr_; // 32GB capacity because we ignore 3 lsb bits (i.e. x8).

0 commit comments

Comments
 (0)