Skip to content

Commit c507fa4

Browse files
authored
chore(core): Refactor SmallString (#6138)
1 parent f31f3c9 commit c507fa4

File tree

3 files changed

+72
-134
lines changed

3 files changed

+72
-134
lines changed

src/core/compact_object.cc

Lines changed: 28 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,6 @@ struct TL {
380380

381381
thread_local TL tl;
382382

383-
constexpr bool kUseSmallStrings = true;
384383
constexpr bool kUseAsciiEncoding = true;
385384

386385
} // namespace
@@ -1148,40 +1147,32 @@ void CompactObj::GetString(char* dest) const {
11481147
}
11491148

11501149
if (mask_bits_.encoding) {
1150+
StrEncoding str_encoding = GetStrEncoding();
1151+
string_view decode_blob;
1152+
11511153
if (taglen_ == ROBJ_TAG) {
11521154
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
11531155
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
1154-
string_view blob{(const char*)u_.r_obj.inner_obj(), u_.r_obj.Size()};
1155-
GetStrEncoding().Decode(blob, dest);
1156-
return;
1156+
decode_blob = {(const char*)u_.r_obj.inner_obj(), u_.r_obj.Size()};
11571157
} else {
11581158
CHECK_EQ(SMALL_TAG, taglen_);
1159-
string_view slices[2];
1160-
unsigned num = u_.small_str.GetV(slices);
1161-
DCHECK_EQ(2u, num);
1162-
size_t decoded_len = GetStrEncoding().DecodedSize(u_.small_str.size(), slices[0][0]);
1163-
1164-
if (mask_bits_.encoding == HUFFMAN_ENC) {
1165-
tl.tmp_buf.resize(slices[0].size() + slices[1].size() - 1);
1166-
uint8_t* next = tl.tmp_buf.data();
1167-
memcpy(next, slices[0].data() + 1, slices[0].size() - 1);
1168-
next += slices[0].size() - 1;
1169-
memcpy(next, slices[1].data(), slices[1].size());
1170-
string_view src(reinterpret_cast<const char*>(tl.tmp_buf.data()), tl.tmp_buf.size());
1171-
const auto& decoder = tl.GetHuffmanDecoder(huffman_domain_);
1172-
CHECK(decoder.Decode(src, decoded_len, dest));
1173-
return;
1174-
}
1159+
auto& ss = u_.small_str;
11751160

1176-
// we left some space on the left to allow inplace ascii unpacking.
1177-
size_t space_left = decoded_len - u_.small_str.size();
1161+
char* copy_dest;
1162+
if (str_encoding.enc_ == HUFFMAN_ENC) {
1163+
tl.tmp_buf.resize(ss.size());
1164+
copy_dest = reinterpret_cast<char*>(tl.tmp_buf.data());
1165+
} else {
1166+
// Copy into the rightmost part of the dest buffer, leaving room on the left for in-place unpacking
1167+
size_t decoded_len = str_encoding.DecodedSize(ss.size(), ss.first_byte());
1168+
copy_dest = dest + (decoded_len - ss.size());
1169+
}
11781170

1179-
char* next = dest + space_left;
1180-
memcpy(next, slices[0].data(), slices[0].size());
1181-
next += slices[0].size();
1182-
memcpy(next, slices[1].data(), slices[1].size());
1183-
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
1171+
ss.Get(copy_dest);
1172+
decode_blob = {copy_dest, ss.size()};
11841173
}
1174+
1175+
str_encoding.Decode(decode_blob, dest);
11851176
return;
11861177
}
11871178

@@ -1193,15 +1184,8 @@ void CompactObj::GetString(char* dest) const {
11931184
return;
11941185
}
11951186

1196-
if (taglen_ == SMALL_TAG) {
1197-
string_view slices[2];
1198-
unsigned num = u_.small_str.GetV(slices);
1199-
DCHECK_EQ(2u, num);
1200-
memcpy(dest, slices[0].data(), slices[0].size());
1201-
dest += slices[0].size();
1202-
memcpy(dest, slices[1].data(), slices[1].size());
1203-
return;
1204-
}
1187+
if (taglen_ == SMALL_TAG)
1188+
return u_.small_str.Get(dest);
12051189

12061190
LOG(FATAL) << "Bad tag " << int(taglen_);
12071191
}
@@ -1267,7 +1251,7 @@ void CompactObj::Materialize(std::string_view blob, bool is_raw) {
12671251
DCHECK_GT(blob.size(), kInlineLen); // There are no mutable commands that shrink strings
12681252

12691253
if (is_raw) {
1270-
if (kUseSmallStrings && SmallString::CanAllocate(blob.size())) {
1254+
if (SmallString::CanAllocate(blob.size())) {
12711255
SetMeta(SMALL_TAG, mask_);
12721256
tl.small_str_bytes += u_.small_str.Assign(blob);
12731257
} else {
@@ -1481,8 +1465,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
14811465
DCHECK_GT(sv.size(), 16u); // we would not be in SMALL_TAG, otherwise.
14821466

14831467
string_view slice[2];
1484-
unsigned num = u_.small_str.GetV(slice);
1485-
DCHECK_EQ(2u, num);
1468+
u_.small_str.Get(slice);
14861469
DCHECK_LT(slice[0].size(), 14u);
14871470

14881471
uint8_t tmpbuf[14];
@@ -1591,18 +1574,14 @@ void CompactObj::EncodeString(string_view str, bool is_key) {
15911574

15921575
DCHECK_GT(encoded.size(), kInlineLen);
15931576

1594-
if (kUseSmallStrings && SmallString::CanAllocate(encoded.size())) {
1595-
if (taglen_ == 0) {
1577+
if (SmallString::CanAllocate(encoded.size())) {
1578+
if (taglen_ == SMALL_TAG)
1579+
tl.small_str_bytes -= u_.small_str.MallocUsed();
1580+
else
15961581
SetMeta(SMALL_TAG, mask_);
1597-
tl.small_str_bytes += u_.small_str.Assign(encoded);
1598-
return;
1599-
}
16001582

1601-
if (taglen_ == SMALL_TAG && encoded.size() <= u_.small_str.size()) {
1602-
tl.small_str_bytes -= u_.small_str.MallocUsed();
1603-
tl.small_str_bytes += u_.small_str.Assign(encoded);
1604-
return;
1605-
}
1583+
tl.small_str_bytes += u_.small_str.Assign(encoded);
1584+
return;
16061585
}
16071586

16081587
SetMeta(ROBJ_TAG, mask_);

src/core/small_string.cc

Lines changed: 35 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -54,55 +54,39 @@ size_t SmallString::UsedThreadLocal() {
5454

5555
static_assert(sizeof(SmallString) == 16);
5656

57-
// we should use only for sizes greater than kPrefLen
5857
size_t SmallString::Assign(std::string_view s) {
5958
DCHECK_GT(s.size(), kPrefLen);
60-
59+
DCHECK(CanAllocate(s.size()));
6160
uint8_t* realptr = nullptr;
6261

63-
if (size_ == 0) {
64-
// packed structs can not be tied here.
65-
auto [sp, rp] = tl.seg_alloc->Allocate(s.size() - kPrefLen);
62+
// Reallocate if the current allocation is too small or more than twice as large as needed
63+
size_t heap_len = s.size() - kPrefLen;
64+
if (size_t available = MallocUsed(); available < heap_len || heap_len * 2 < available) {
65+
Free();
66+
67+
auto [sp, rp] = tl.seg_alloc->Allocate(heap_len);
6668
small_ptr_ = sp;
6769
realptr = rp;
68-
size_ = s.size();
69-
} else if (s.size() <= size_) {
70-
realptr = tl.seg_alloc->Translate(small_ptr_);
71-
72-
if (s.size() < size_) {
73-
size_t capacity = mi_usable_size(realptr);
74-
if (s.size() * 2 < capacity) {
75-
tl.seg_alloc->Free(small_ptr_);
76-
auto [sp, rp] = tl.seg_alloc->Allocate(s.size() - kPrefLen);
77-
small_ptr_ = sp;
78-
realptr = rp;
79-
}
80-
size_ = s.size();
81-
}
8270
} else {
83-
LOG(FATAL) << "TBD: Bad usage";
71+
realptr = tl.seg_alloc->Translate(small_ptr_);
8472
}
8573

74+
size_ = s.size();
8675
memcpy(prefix_, s.data(), kPrefLen);
87-
memcpy(realptr, s.data() + kPrefLen, s.size() - kPrefLen);
88-
76+
memcpy(realptr, s.data() + kPrefLen, heap_len);
8977
return mi_malloc_usable_size(realptr);
9078
}
9179

9280
void SmallString::Free() {
93-
if (size_ <= kPrefLen)
94-
return;
95-
96-
tl.seg_alloc->Free(small_ptr_);
81+
if (size_)
82+
tl.seg_alloc->Free(small_ptr_);
9783
size_ = 0;
9884
}
9985

10086
uint16_t SmallString::MallocUsed() const {
101-
if (size_ <= kPrefLen)
102-
return 0;
103-
auto* realptr = tl.seg_alloc->Translate(small_ptr_);
104-
105-
return mi_malloc_usable_size(realptr);
87+
if (size_)
88+
return mi_malloc_usable_size(tl.seg_alloc->Translate(small_ptr_));
89+
return 0;
10690
}
10791

10892
bool SmallString::Equal(std::string_view o) const {
@@ -112,13 +96,10 @@ bool SmallString::Equal(std::string_view o) const {
11296
if (size_ == 0)
11397
return true;
11498

115-
DCHECK_GT(size_, kPrefLen);
116-
11799
if (memcmp(prefix_, o.data(), kPrefLen) != 0)
118100
return false;
119101

120102
uint8_t* realp = tl.seg_alloc->Translate(small_ptr_);
121-
122103
return memcmp(realp, o.data() + kPrefLen, size_ - kPrefLen) == 0;
123104
}
124105

@@ -127,21 +108,16 @@ bool SmallString::Equal(const SmallString& os) const {
127108
return false;
128109

129110
string_view me[2], other[2];
130-
unsigned n1 = GetV(me);
131-
unsigned n2 = os.GetV(other);
132-
133-
if (n1 != n2)
134-
return false;
111+
Get(me);
112+
os.Get(other);
135113

136114
return me[0] == other[0] && me[1] == other[1];
137115
}
138116

139117
uint64_t SmallString::HashCode() const {
140-
DCHECK_GT(size_, kPrefLen);
141-
142118
string_view slice[2];
119+
Get(slice);
143120

144-
GetV(slice);
145121
XXH3_state_t* state = tl.xxh_state.get();
146122
XXH3_64bits_reset_withSeed(state, kHashSeed);
147123
XXH3_64bits_update(state, slice[0].data(), slice[0].size());
@@ -150,41 +126,35 @@ uint64_t SmallString::HashCode() const {
150126
return XXH3_64bits_digest(state);
151127
}
152128

153-
void SmallString::Get(std::string* dest) const {
154-
dest->resize(size_);
155-
if (size_) {
156-
DCHECK_GT(size_, kPrefLen);
157-
memcpy(dest->data(), prefix_, kPrefLen);
158-
uint8_t* ptr = tl.seg_alloc->Translate(small_ptr_);
159-
memcpy(dest->data() + kPrefLen, ptr, size_ - kPrefLen);
160-
}
161-
}
162-
163-
unsigned SmallString::GetV(string_view dest[2]) const {
164-
DCHECK_GT(size_, kPrefLen);
165-
if (size_ <= kPrefLen) {
166-
dest[0] = string_view{prefix_, size_};
167-
return 1;
168-
}
129+
void SmallString::Get(string_view dest[2]) const {
130+
DCHECK(size_);
169131

170132
dest[0] = string_view{prefix_, kPrefLen};
171133
uint8_t* ptr = tl.seg_alloc->Translate(small_ptr_);
172134
dest[1] = string_view{reinterpret_cast<char*>(ptr), size_ - kPrefLen};
173-
return 2;
174135
}
175136

176-
bool SmallString::DefragIfNeeded(PageUsage* page_usage) {
177-
DCHECK_GT(size_, kPrefLen);
178-
if (size_ <= kPrefLen) {
179-
return false;
180-
}
137+
void SmallString::Get(char* out) const {
138+
string_view strs[2];
139+
Get(strs);
140+
memcpy(out, strs[0].data(), strs[0].size());
141+
memcpy(out + strs[0].size(), strs[1].data(), strs[1].size());
142+
}
181143

144+
void SmallString::Get(std::string* dest) const {
145+
dest->resize(size_);
146+
Get(dest->data());
147+
}
148+
149+
bool SmallString::DefragIfNeeded(PageUsage* page_usage) {
182150
uint8_t* cur_real_ptr = tl.seg_alloc->Translate(small_ptr_);
183151
if (!page_usage->IsPageForObjectUnderUtilized(tl.seg_alloc->heap(), cur_real_ptr))
184152
return false;
185153

186-
auto [sp, rp] = tl.seg_alloc->Allocate(size_ - kPrefLen);
154+
if (!CanAllocate(size_ - kPrefLen)) // Forced
155+
return false;
187156

157+
auto [sp, rp] = tl.seg_alloc->Allocate(size_ - kPrefLen);
188158
memcpy(rp, cur_real_ptr, size_ - kPrefLen);
189159
tl.seg_alloc->Free(small_ptr_);
190160
small_ptr_ = sp;

src/core/small_string.h

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,8 @@ namespace dfly {
1010

1111
class PageUsage;
1212

13-
// blob strings of upto ~256B. Small sizes are probably predominant
14-
// for in-memory workloads, especially for keys.
15-
// Please note that this class does not have automatic constructors and destructors, therefore
16-
// it requires explicit management.
13+
// Efficient storage of strings longer than 10 bytes.
14+
// Requires explicit memory management: call Free() to release the heap-allocated part.
1715
class SmallString {
1816
static constexpr unsigned kPrefLen = 10;
1917
static constexpr unsigned kMaxSize = (1 << 8) - 1;
@@ -23,41 +21,32 @@ class SmallString {
2321
static size_t UsedThreadLocal();
2422
static bool CanAllocate(size_t size);
2523

26-
void Reset() {
27-
size_ = 0;
28-
}
29-
3024
// Returns the usable size of the heap allocation that backs the string.
3125
size_t Assign(std::string_view s);
3226
void Free();
3327

3428
bool Equal(std::string_view o) const;
3529
bool Equal(const SmallString& mps) const;
3630

37-
uint16_t size() const {
38-
return size_;
39-
}
40-
4131
uint64_t HashCode() const;
42-
43-
// I am lying here. we should use mi_malloc_usable size really.
4432
uint16_t MallocUsed() const;
4533

34+
void Get(std::string_view dest[2]) const;
35+
void Get(char* out) const;
4636
void Get(std::string* dest) const;
4737

48-
// returns 1 or 2 slices representing this small string.
49-
// Guarantees zero copy, i.e. dest will not point to any of external buffers.
50-
// With current implementation, it will return 2 slices for a non-empty string.
51-
unsigned GetV(std::string_view dest[2]) const;
52-
5338
bool DefragIfNeeded(PageUsage* page_usage);
5439

40+
size_t size() const {
41+
return size_;
42+
}
43+
5544
uint8_t first_byte() const {
5645
return prefix_[0];
5746
}
5847

5948
private:
60-
// prefix of the string that is broken down into 2 parts.
49+
// The string is stored in two parts; this array holds the first kPrefLen bytes.
6150
char prefix_[kPrefLen];
6251

6352
uint32_t small_ptr_; // 32GB capacity because we ignore 3 lsb bits (i.e. x8).

0 commit comments

Comments
 (0)