Skip to content

Commit 7ac476f

Browse files
authored
Merge pull request #59 from github/aneubeck/config
Make features configurable
2 parents e9cc593 + c429039 commit 7ac476f

File tree

7 files changed

+358
-189
lines changed

7 files changed

+358
-189
lines changed

.cargo/config.toml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Consider adding "--codegen=link-args=-Wl,--compress-debug-sections=zlib"
2+
3+
[target.x86_64-unknown-linux-gnu]
4+
# SSE3 is required by simd-varint.
5+
# POPCNT makes `count_ones` (which we use in geofilter and bitrank) more efficient.
6+
rustflags = ["-C", "target-feature=+ssse3,+avx2,+popcnt"]
7+
8+
[target.x86_64-apple-darwin]
9+
# SSE3 is required by simd-varint.
10+
# POPCNT makes `count_ones` (which we use in geofilter and bitrank) more efficient.
11+
rustflags = ["-C", "target-feature=+ssse3,+avx2,+popcnt"]
12+
13+
[target.aarch64-apple-darwin]
14+
rustflags = ["-C", "target-feature=+neon"]

crates/string-offsets/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "string-offsets"
33
authors = ["The blackbird team <support@github.com>"]
4-
version = "0.1.0"
4+
version = "0.2.0"
55
edition = "2021"
66
description = "Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines."
77
repository = "https://github.com/github/rust-gems"
Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,38 @@
11
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
22
use rand::{rng, Rng};
3-
use string_offsets::StringOffsets;
3+
use string_offsets::{AllConfig, OnlyLines, StringOffsets};
44

5-
fn construction_benchmark(c: &mut Criterion) {
6-
let mut group = c.benchmark_group("construction");
5+
fn only_lines_construction_benchmark(c: &mut Criterion) {
6+
let mut group = c.benchmark_group("only_lines_construction");
77
for size in [1000, 10000, 100000] {
88
let mut rng = rng();
99
// Generate random ascii input.
1010
let random_input: String = (0..size)
11-
.map(|_| rng.random_range(32u8..128) as char)
11+
.map(|_| rng.random_range(32u8..128u8) as char)
1212
.collect();
1313
group.throughput(criterion::Throughput::Bytes(random_input.len() as u64));
1414
group.bench_with_input(
1515
BenchmarkId::from_parameter(size),
1616
&random_input,
17-
|b, input| b.iter(|| black_box(StringOffsets::new(input))),
17+
|b, input| b.iter(|| black_box(StringOffsets::<OnlyLines>::new(input))),
18+
);
19+
}
20+
group.finish();
21+
}
22+
23+
fn full_construction_benchmark(c: &mut Criterion) {
24+
let mut group = c.benchmark_group("full_construction");
25+
for size in [1000, 10000, 100000] {
26+
let mut rng = rng();
27+
// Generate random ascii input.
28+
let random_input: String = (0..size)
29+
.map(|_| rng.random_range(32u8..128u8) as char)
30+
.collect();
31+
group.throughput(criterion::Throughput::Bytes(random_input.len() as u64));
32+
group.bench_with_input(
33+
BenchmarkId::from_parameter(size),
34+
&random_input,
35+
|b, input| b.iter(|| black_box(StringOffsets::<AllConfig>::new(input))),
1836
);
1937
}
2038
group.finish();
@@ -23,6 +41,6 @@ fn construction_benchmark(c: &mut Criterion) {
2341
criterion_group!(
2442
name = benches;
2543
config = Criterion::default();
26-
targets = construction_benchmark
44+
targets = only_lines_construction_benchmark, full_construction_benchmark
2745
);
2846
criterion_main!(benches);

crates/string-offsets/src/bitrank.rs

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ struct Block {
4040

4141
impl Block {
4242
/// Set a bit without updating `self.sub_blocks`.
43-
///
44-
/// This panics if the bit was already set, because that indicates that the original positions
45-
/// list is invalid/had duplicates.
4643
fn set(&mut self, index: usize) {
4744
debug_assert!(index < BITS_PER_BLOCK);
4845
let chunk_idx = index / BITS_PER_SUB_BLOCK;
@@ -52,11 +49,7 @@ impl Block {
5249
self.bits[chunk_idx] |= mask;
5350
}
5451

55-
/// The **total rank** of the block relative local index, and the index of the one
56-
/// bit that establishes that rank (aka "select") **if** it occurs within that same
57-
/// chunk, otherwise ['None']. The assumption is that if you would have to look back
58-
/// through previous chunks it would actually be cheaper to do a lookup in the original
59-
/// data structure that the bit vector was created from.
52+
/// The **total rank** of the block-relative local index.
6053
fn rank(&self, local_idx: usize) -> usize {
6154
let mut rank = self.rank as usize;
6255
let sub_block = local_idx / BITS_PER_SUB_BLOCK;
@@ -65,11 +58,7 @@ impl Block {
6558
let remainder = local_idx % BITS_PER_SUB_BLOCK;
6659

6760
let last_chunk = local_idx / BITS_PER_SUB_BLOCK;
68-
let masked = if remainder == 0 {
69-
0
70-
} else {
71-
self.bits[last_chunk] << (BITS_PER_SUB_BLOCK - remainder)
72-
};
61+
let masked = self.bits[last_chunk] & !(SubblockBits::MAX << remainder);
7362
rank + masked.count_ones() as usize
7463
}
7564

@@ -176,42 +165,52 @@ mod tests {
176165

177166
/// Creates a `BitRank` containing the integers in `iter` (which should be strictly
178167
/// increasing).
179-
pub fn bitrank<I: IntoIterator<Item = usize>>(capacity: usize, iter: I) -> BitRank {
180-
let mut builder = BitRankBuilder::with_capacity(capacity);
181-
for position in iter {
182-
builder.push(position);
168+
pub fn bitrank<I>(iter: I) -> BitRank
169+
where
170+
I: IntoIterator<Item = usize>,
171+
I::IntoIter: DoubleEndedIterator,
172+
{
173+
let mut iter = iter.into_iter().rev();
174+
if let Some(last) = iter.next() {
175+
let mut builder = BitRankBuilder::with_capacity(last + 1);
176+
builder.push(last);
177+
for position in iter {
178+
builder.push(position);
179+
}
180+
builder.finish()
181+
} else {
182+
BitRank { blocks: vec![] }
183183
}
184-
builder.finish()
185184
}
186185

187186
#[test]
188187
fn test_rank_zero() {
189-
let br = bitrank(1, [0]);
188+
let br = bitrank([0]);
190189
assert_eq!(br.rank(0), 0);
191190
assert_eq!(br.rank(1), 1);
192191
}
193192

194193
#[test]
195194
fn test_empty() {
196-
let br = bitrank(0, []);
195+
let br = bitrank([]);
197196
assert!(br.blocks.is_empty());
198197
}
199198

200199
#[test]
201200
fn test_index_out_of_bounds() {
202-
let br = bitrank(BITS_PER_BLOCK, [BITS_PER_BLOCK - 1]);
201+
let br = bitrank([BITS_PER_BLOCK - 1]);
203202
assert_eq!(br.rank(BITS_PER_BLOCK), 1);
204203
}
205204

206205
#[test]
207206
#[should_panic]
208207
fn test_duplicate_position() {
209-
bitrank(91, [64, 66, 68, 68, 90]);
208+
bitrank([64, 66, 68, 68, 90]);
210209
}
211210

212211
#[test]
213212
fn test_rank_exclusive() {
214-
let br = bitrank(133, 0..132);
213+
let br = bitrank(0..132);
215214
assert_eq!(br.blocks.len(), 1);
216215
assert_eq!(br.rank(64), 64);
217216
assert_eq!(br.rank(132), 132);
@@ -221,37 +220,37 @@ mod tests {
221220
fn test_rank() {
222221
let mut positions: Vec<usize> = (0..132).collect();
223222
positions.append(&mut vec![138usize, 140, 146]);
224-
let br = bitrank(146, positions);
223+
let br = bitrank(positions);
225224
assert_eq!(br.rank(135), 132);
226225

227-
let br2 = bitrank(BITS_PER_BLOCK, 0..BITS_PER_BLOCK - 5);
226+
let br2 = bitrank(0..BITS_PER_BLOCK - 5);
228227
assert_eq!(br2.rank(169), 169);
229228

230-
let br3 = bitrank(BITS_PER_BLOCK + 6, 0..BITS_PER_BLOCK + 5);
229+
let br3 = bitrank(0..BITS_PER_BLOCK + 5);
231230
assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK);
232231
}
233232

234233
#[test]
235234
fn test_rank_idx() {
236235
let mut positions: Vec<usize> = (0..132).collect();
237236
positions.append(&mut vec![138usize, 140, 146]);
238-
let br = bitrank(147, positions);
237+
let br = bitrank(positions);
239238
assert_eq!(br.rank(135), 132);
240239

241240
let bits2: Vec<usize> = (0..BITS_PER_BLOCK - 5).collect();
242-
let br2 = bitrank(BITS_PER_BLOCK, bits2);
241+
let br2 = bitrank(bits2);
243242
assert_eq!(br2.rank(169), 169);
244243

245244
let bits3: Vec<usize> = (0..BITS_PER_BLOCK + 5).collect();
246-
let br3 = bitrank(BITS_PER_BLOCK + 6, bits3);
245+
let br3 = bitrank(bits3);
247246
assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK);
248247

249248
let bits4: Vec<usize> = vec![1, 1000, 7777, BITS_PER_BLOCK + 1];
250-
let br4 = bitrank(BITS_PER_BLOCK + 1, bits4);
249+
let br4 = bitrank(bits4);
251250
assert_eq!(br4.rank(8000), 3);
252251

253252
let bits5: Vec<usize> = vec![1, 1000, 7777, BITS_PER_BLOCK + 1];
254-
let br5 = bitrank(BITS_PER_BLOCK + 1, bits5);
253+
let br5 = bitrank(bits5);
255254
assert_eq!(br5.rank(BITS_PER_BLOCK), 3);
256255
}
257256

@@ -267,7 +266,7 @@ mod tests {
267266
// This isn't strictly necessary, given that the bit would just be toggled again, but it
268267
// ensures that we are meeting the contract.
269268
random_bits.dedup();
270-
let br = bitrank(1_000_000, random_bits.iter().copied());
269+
let br = bitrank(random_bits.iter().copied());
271270
let mut rank = 0;
272271
for i in 0..random_bits.capacity() {
273272
assert_eq!(br.rank(i), rank);
@@ -282,7 +281,7 @@ mod tests {
282281
#[test]
283282
fn test_rank_out_of_bounds() {
284283
for i in 1..30 {
285-
let br = bitrank(BITS_PER_BLOCK * i, [BITS_PER_BLOCK * i - 1]);
284+
let br = bitrank([BITS_PER_BLOCK * i - 1]);
286285
assert_eq!(br.max_rank(), 1);
287286
assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0);
288287
for j in 0..10 {
@@ -293,10 +292,7 @@ mod tests {
293292

294293
#[test]
295294
fn test_large_gap() {
296-
let br = bitrank(
297-
BITS_PER_BLOCK * 16,
298-
(3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17),
299-
);
295+
let br = bitrank((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17));
300296
for i in 1..15 {
301297
assert_eq!(br.rank(BITS_PER_BLOCK * i), 1);
302298
}

crates/string-offsets/src/config.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
//! Configuration types for enabling/disabling features at compile time.
2+
//!
3+
//! By disabling features, the compiler can generate faster code which can be important for certain use cases.
4+
//! Certain implementations/conversion operations will only be available if the corresponding features were enabled.
5+
6+
/// Type-level boolean.
7+
pub trait Bool {
8+
/// The value of the boolean.
9+
const VALUE: bool;
10+
}
11+
/// Type-level true.
12+
pub struct True {}
13+
/// Type-level false.
14+
pub struct False {}
15+
impl Bool for True {
16+
const VALUE: bool = true;
17+
}
18+
impl Bool for False {
19+
const VALUE: bool = false;
20+
}
21+
22+
/// Configures which features should be enabled for a [`StringOffsets`] instance.
23+
pub trait ConfigType {
24+
/// Whether to enable character conversions.
25+
type HasChars: Bool;
26+
/// Whether to enable UTF-16 conversions.
27+
type HasUtf16: Bool;
28+
/// Whether to enable line conversions.
29+
type HasLines: Bool;
30+
/// Whether to enable whitespace checks.
31+
type HasWhitespace: Bool;
32+
}
33+
34+
/// Configuration type that enables all features.
35+
pub struct AllConfig {}
36+
impl ConfigType for AllConfig {
37+
type HasChars = True;
38+
type HasUtf16 = True;
39+
type HasLines = True;
40+
type HasWhitespace = True;
41+
}
42+
43+
/// Configuration type that only enables line conversions.
44+
pub struct OnlyLines {}
45+
impl ConfigType for OnlyLines {
46+
type HasChars = False;
47+
type HasUtf16 = False;
48+
type HasLines = True;
49+
type HasWhitespace = False;
50+
}

0 commit comments

Comments
 (0)