1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
// To run:
//
// cargo criterion --features criterion/real_blackbox
//
// This benchmarks each of the different libraries at several ratios of ASCII to
// non-ASCII content. There is one additional benchmark labeled "baseline" which
// just iterates over characters in a string, converting UTF-8 to 32-bit chars.
//
// Criterion will show a time in milliseconds. The non-baseline bench functions
// each make one million function calls (2 calls per character, 500K characters
// in the strings created by gen_string). The "time per call" listed in our
// readme is computed by subtracting this baseline from the other bench
// functions' time, then dividing by one million (ms -> ns).
// `bench` below takes its input `String` by value, which this pedantic lint
// would otherwise flag.
#![allow(clippy::needless_pass_by_value)]
// The `fst`, `roaring` and `trie` modules are not local to this file: each one
// is pulled in from the `tests` directory through an explicit `#[path]`.
#[path = "../tests/fst/mod.rs"]
mod fst;
#[path = "../tests/roaring/mod.rs"]
mod roaring;
#[path = "../tests/trie/mod.rs"]
mod trie;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::distributions::{Bernoulli, Distribution, Uniform};
use rand::rngs::SmallRng;
use rand::SeedableRng;
use std::time::Duration;
/// Generates the benchmark input: a string of 500,000 randomly drawn
/// characters.
///
/// `p_nonascii` is the chance, expressed as a ratio out of 100, that any given
/// character is drawn uniformly from `'\u{80}'..=char::MAX` instead of from
/// the ASCII range `'\0'..='\x7f'`.
///
/// The generator is seeded with a constant rather than from entropy.
///
/// # Panics
///
/// Panics if `Bernoulli::from_ratio(p_nonascii, 100)` returns an error
/// (presumably when `p_nonascii` exceeds 100 — see the `rand` documentation).
fn gen_string(p_nonascii: u32) -> String {
    const CHAR_COUNT: usize = 500_000;

    let mut rng = SmallRng::from_seed([b'!'; 32]);
    let wants_nonascii = Bernoulli::from_ratio(p_nonascii, 100).unwrap();
    let ascii_chars = Uniform::new_inclusive('\0', '\x7f');
    let nonascii_chars = Uniform::new_inclusive(0x80 as char, char::MAX);

    (0..CHAR_COUNT)
        .map(|_| {
            // Pick the character class first and only then draw the character,
            // so the generator is consumed in a fixed order per character.
            let source = if wants_nonascii.sample(&mut rng) {
                nonascii_chars
            } else {
                ascii_chars
            };
            source.sample(&mut rng)
        })
        .collect()
}
/// Registers one Criterion benchmark group named `group_name` that runs every
/// candidate implementation over the characters of `string`.
///
/// The group contains:
///
/// * `baseline` — only iterates over the characters, doing no lookups;
/// * `unicode-ident`, `unicode-xid`, `ucd-trie`, `fst`, `roaring` — each
///   performs one XID_Start and one XID_Continue query per character.
///
/// Every result is passed through `black_box` so the optimizer cannot discard
/// the work being measured. Each benchmark is given a 10 second measurement
/// time.
fn bench(c: &mut Criterion, group_name: &str, string: String) {
    let mut group = c.benchmark_group(group_name);
    group.measurement_time(Duration::from_secs(10));

    // Cost of decoding the string alone; the other results are read relative
    // to this one.
    group.bench_function("baseline", |bencher| {
        bencher.iter(|| {
            for ch in string.chars() {
                black_box(ch);
            }
        });
    });

    group.bench_function("unicode-ident", |bencher| {
        bencher.iter(|| {
            for ch in string.chars() {
                black_box(unicode_ident::is_xid_start(ch));
                black_box(unicode_ident::is_xid_continue(ch));
            }
        });
    });

    group.bench_function("unicode-xid", |bencher| {
        bencher.iter(|| {
            for ch in string.chars() {
                black_box(unicode_xid::UnicodeXID::is_xid_start(ch));
                black_box(unicode_xid::UnicodeXID::is_xid_continue(ch));
            }
        });
    });

    group.bench_function("ucd-trie", |bencher| {
        bencher.iter(|| {
            for ch in string.chars() {
                black_box(trie::XID_START.contains_char(ch));
                black_box(trie::XID_CONTINUE.contains_char(ch));
            }
        });
    });

    group.bench_function("fst", |bencher| {
        // Both sets are built once, outside the timed closure.
        let start_set = fst::xid_start_fst();
        let continue_set = fst::xid_continue_fst();
        bencher.iter(|| {
            for ch in string.chars() {
                // The sets are queried with the big-endian bytes of the code point.
                let key = u32::from(ch).to_be_bytes();
                black_box(start_set.contains(key));
                black_box(continue_set.contains(key));
            }
        });
    });

    group.bench_function("roaring", |bencher| {
        // Both bitmaps are built once, outside the timed closure.
        let start_bits = roaring::xid_start_bitmap();
        let continue_bits = roaring::xid_continue_bitmap();
        bencher.iter(|| {
            for ch in string.chars() {
                let code_point = u32::from(ch);
                black_box(start_bits.contains(code_point));
                black_box(continue_bits.contains(code_point));
            }
        });
    });

    group.finish();
}
/// Runs the "0%-nonascii" benchmark group on a string generated with a
/// non-ASCII ratio of 0 out of 100.
fn bench0(c: &mut Criterion) {
    let input = gen_string(0);
    bench(c, "0%-nonascii", input);
}
/// Runs the "1%-nonascii" benchmark group on a string generated with a
/// non-ASCII ratio of 1 out of 100.
fn bench1(c: &mut Criterion) {
    let input = gen_string(1);
    bench(c, "1%-nonascii", input);
}
/// Runs the "10%-nonascii" benchmark group on a string generated with a
/// non-ASCII ratio of 10 out of 100.
fn bench10(c: &mut Criterion) {
    let input = gen_string(10);
    bench(c, "10%-nonascii", input);
}
/// Runs the "100%-nonascii" benchmark group on a string generated with a
/// non-ASCII ratio of 100 out of 100.
fn bench100(c: &mut Criterion) {
    let input = gen_string(100);
    bench(c, "100%-nonascii", input);
}
// Bundle the four ratio-specific benchmark functions into a single Criterion
// group named `benches`.
criterion_group!(benches, bench0, bench1, bench10, bench100);
// Emit the benchmark binary's entry point, which runs the `benches` group.
criterion_main!(benches);
|