Closed as not planned
Description
What version of regex are you using?
1.11.1
Describe the bug at a high level.
While RegexBuilder::size_limit()
only sets an approximation for the size limit, the approximation can be quite inaccurate, in some cases by a factor of ~3.
What are the steps to reproduce the behavior?
The following (admittedly hacky) benchmark shows the difference between the minimal size limit (within a margin of error) required to build a regular expression and the actual heap usage:
use std::fs::read_to_string;
use {regex::RegexBuilder, tikv_jemallocator::Jemalloc};
#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;
const MIN_WORDS: usize = 1024;
const MAX_WORDS: usize = 16384;
const WORDS_STEP: usize = 16;
const MAX_REGEX_SIZE: usize = 8 * 1024 * 1024;
const REGEX_SIZE_MARGIN: usize = 4096;
fn estimate_heap_size<T>(thing: T) -> usize {
let epoch = tikv_jemalloc_ctl::epoch::mib().unwrap();
let allocated = tikv_jemalloc_ctl::stats::allocated::mib().unwrap();
epoch.advance().unwrap();
let with = allocated.read().unwrap();
drop(thing);
epoch.advance().unwrap();
let without = allocated.read().unwrap();
with.saturating_sub(without)
}
fn benchmark_regex_size(pattern: &str) -> Option<(usize, usize)> {
let mut lower = 0;
let mut upper = MAX_REGEX_SIZE;
loop {
let limit = lower + (upper - lower).div_ceil(2);
if limit == MAX_REGEX_SIZE {
return None;
}
if let Ok(re) = RegexBuilder::new(pattern).size_limit(limit).build() {
if upper - lower <= REGEX_SIZE_MARGIN {
let heap_size = estimate_heap_size(re);
return Some((limit, heap_size));
}
upper = limit;
} else {
lower = limit;
}
}
}
fn main() {
let words = read_to_string("/usr/share/dict/words").unwrap();
println!("No. Words, Size Limit, Heap Size");
for i in (MIN_WORDS..=MAX_WORDS).step_by(WORDS_STEP) {
print!("{i}, ");
let regex = words.lines().take(i).collect::<Vec<_>>().join("|");
if let Some((regex_size, heap_size)) = benchmark_regex_size(®ex) {
println!("{regex_size}, {heap_size}");
} else {
println!("-, -");
}
}
}
It relies on the following dependencies:
[dependencies]
regex = "1.11.1"
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tikv-jemallocator = "0.6"
What is the actual behavior?
The following chart shows size limits vs. the actual heap usage:
Metadata
Metadata
Assignees
Labels
No labels