Skip to content

Commit

Permalink
min and max freq cutoff
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Jan 28, 2024
1 parent 58ad091 commit f7ea00c
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 32 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@ Search engine written in Rust, based on an inverted index on disk.
**Text preprocessing**
- [x] Tokenization;
- [x] Stemming;
- [ ] Parametrization at build time.

**Index construction**
- [x] In-memory datasets index construction;
- [x] Proper vocabulary and paths on disk;
- [x] Spelling correction index;.
- [x] Spelling correction index;
- [x] Min and max frequency cutoffs.

**Queries**
- [x] BM25 scoring;
- [x] Window computation;
- [x] Query window.

**Evaluation**
- [ ] Query speed;
Expand Down
1 change: 1 addition & 0 deletions search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ indicatif = { version = "0.17.0", features = ["rayon", "improved_unicode"] }
fxhash = "0.2.1"
tempdir = "0.3.7"
regex = "1"
argparse = "0.2.2"
32 changes: 25 additions & 7 deletions search/src/engine/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,30 @@ const PROGRESS_STYLE: &str =
"Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]";
const PROGRESS_CHARS: &str = "=> ";

const CUTOFF_THRESHOLD: f64 = 0.8;

pub fn build_engine(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
let index: InMemory = build_in_memory(input_dir, preprocessor);
pub fn build_engine(
input_path: &str,
output_path: &str,
preprocessor: &Preprocessor,
max_freq_percentage_threshold: f64,
min_freq_threshold: u32,
) {
let index: InMemory = build_in_memory(
input_path,
preprocessor,
max_freq_percentage_threshold,
min_freq_threshold,
);
Postings::write_postings(&index, output_path);
Vocabulary::write_vocabulary(&index, output_path);
Documents::write_documents(&index.documents, output_path);
}

fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory {
fn build_in_memory(
input_dir: &str,
preprocessor: &Preprocessor,
max_freq_percentage_threshold: f64,
min_freq_threshold: u32,
) -> InMemory {
let files: Vec<fs::DirEntry> = fs::read_dir(input_dir)
.expect("error while retrieving input directory content")
.map(std::result::Result::unwrap)
Expand Down Expand Up @@ -98,13 +112,17 @@ fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory {

let final_postings = postings.into_inner().unwrap();

let frequency_threshold = (doc_id_mutex.into_inner().unwrap() as f64 * CUTOFF_THRESHOLD) as u32;
let frequency_threshold =
(doc_id_mutex.into_inner().unwrap() as f64 * max_freq_percentage_threshold) as u32;

let sorted_term_index_map: BTreeMap<String, usize> = term_index_map
.into_inner()
.unwrap()
.into_iter()
.filter(|(_, v)| final_postings[*v].collection_frequency <= frequency_threshold)
.filter(|(_, v)| {
let f = final_postings[*v].collection_frequency;
f <= frequency_threshold && f > min_freq_threshold
})
.collect();

InMemory {
Expand Down
19 changes: 15 additions & 4 deletions search/src/engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENTS_EXTENSION: &str = ".docs";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";

const WINDOW_SCORE_MULTIPLIER: f64 = 0.5;
const WINDOW_SCORE_MULTIPLIER: f64 = 0.0;
const BM25_SCORE_MULTIPLIER: f64 = 1.0;

const BM25_KL: f64 = 1.2;
Expand Down Expand Up @@ -58,8 +58,19 @@ struct DocumentScore {
}

impl Engine {
pub fn build_engine(input_path: &str, output_path: &str) {
builder::build_engine(input_path, output_path, &Preprocessor::new());
pub fn build_engine(
input_path: &str,
output_path: &str,
max_freq_percentage_threshold: f64,
min_freq_threshold: u32,
) {
builder::build_engine(
input_path,
output_path,
&Preprocessor::new(),
max_freq_percentage_threshold,
min_freq_threshold,
);
}

pub fn load_index(input_path: &str) -> Engine {
Expand Down Expand Up @@ -191,7 +202,7 @@ mod test {
fn test_build() {
let index_path = &create_temporary_dir_path();

Engine::build_engine("test_data/docs", index_path);
Engine::build_engine("test_data/docs", index_path, 1.0, 0);

let mut idx = Engine::load_index(index_path);

Expand Down
45 changes: 27 additions & 18 deletions search/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::process::{exit, Command};
use std::time::{Duration, Instant};

const NUM_TOP_RESULTS: usize = 10;
const NUM_RESULTS: usize = 1_000_000;
const NUM_RESULTS: usize = 100;

fn print_results(result: &QueryResult) {
println!("Search tokens: {:?}", result.tokens);
Expand Down Expand Up @@ -57,7 +57,10 @@ fn main() {
let args: Vec<String> = env::args().collect();

if args.len() < 3 || args.len() > 5 {
println!("Usage: cargo run --bin search <base_path> <load_or_build> [build_num_threads]");
println!("Usage: cargo run -r <base_path> <load_or_build> <min_freq (integer)> <max_frequency_perc (float)>
\nExample:
\n\t- cargo run -r path/to/docs build 10 0.90
\n\t- cargo run -r path/to/docs load");
return;
}

Expand All @@ -69,27 +72,33 @@ fn main() {
let docs_path = format!("{base_path}/docs");

if build_index {
println!("Start build on directory [{docs_path}]\n");

let num_threads = args.get(3).map_or(0, |s| s.parse().unwrap_or(0));

if num_threads != 0 {
println!("Setting thread number to {num_threads}");
let min_freq: Result<u32, _> = args[3].parse();
let min_freq = match min_freq {
Ok(value) => value,
Err(_) => {
println!("Error: min_freq must be an integer.");
return;
}
};

let max_frequency_perc: Result<f64, _> = args[4].parse();
let max_frequency_perc = match max_frequency_perc {
Ok(value) => value,
Err(_) => {
println!("Error: max_frequency_perc must be a float.");
return;
}
};

rayon::ThreadPoolBuilder::new()
.num_threads(num_threads)
.build_global()
.unwrap();
}
println!("Start build on directory [{docs_path}]\n");

let start_time = Instant::now();

Engine::build_engine(&docs_path, &index_path);
Engine::build_engine(&docs_path, &index_path, max_frequency_perc, min_freq);
let elapsed_time = start_time.elapsed();

println!(
"Index built in {}.\n\nLoad options:\n- CLI: cargo run --release --bin search {} load",
HumanDuration(Duration::from_secs(elapsed_time.as_secs())),
base_path
"Index built in {}",
HumanDuration(Duration::from_secs(elapsed_time.as_secs()))
);

exit(0);
Expand Down

0 comments on commit f7ea00c

Please sign in to comment.