refactor and spell check index start
tomfran committed Jan 25, 2024
1 parent 686b21c commit 60bacfb
Showing 14 changed files with 282 additions and 118 deletions.
11 changes: 2 additions & 9 deletions README.md
@@ -16,12 +16,11 @@ Search engine written in Rust, based on an inverted index on disk.
**Index construction**
- [x] In-memory datasets index construction;
- [x] Proper vocabulary and paths on disk;
- [ ] Spelling correction index.
- [ ] Spelling correction index: in progress.

**Queries**
- [x] Tf-idf ranked retrieval;
- [x] Window computation;
- [ ] File content retrieval.

**Evaluation**
- [ ] Query speed;
@@ -30,13 +29,7 @@ Search engine written in Rust, based on an inverted index on disk.

**Client**
- [x] CLI;
- [ ] Web interface.

## Crates in use
- [stemmer-rs](https://github.com/lise-henry/stemmer-rs)
- [tokenizers](https://github.com/huggingface/tokenizers)
- [indicatif](https://github.com/console-rs/indicatif)
- [fxhash](https://github.com/cbreeden/fxhash)
- [x] Web interface.

## References
[Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze
8 changes: 8 additions & 0 deletions makefile
@@ -0,0 +1,8 @@
web:
cargo run --release --bin server $(index_name)

cli:
cargo run --release --bin search $(index_name) ${action}

test:
cargo test --release
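
With these targets, the web client would be started with `make web index_name=<name>` and the CLI with `make cli index_name=<name> action=<action>`, where `index_name` and `action` are variables supplied on the make command line (the concrete values are up to the caller, none are fixed by the repository); `make test` runs the test suite in release mode.
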
4 changes: 2 additions & 2 deletions search/Cargo.toml
@@ -6,9 +6,9 @@ edition = "2021"

[dependencies]
rand = "0.8"
tokenizers = { version = "0.15.0", features = ["http"] }
rust-stemmers = "1.2.0"
rayon = "1.8.0"
indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
indicatif = { version = "0.17.0", features = ["rayon", "improved_unicode"] }
fxhash = "0.2.1"
tempdir = "0.3.7"
regex = "1"
12 changes: 5 additions & 7 deletions search/src/index/builder.rs
@@ -1,34 +1,32 @@
use super::{
documents::{Document, Documents},
postings::{PostingEntry, PostingList, Postings},
text,
preprocessor::Preprocessor,
vocabulary::Vocabulary,
InMemoryIndex,
};
use indicatif::{ParallelProgressIterator, ProgressStyle};
use rayon::prelude::*;
use rust_stemmers::Stemmer;
use std::{
collections::{BTreeMap, HashMap},
fs,
sync::Mutex,
};
use tokenizers::Tokenizer;

const PROGRESS_STYLE: &str =
"Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]";
const PROGRESS_CHARS: &str = "=> ";

const CUTOFF_THRESHOLD: f64 = 0.8;

pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) {
let index: InMemoryIndex = build_in_memory(input_dir, tokenizer, stemmer);
pub fn build_index(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
let index: InMemoryIndex = build_in_memory(input_dir, preprocessor);
Postings::write_postings(&index, output_path);
Vocabulary::write_vocabulary(&index, output_path);
Documents::write_documents(&index.documents, output_path);
}

fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex {
fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemoryIndex {
let files: Vec<fs::DirEntry> = fs::read_dir(input_dir)
.expect("error while retrieving input directory content")
.map(|p| p.unwrap())
@@ -54,7 +52,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
)
.for_each(|d| {
let file_content = fs::read_to_string(d.path()).expect("error while reading file");
let tokens = text::tokenize_and_stem(tokenizer, stemmer, &file_content);
let tokens = preprocessor.tokenize_and_stem(&file_content);

let mut doc_id = doc_id_mutex.lock().unwrap();

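The hunk above keeps the existing parallel layout: each file is read and preprocessed inside a rayon `for_each`, and only the shared document-id bookkeeping happens while the mutex is held. A minimal, self-contained sketch of that pattern, with illustrative names rather than the repository's actual types:

use rayon::prelude::*;
use std::sync::Mutex;

fn number_documents(paths: &[&str]) -> Vec<(u32, usize)> {
    let doc_id_mutex = Mutex::new(0u32);
    let results = Mutex::new(Vec::new());
    paths.par_iter().for_each(|p| {
        // expensive per-document work (here a stand-in for tokenization) stays outside the lock
        let token_count = p.split_whitespace().count();
        // only the shared counter and the shared output are touched under the lock;
        // ids are assigned in whatever order the worker threads reach this point
        let mut doc_id = doc_id_mutex.lock().unwrap();
        results.lock().unwrap().push((*doc_id, token_count));
        *doc_id += 1;
    });
    results.into_inner().unwrap()
}
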
58 changes: 58 additions & 0 deletions search/src/index/documents.rs
@@ -65,3 +65,61 @@ impl Documents {
self.docs[doc_id as usize].path.clone()
}
}

#[cfg(test)]
mod tests {
use crate::test_utils::utils::create_temporary_file_path;

use super::*;

#[test]
fn test_write_and_load() {
let dir = create_temporary_file_path("docs_unit");

let documents = vec![
Document {
path: "document1.txt".to_string(),
lenght: 100,
},
Document {
path: "document2.txt".to_string(),
lenght: 150,
},
];

Documents::write_documents(&documents, &dir);
let loaded_documents = Documents::load_documents(&dir);

assert_eq!(loaded_documents.get_num_documents(), documents.len() as u32);

for i in 0..documents.len() {
assert_eq!(loaded_documents.get_doc_path(i as u32), documents[i].path);
assert_eq!(loaded_documents.get_doc_len(i as u32), documents[i].lenght);
}
}

#[test]
fn test_methods() {
let documents = vec![
Document {
path: "document1.txt".to_string(),
lenght: 100,
},
Document {
path: "document2.txt".to_string(),
lenght: 150,
},
];

let doc_collection = Documents {
docs: documents.clone(),
};

assert_eq!(doc_collection.get_num_documents(), documents.len() as u32);

for i in 0..documents.len() {
assert_eq!(doc_collection.get_doc_path(i as u32), documents[i].path);
assert_eq!(doc_collection.get_doc_len(i as u32), documents[i].lenght);
}
}
}
30 changes: 12 additions & 18 deletions search/src/index/mod.rs
@@ -1,17 +1,15 @@
mod builder;
mod documents;
mod postings;
mod text;
mod preprocessor;
mod utils;
mod vocabulary;

use rust_stemmers::Stemmer;
use std::collections::BTreeMap;
use tokenizers::Tokenizer;

use self::documents::{Document, Documents};
use self::postings::{PostingList, Postings};
use self::preprocessor::Preprocessor;
use self::vocabulary::Vocabulary;
use std::collections::BTreeMap;

pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
@@ -22,8 +20,7 @@ pub struct Index {
vocabulary: Vocabulary,
postings: Postings,
documents: Documents,
tokenizer: Tokenizer,
stemmer: Stemmer,
preprocessor: Preprocessor,
}

pub struct InMemoryIndex {
@@ -33,19 +30,16 @@ pub struct InMemoryIndex {
}

impl Index {
pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
let tokenizer = text::load_tokenizer(tokenizer_path, false);
let stemmer = text::load_stemmer();
builder::build_index(input_path, output_path, &tokenizer, &stemmer);
pub fn build_index(input_path: &str, output_path: &str) {
builder::build_index(input_path, output_path, &Preprocessor::new());
}

pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index {
pub fn load_index(input_path: &str) -> Index {
Index {
vocabulary: Vocabulary::load_vocabulary(input_path),
postings: Postings::load_postings_reader(input_path),
documents: Documents::load_documents(input_path),
tokenizer: text::load_tokenizer(tokenizer_path, false),
stemmer: text::load_stemmer(),
preprocessor: Preprocessor::new(),
}
}

@@ -55,8 +49,8 @@ impl Index {
.map(|i| self.postings.load_postings_list(i))
}

pub fn tokenize_and_stem_query(&self, query: &str) -> Vec<String> {
text::tokenize_and_stem(&self.tokenizer, &self.stemmer, query)
pub fn get_query_tokens(&self, query: &str) -> Vec<String> {
self.preprocessor.tokenize_and_stem(query)
}

pub fn get_num_documents(&self) -> u32 {
@@ -81,9 +75,9 @@ mod test {
fn test_build() {
let index_path = &create_temporary_dir_path();

Index::build_index("test_data/docs", index_path, "test_data/test_tokenizer");
Index::build_index("test_data/docs", index_path);

let mut idx = Index::load_index(index_path, "test_data/test_tokenizer");
let mut idx = Index::load_index(index_path);

for ele in ["hello", "man", "world"] {
assert!(idx.vocabulary.get_term_index(ele).is_some());
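
After this change the public API no longer takes a tokenizer path: building and loading need only the index paths, and query preprocessing is handled internally. A hedged usage sketch (the output path and query string are illustrative, and posting lookups/ranking are omitted):

fn demo() {
    // assumes this runs inside the search crate, where `Index` is in scope
    Index::build_index("test_data/docs", "/tmp/index_demo");
    let idx = Index::load_index("/tmp/index_demo");

    // query terms are cleaned, lowercased and stemmed by the built-in Preprocessor
    let tokens = idx.get_query_tokens("Hello, worlds!");
    assert_eq!(tokens, vec!["hello", "world"]);
}
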
42 changes: 42 additions & 0 deletions search/src/index/preprocessor.rs
@@ -0,0 +1,42 @@
use regex::Regex;
use rust_stemmers::{Algorithm, Stemmer};

pub struct Preprocessor {
stemmer: Stemmer,
regex: Regex,
}

impl Preprocessor {
pub fn new() -> Preprocessor {
Preprocessor {
stemmer: Stemmer::create(Algorithm::English),
regex: Regex::new(r"[^a-zA-Z0-9\s]+").expect("error while building regex"),
}
}

pub fn tokenize_and_stem(&self, text: &str) -> Vec<String> {
self.regex
.replace_all(text, " ")
.split_whitespace()
.map(|t| t.to_lowercase())
.map(|t| self.stemmer.stem(&t).to_string())
.collect()
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_tokenize_and_stem() {
let preprocessor = Preprocessor::new();

let text1 = "The quick brown, fox jumps over the lazy dog!!!";
let result1 = preprocessor.tokenize_and_stem(text1);
assert_eq!(
result1,
vec!["the", "quick", "brown", "fox", "jump", "over", "the", "lazi", "dog"]
);
}
}
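
Because the same Preprocessor is constructed at index-build time and at query time (see mod.rs above), both sides normalize text identically, so a query term matches a posting list whenever the two stem to the same token. A small illustrative check, with strings chosen for the example rather than taken from the test data:

fn demo_preprocessor() {
    let p = Preprocessor::new();
    // "Jumping!" and "jumps" both normalize to the single token "jump"
    assert_eq!(p.tokenize_and_stem("Jumping!"), vec!["jump"]);
    assert_eq!(p.tokenize_and_stem("jumps"), vec!["jump"]);
}
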
37 changes: 0 additions & 37 deletions search/src/index/text.rs

This file was deleted.

