refactor and spell check index start
tomfran committed Jan 25, 2024
1 parent 686b21c commit 60bacfb
Showing 14 changed files with 282 additions and 118 deletions.
11 changes: 2 additions & 9 deletions README.md
@@ -16,12 +16,11 @@ Search engine written in Rust, based on an inverted index on disk.
**Index construction**
- [x] In-memory datasets index construction;
- [x] Proper vocabulary and paths on disk;
- [ ] Spelling correction index.
- [ ] Spelling correction index: in progress.

**Queries**
- [x] Tf-idf ranked retrieval;
- [x] Window computation;
- [ ] File content retrieval.

**Evaluation**
- [ ] Query speed;
@@ -30,13 +29,7 @@ Search engine written in Rust, based on an inverted index on disk.

**Client**
- [x] CLI;
- [ ] Web interface.

## Crates in use
- [stemmer-rs](https://github.com/lise-henry/stemmer-rs)
- [tokenizers](https://github.com/huggingface/tokenizers)
- [indicatif](https://github.com/console-rs/indicatif)
- [fxhash](https://github.com/cbreeden/fxhash)
- [x] Web interface.

## References
[Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze
8 changes: 8 additions & 0 deletions makefile
@@ -0,0 +1,8 @@
web:
cargo run --release --bin server $(index_name)

cli:
cargo run --release --bin search $(index_name) ${action}

test:
cargo test --release
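
With these targets, the web client would be started with `make web index_name=<name>` and the CLI with `make cli index_name=<name> action=<action>`, where `index_name` and `action` are variables supplied on the make command line (the concrete values are up to the caller, none are fixed by the repository); `make test` runs the test suite in release mode.
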
4 changes: 2 additions & 2 deletions search/Cargo.toml
@@ -6,9 +6,9 @@ edition = "2021"

[dependencies]
rand = "0.8"
tokenizers = { version = "0.15.0", features = ["http"] }
rust-stemmers = "1.2.0"
rayon = "1.8.0"
indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
indicatif = { version = "0.17.0", features = ["rayon", "improved_unicode"] }
fxhash = "0.2.1"
tempdir = "0.3.7"
regex = "1"
12 changes: 5 additions & 7 deletions search/src/index/builder.rs
@@ -1,34 +1,32 @@
use super::{
documents::{Document, Documents},
postings::{PostingEntry, PostingList, Postings},
text,
preprocessor::Preprocessor,
vocabulary::Vocabulary,
InMemoryIndex,
};
use indicatif::{ParallelProgressIterator, ProgressStyle};
use rayon::prelude::*;
use rust_stemmers::Stemmer;
use std::{
collections::{BTreeMap, HashMap},
fs,
sync::Mutex,
};
use tokenizers::Tokenizer;

const PROGRESS_STYLE: &str =
"Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]";
const PROGRESS_CHARS: &str = "=> ";

const CUTOFF_THRESHOLD: f64 = 0.8;

pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) {
let index: InMemoryIndex = build_in_memory(input_dir, tokenizer, stemmer);
pub fn build_index(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
let index: InMemoryIndex = build_in_memory(input_dir, preprocessor);
Postings::write_postings(&index, output_path);
Vocabulary::write_vocabulary(&index, output_path);
Documents::write_documents(&index.documents, output_path);
}

fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex {
fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemoryIndex {
let files: Vec<fs::DirEntry> = fs::read_dir(input_dir)
.expect("error while retrieving input directory content")
.map(|p| p.unwrap())
@@ -54,7 +52,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
)
.for_each(|d| {
let file_content = fs::read_to_string(d.path()).expect("error while reading file");
let tokens = text::tokenize_and_stem(tokenizer, stemmer, &file_content);
let tokens = preprocessor.tokenize_and_stem(&file_content);

let mut doc_id = doc_id_mutex.lock().unwrap();

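The hunk above keeps the existing parallel layout: each file is read and preprocessed inside a rayon `for_each`, and only the shared document-id bookkeeping happens while the mutex is held. A minimal, self-contained sketch of that pattern, with illustrative names rather than the repository's actual types:

use rayon::prelude::*;
use std::sync::Mutex;

fn number_documents(paths: &[&str]) -> Vec<(u32, usize)> {
    let doc_id_mutex = Mutex::new(0u32);
    let results = Mutex::new(Vec::new());
    paths.par_iter().for_each(|p| {
        // expensive per-document work (here a stand-in for tokenization) stays outside the lock
        let token_count = p.split_whitespace().count();
        // only the shared counter and the shared output are touched under the lock;
        // ids are assigned in whatever order the worker threads reach this point
        let mut doc_id = doc_id_mutex.lock().unwrap();
        results.lock().unwrap().push((*doc_id, token_count));
        *doc_id += 1;
    });
    results.into_inner().unwrap()
}
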
58 changes: 58 additions & 0 deletions search/src/index/documents.rs
@@ -65,3 +65,61 @@ impl Documents {
self.docs[doc_id as usize].path.clone()
}
}

#[cfg(test)]
mod tests {
use crate::test_utils::utils::create_temporary_file_path;

use super::*;

#[test]
fn test_write_and_load() {
let dir = create_temporary_file_path("docs_unit");

let documents = vec![
Document {
path: "document1.txt".to_string(),
lenght: 100,
},
Document {
path: "document2.txt".to_string(),
lenght: 150,
},
];

Documents::write_documents(&documents, &dir);
let loaded_documents = Documents::load_documents(&dir);

assert_eq!(loaded_documents.get_num_documents(), documents.len() as u32);

for i in 0..documents.len() {
assert_eq!(loaded_documents.get_doc_path(i as u32), documents[i].path);
assert_eq!(loaded_documents.get_doc_len(i as u32), documents[i].lenght);
}
}

#[test]
fn test_methods() {
let documents = vec![
Document {
path: "document1.txt".to_string(),
lenght: 100,
},
Document {
path: "document2.txt".to_string(),
lenght: 150,
},
];

let doc_collection = Documents {
docs: documents.clone(),
};

assert_eq!(doc_collection.get_num_documents(), documents.len() as u32);

for i in 0..documents.len() {
assert_eq!(doc_collection.get_doc_path(i as u32), documents[i].path);
assert_eq!(doc_collection.get_doc_len(i as u32), documents[i].lenght);
}
}
}
30 changes: 12 additions & 18 deletions search/src/index/mod.rs
@@ -1,17 +1,15 @@
mod builder;
mod documents;
mod postings;
mod text;
mod preprocessor;
mod utils;
mod vocabulary;

use rust_stemmers::Stemmer;
use std::collections::BTreeMap;
use tokenizers::Tokenizer;

use self::documents::{Document, Documents};
use self::postings::{PostingList, Postings};
use self::preprocessor::Preprocessor;
use self::vocabulary::Vocabulary;
use std::collections::BTreeMap;

pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
@@ -22,8 +20,7 @@ pub struct Index {
vocabulary: Vocabulary,
postings: Postings,
documents: Documents,
tokenizer: Tokenizer,
stemmer: Stemmer,
preprocessor: Preprocessor,
}

pub struct InMemoryIndex {
@@ -33,19 +30,16 @@ pub struct InMemoryIndex {
}

impl Index {
pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
let tokenizer = text::load_tokenizer(tokenizer_path, false);
let stemmer = text::load_stemmer();
builder::build_index(input_path, output_path, &tokenizer, &stemmer);
pub fn build_index(input_path: &str, output_path: &str) {
builder::build_index(input_path, output_path, &Preprocessor::new());
}

pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index {
pub fn load_index(input_path: &str) -> Index {
Index {
vocabulary: Vocabulary::load_vocabulary(input_path),
postings: Postings::load_postings_reader(input_path),
documents: Documents::load_documents(input_path),
tokenizer: text::load_tokenizer(tokenizer_path, false),
stemmer: text::load_stemmer(),
preprocessor: Preprocessor::new(),
}
}

@@ -55,8 +49,8 @@ impl Index {
.map(|i| self.postings.load_postings_list(i))
}

pub fn tokenize_and_stem_query(&self, query: &str) -> Vec<String> {
text::tokenize_and_stem(&self.tokenizer, &self.stemmer, query)
pub fn get_query_tokens(&self, query: &str) -> Vec<String> {
self.preprocessor.tokenize_and_stem(query)
}

pub fn get_num_documents(&self) -> u32 {
@@ -81,9 +75,9 @@ mod test {
fn test_build() {
let index_path = &create_temporary_dir_path();

Index::build_index("test_data/docs", index_path, "test_data/test_tokenizer");
Index::build_index("test_data/docs", index_path);

let mut idx = Index::load_index(index_path, "test_data/test_tokenizer");
let mut idx = Index::load_index(index_path);

for ele in ["hello", "man", "world"] {
assert!(idx.vocabulary.get_term_index(ele).is_some());
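
After this change the public API no longer takes a tokenizer path: building and loading need only the index paths, and query preprocessing is handled internally. A hedged usage sketch (the output path and query string are illustrative, and posting lookups/ranking are omitted):

fn demo() {
    // assumes this runs inside the search crate, where `Index` is in scope
    Index::build_index("test_data/docs", "/tmp/index_demo");
    let idx = Index::load_index("/tmp/index_demo");

    // query terms are cleaned, lowercased and stemmed by the built-in Preprocessor
    let tokens = idx.get_query_tokens("Hello, worlds!");
    assert_eq!(tokens, vec!["hello", "world"]);
}
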
42 changes: 42 additions & 0 deletions search/src/index/preprocessor.rs
@@ -0,0 +1,42 @@
use regex::Regex;
use rust_stemmers::{Algorithm, Stemmer};

pub struct Preprocessor {
stemmer: Stemmer,
regex: Regex,
}

impl Preprocessor {
pub fn new() -> Preprocessor {
Preprocessor {
stemmer: Stemmer::create(Algorithm::English),
regex: Regex::new(r"[^a-zA-Z0-9\s]+").expect("error while building regex"),
}
}

pub fn tokenize_and_stem(&self, text: &str) -> Vec<String> {
self.regex
.replace_all(text, " ")
.split_whitespace()
.map(|t| t.to_lowercase())
.map(|t| self.stemmer.stem(&t).to_string())
.collect()
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_tokenize_and_stem() {
let preprocessor = Preprocessor::new();

let text1 = "The quick brown, fox jumps over the lazy dog!!!";
let result1 = preprocessor.tokenize_and_stem(text1);
assert_eq!(
result1,
vec!["the", "quick", "brown", "fox", "jump", "over", "the", "lazi", "dog"]
);
}
}
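
Because the same Preprocessor is constructed at index-build time and at query time (see mod.rs above), both sides normalize text identically, so a query term matches a posting list whenever the two stem to the same token. A small illustrative check, with strings chosen for the example rather than taken from the test data:

fn demo_preprocessor() {
    let p = Preprocessor::new();
    // "Jumping!" and "jumps" both normalize to the single token "jump"
    assert_eq!(p.tokenize_and_stem("Jumping!"), vec!["jump"]);
    assert_eq!(p.tokenize_and_stem("jumps"), vec!["jump"]);
}
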
37 changes: 0 additions & 37 deletions search/src/index/text.rs

This file was deleted.

