stemmer
tomfran committed Dec 27, 2023
1 parent 214ab2e commit 0d4d6fa
Showing 6 changed files with 52 additions and 40 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -9,4 +9,5 @@ path = "src/lib.rs"

[dependencies]
rand = "0.8"
-tokenizers = { version = "0.15.0", features = ["http"] }
+tokenizers = { version = "0.15.0", features = ["http"] }
+rust-stemmers = "1.2.0"
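For reference, rust-stemmers wraps the Snowball stemmers behind a small API. A minimal, standalone sketch of the two calls this commit relies on (not code from the repository):

```rust
use rust_stemmers::{Algorithm, Stemmer};

fn main() {
    // Build a Snowball stemmer for English; other Algorithm variants
    // cover additional languages.
    let stemmer = Stemmer::create(Algorithm::English);

    // stem() returns a Cow<str>: borrowed when the word is already a stem,
    // owned when a suffix was stripped.
    assert_eq!(stemmer.stem("fruitlessly"), "fruitless");
    println!("{}", stemmer.stem("searching")); // prints "search"
}
```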
13 changes: 8 additions & 5 deletions README.md
@@ -2,19 +2,22 @@

Search engine written in Rust, based on an inverted index on disk.

-**Implementation status**
+### Implementation status
- [x] IO classes for writing and reading bit-streams;
- [ ] Text preprocessing:
  - [x] Tokenization;
-  - [ ] Stemming.
+  - [x] Stemming.
- [ ] Index construction:
  - [x] In-memory datasets index construction;
  - [ ] Disk-based partial index construction and merging;
  - [ ] Additional indexes to support things such as spelling correction.
- [ ] Index queries:
  - [ ] Boolean queries;
-  - [ ] Tf-idf ranked retrieval.
+  - [x] Tf-idf ranked retrieval.

-**References**
+### Crates in use
+- [lise-henry/stemmer-rs](https://github.com/lise-henry/stemmer-rs)
+- [huggingface/tokenizers](https://github.com/huggingface/tokenizers)

-[*Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze*](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
+### References
+[Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
9 changes: 5 additions & 4 deletions src/index/builder.rs
@@ -1,3 +1,4 @@
+use rust_stemmers::Stemmer;
use std::{collections::BTreeMap, fs};
use tokenizers::Tokenizer;

@@ -17,22 +18,22 @@ struct InMemoryIndex {
    document_lenghts: Vec<u32>,
}

-pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer) {
-    let index = build_in_memory(input_dir, tokenizer);
+pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) {
+    let index = build_in_memory(input_dir, tokenizer, stemmer);
    write_postings(&index, output_path);
    write_vocabulary(&index.term_index_map, output_path);
    write_doc_lentghts(&index.document_lenghts, output_path);
}

-fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer) -> InMemoryIndex {
+fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex {
    let documents =
        fs::read_dir(input_dir).expect("error while retrieving input directory content");

    let tokenized_docs_iter = documents
        .into_iter()
        .map(|p| p.unwrap())
        .map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
-        .map(|s| text_utils::tokenize(tokenizer, &s));
+        .map(|s| text_utils::tokenize_and_stem(tokenizer, stemmer, &s));

    let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new();
    let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new();
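The tail of build_in_memory is folded away above. A minimal sketch of how tokenized documents are typically accumulated into a vocabulary map and postings lists — the names mirror the fields above, but this is an illustration, not the hidden code:

```rust
use std::collections::BTreeMap;

// Illustrative only: fold per-document token lists into a term-id map
// and per-term postings (doc id -> term frequency).
fn accumulate(docs: Vec<Vec<String>>) -> (BTreeMap<String, usize>, Vec<BTreeMap<u32, u32>>) {
    let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new();
    let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new();

    for (doc_id, tokens) in docs.into_iter().enumerate() {
        for token in tokens {
            // Assign the next free id to unseen terms.
            let next_id = term_index_map.len();
            let term_id = *term_index_map.entry(token).or_insert(next_id);
            if term_id == postings.len() {
                postings.push(BTreeMap::new());
            }
            // Bump the term frequency for this document.
            *postings[term_id].entry(doc_id as u32).or_insert(0) += 1;
        }
    }

    (term_index_map, postings)
}
```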
22 changes: 19 additions & 3 deletions src/index/mod.rs
@@ -2,7 +2,9 @@ mod builder;
mod loader;
mod text_utils;

+use rust_stemmers::Stemmer;
use std::collections::BTreeMap;
+use std::fmt::Display;
use tokenizers::Tokenizer;

use crate::disk::bits_reader::BitsReader;
@@ -18,6 +20,7 @@ pub struct Index {
    term_offset_map: BTreeMap<String, u64>,
    doc_lenghts: Vec<u32>,
    tokenizer: Tokenizer,
+    stemmer: Stemmer,
}

#[derive(Debug)]
@@ -35,7 +38,8 @@ pub struct PostingEntry {
impl Index {
    pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
        let tokenizer = text_utils::load_tokenizer(tokenizer_path, false);
-        builder::build_index(input_path, output_path, &tokenizer);
+        let stemmer = text_utils::load_stemmer();
+        builder::build_index(input_path, output_path, &tokenizer, &stemmer);
    }

    pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index {
@@ -44,6 +48,7 @@ impl Index {
            term_offset_map: loader::load_terms_to_offsets_map(input_path),
            doc_lenghts: loader::load_document_lenghts(input_path),
            tokenizer: text_utils::load_tokenizer(tokenizer_path, false),
+            stemmer: text_utils::load_stemmer(),
        }
    }

@@ -79,8 +84,19 @@
        })
    }

-    pub fn tokenize_query(&self, query: &str) -> Vec<String> {
-        text_utils::tokenize(&self.tokenizer, query)
+    pub fn tokenize_and_stem_query(&self, query: &str) -> Vec<String> {
+        text_utils::tokenize_and_stem(&self.tokenizer, &self.stemmer, query)
    }
}

+impl Display for Index {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Index:\n- vocab size: {}\n- num. documents: {}",
+            self.term_offset_map.len(),
+            self.get_num_documents()
+        )
+    }
+}
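Put together, a typical call sequence for the updated API might look as follows; the paths are illustrative placeholders, not paths from the repository:

```rust
// Hypothetical paths: build writes the index to disk, load reads it back.
let docs_dir = "data/docs";
let index_path = "data/index/demo";
let tokenizer_path = "data/bert-base-uncased";

Index::build_index(docs_dir, index_path, tokenizer_path);
let index = Index::load_index(index_path, tokenizer_path);

// The new Display impl reports vocabulary size and document count.
println!("{}", index);

// Queries now go through the same tokenize-then-stem pipeline as documents.
let terms = index.tokenize_and_stem_query("searching documents");
```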
18 changes: 14 additions & 4 deletions src/index/text_utils.rs
@@ -1,5 +1,6 @@
use std::{fs::create_dir_all, path::Path};

+use rust_stemmers::{Algorithm, Stemmer};
use tokenizers::Tokenizer;

pub fn load_tokenizer(filename: &str, force_download: bool) -> Tokenizer {
@@ -19,10 +20,19 @@ pub fn load_tokenizer(filename: &str, force_download: bool) -> Tokenizer {
    Tokenizer::from_file(filename).expect("error while loading tokenizer from file")
}

-pub fn tokenize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
-    tokenizer
+pub fn load_stemmer() -> Stemmer {
+    Stemmer::create(Algorithm::English)
+}
+
+pub fn tokenize_and_stem(tokenizer: &Tokenizer, stemmer: &Stemmer, text: &str) -> Vec<String> {
+    let tokenized_text = tokenizer
        .encode(text, false)
-        .expect("error while tokenizing text")
+        .expect("error while tokenizing text");
+
+    tokenized_text
        .get_tokens()
        .to_vec()
+        .iter()
+        .map(|t| t.to_lowercase())
+        .map(|t| stemmer.stem(&t).to_string())
+        .collect()
}
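With this change, tokens are lowercased before stemming, so queries and documents normalize identically. An illustrative call — the tokenizer path is hypothetical, and the exact tokens depend on the tokenizer's vocabulary:

```rust
let tokenizer = load_tokenizer("data/bert-base-uncased", false);
let stemmer = load_stemmer();

// With word-level tokenization, "Searching", "documents", "quickly"
// would come out as the stems "search", "document", "quick".
let terms = tokenize_and_stem(&tokenizer, &stemmer, "Searching documents quickly");
```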
27 changes: 4 additions & 23 deletions src/query/mod.rs
@@ -25,11 +25,9 @@ impl QueryProcessor {
    }

    pub fn query(&mut self, query: &str) -> Vec<u32> {
-        println!("\nQuery: {:?}", query);
-
        let mut scores: HashMap<u32, f32> = HashMap::new();

-        for token in self.index.tokenize_query(query) {
+        for token in self.index.tokenize_and_stem_query(query) {
            if let Some(postings) = self.index.get_term(&token) {
                let idf = (self.num_documents as f32 / postings.collection_frequency as f32).log2();

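The idf here is the base-2 log of the document count over the term's document frequency. A small worked example, assuming (as is standard for tf-idf) that the folded-away loop body adds tf × idf to each matching document's score:

```rust
let num_documents = 8_f32;
let collection_frequency = 2_f32; // the term appears in 2 of the 8 documents
let idf = (num_documents / collection_frequency).log2(); // log2(4.0) = 2.0

let tf = 3_f32; // the term occurs 3 times in one document
let contribution = tf * idf; // 3.0 * 2.0 = 6.0 added to that document's score
```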
@@ -44,27 +42,10 @@
        }

        let mut selector = DocumentSelector::new(3);
-        scores.iter().for_each(|(id, score)| {
-            println!("- document: {:?}, score: {:?}", id, score);
-            selector.push(*id, *score)
-        });
+        scores
+            .iter()
+            .for_each(|(id, score)| selector.push(*id, *score));

        selector.get_sorted_ids()
    }
}
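DocumentSelector's implementation is not part of this diff. One plausible shape for it, sketched under the assumption that it keeps the k best scores in a bounded min-heap — the real implementation may differ:

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Hypothetical top-k selector matching the calls above: push(id, score),
// then get_sorted_ids() returns ids ordered by descending score.
struct DocumentSelector {
    capacity: usize,
    heap: BinaryHeap<Entry>, // behaves as a min-heap via the reversed Ord below
}

struct Entry {
    id: u32,
    score: f32,
}

impl PartialEq for Entry {
    fn eq(&self, other: &Self) -> bool {
        self.score == other.score
    }
}
impl Eq for Entry {}
impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Entry {
    // Reversed comparison: the heap's "max" is the lowest score,
    // so pop() evicts the weakest candidate.
    fn cmp(&self, other: &Self) -> Ordering {
        other.score.total_cmp(&self.score)
    }
}

impl DocumentSelector {
    fn new(capacity: usize) -> Self {
        Self { capacity, heap: BinaryHeap::new() }
    }

    fn push(&mut self, id: u32, score: f32) {
        self.heap.push(Entry { id, score });
        if self.heap.len() > self.capacity {
            self.heap.pop(); // drop the current minimum
        }
    }

    fn get_sorted_ids(mut self) -> Vec<u32> {
        let mut ids = Vec::with_capacity(self.heap.len());
        while let Some(entry) = self.heap.pop() {
            ids.push(entry.id); // pops in ascending score order
        }
        ids.reverse(); // highest-scoring document first
        ids
    }
}
```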

-// #[cfg(test)]
-// mod test {
-//     use super::*;
-
-//     #[test]
-//     fn test_build() {
-//         let mut q = QueryProcessor::build_query_processor(
-//             "data/small/index/small",
-//             "data/small/bert-base-uncased",
-//         );
-//         q.query("google");
-//         q.query("apple");
-//         q.query("microsoft");
-//     }
-// }
