Split tokenization and indexing phases

tomfran committed Feb 6, 2024
1 parent 4f2f5c7 commit 216693b
Showing 3 changed files with 74 additions and 70 deletions.
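
The gist of the refactor: reading and stemming files is embarrassingly parallel, while building posting lists mutates shared maps. The commit therefore splits indexing into a parallel pre-processing pass that collects (path, tokens) pairs and a sequential indexing loop that no longer needs any Mutex. A minimal standalone sketch of that two-phase shape, using toy documents and a plain HashMap instead of the repository's actual types (assumes only the rayon crate):

    use rayon::prelude::*;
    use std::collections::HashMap;

    fn main() {
        let docs = vec!["the quick fox", "the lazy dog"];

        // Phase 1: parallel tokenization, no shared mutable state.
        let tokenized: Vec<Vec<String>> = docs
            .par_iter()
            .map(|d| d.split_whitespace().map(str::to_lowercase).collect())
            .collect();

        // Phase 2: sequential indexing, plain &mut access, no locks.
        let mut postings: HashMap<String, Vec<u32>> = HashMap::new();
        for (doc_id, tokens) in tokenized.into_iter().enumerate() {
            for t in tokens {
                postings.entry(t).or_default().push(doc_id as u32);
            }
        }

        assert_eq!(postings["the"], vec![0, 1]);
    }

The trade-off is an extra pass over the corpus (and holding all token lists in memory) in exchange for lock-free indexing and a clean progress bar per phase.
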
1 change: 0 additions & 1 deletion search/Cargo.toml
@@ -3,7 +3,6 @@ name = "search"
 version = "0.1.0"
 edition = "2021"
 
-
 [dependencies]
 rand = "0.8"
 rust-stemmers = "1.2.0"
141 changes: 73 additions & 68 deletions search/src/engine/builder.rs
@@ -7,12 +7,12 @@ use super::{
     vocabulary::Vocabulary,
     InMemory,
 };
-use indicatif::{ParallelProgressIterator, ProgressStyle};
+use fxhash::FxHashMap;
+use indicatif::{ParallelProgressIterator, ProgressIterator, ProgressStyle};
 use rayon::prelude::*;
 use std::{
-    collections::{BTreeMap, HashMap},
+    collections::{hash_map::Entry, BTreeMap},
     fs,
-    sync::Mutex,
 };
 
 const PROGRESS_STYLE: &str =
@@ -32,8 +32,11 @@ pub fn build_engine(
         max_freq_percentage_threshold,
         min_freq_threshold,
     );
+    println!("- Writing postings");
     Postings::write_postings(&index, output_path);
+    println!("- Writing vocabulary");
     Vocabulary::write_vocabulary(&index, output_path);
+    println!("- Writing documents");
     Documents::write_documents(&index.documents, output_path);
 }

@@ -43,88 +46,90 @@ fn build_in_memory(
     max_freq_percentage_threshold: f64,
     min_freq_threshold: u32,
 ) -> InMemory {
+    let iterator_style = ProgressStyle::with_template(PROGRESS_STYLE)
+        .unwrap()
+        .progress_chars(PROGRESS_CHARS);
+
     let files = walk_dir(input_dir);
 
+    println!("- Pre-processing phase");
+    let processed_documents: Vec<(String, Vec<String>)> = files
+        .into_par_iter()
+        .progress_with_style(iterator_style.clone())
+        .map(|d| {
+            let file_content = fs::read_to_string(d.path()).expect("error while reading file");
+            (
+                d.path().to_str().unwrap().to_string(),
+                preprocessor.tokenize_and_stem(&file_content),
+            )
+        })
+        .collect();
+
+    println!("- Indexing phase");
+
     // document counter
-    let doc_id_mutex = Mutex::new(0);
+    let mut doc_id = 0;
     // postings list
-    let postings: Mutex<Vec<PostingsList>> = Mutex::new(Vec::new());
+    let mut postings = Vec::new();
     // word to postings index
-    let term_index_map = Mutex::new(HashMap::new());
+    let mut term_index_map = FxHashMap::default();
     // per-word doc id to posting list index
-    let term_doc_map: Mutex<Vec<HashMap<u32, usize>>> = Mutex::new(Vec::new());
+    let mut term_doc_map: Vec<FxHashMap<u32, usize>> = Vec::new();
     // documents data
-    let documents = Mutex::new(Vec::new());
+    let mut documents = Vec::new();
 
-    files
-        .into_par_iter()
-        .progress_with_style(
-            ProgressStyle::with_template(PROGRESS_STYLE)
-                .unwrap()
-                .progress_chars(PROGRESS_CHARS),
-        )
-        .for_each(|d| {
-            let file_content: String =
-                fs::read_to_string(d.path()).expect("error while reading file");
-            let tokens = preprocessor.tokenize_and_stem(&file_content);
-
-            let mut doc_id = doc_id_mutex.lock().unwrap();
-
-            // update documents array
-            documents.lock().unwrap().push(Document {
-                path: d.path().to_str().unwrap().to_string(),
-                length: tokens.len() as u32,
-            });
-
-            let mut l_term_index_map = term_index_map.lock().unwrap();
-            let mut l_postings = postings.lock().unwrap();
-            let mut l_term_doc_map = term_doc_map.lock().unwrap();
-
-            for (word_pos, t) in tokens.iter().enumerate() {
-                // obtain postings for this word and increment collection frequency
-                if !l_term_index_map.contains_key(t) {
-                    let idx = l_term_index_map.len();
-                    l_term_index_map.insert(t.clone(), idx);
-                    l_postings.push(PostingsList::new());
-                    l_term_doc_map.push(HashMap::new());
-                }
-                let term_index = *l_term_index_map.get(t).unwrap();
-
-                // obtain document entry for this word and update it
-                let postings_list = &mut l_postings[term_index];
-                if !l_term_doc_map[term_index].contains_key(&doc_id) {
-                    let idx = postings_list.len();
-                    l_term_doc_map[term_index].insert(*doc_id, idx);
-                    postings_list.push(Posting::default());
-                }
-                let posting_entry_index = *l_term_doc_map[term_index].get(&doc_id).unwrap();
-
-                let posting_entry = &mut postings_list[posting_entry_index];
-
-                posting_entry.document_frequency += 1;
-                posting_entry.document_id = *doc_id;
-                posting_entry.positions.push(word_pos as u32);
-            }
-            *doc_id += 1;
-        });
+    let processed_docs_iterator = processed_documents
+        .into_iter()
+        .progress_with_style(iterator_style);
 
-    let final_postings = postings.into_inner().unwrap();
+    for (path, tokens) in processed_docs_iterator {
+        // update documents array
+        documents.push(Document {
+            path,
+            length: tokens.len() as u32,
+        });
 
-    let frequency_threshold =
-        (doc_id_mutex.into_inner().unwrap() as f64 * max_freq_percentage_threshold) as u32;
+        for (word_pos, t) in tokens.iter().enumerate() {
+            // obtain postings for this word and increment collection frequency
+            if !term_index_map.contains_key(t) {
+                let idx = term_index_map.len();
+                term_index_map.insert(t.clone(), idx);
+                postings.push(PostingsList::new());
+                term_doc_map.push(FxHashMap::default());
+            }
+            let term_index = *term_index_map.get(t).unwrap();
 
-    let sorted_term_index_map: BTreeMap<String, usize> = term_index_map
-        .into_inner()
-        .unwrap()
+            // obtain document entry for this word and update it
+            let postings_list = &mut postings[term_index];
+            if let Entry::Vacant(e) = term_doc_map[term_index].entry(doc_id) {
+                let idx = postings_list.len();
+                e.insert(idx);
+                postings_list.push(Posting::default());
+            }
+            let posting_entry_index = *term_doc_map[term_index].get(&doc_id).unwrap();
+
+            let posting_entry = &mut postings_list[posting_entry_index];
+
+            posting_entry.document_frequency += 1;
+            posting_entry.document_id = doc_id;
+            posting_entry.positions.push(word_pos as u32);
+        }
+        doc_id += 1;
+    }
+
+    let frequency_threshold = (doc_id as f64 * max_freq_percentage_threshold) as u32;
+
+    let term_index_map: BTreeMap<String, usize> = term_index_map
         .into_iter()
         .filter(|(_, v)| {
-            let f = final_postings[*v].len() as u32;
+            let f = postings[*v].len() as u32;
             f <= frequency_threshold && f > min_freq_threshold
         })
        .collect();
 
     InMemory {
-        term_index_map: sorted_term_index_map,
-        postings: final_postings,
-        documents: documents.into_inner().unwrap(),
+        term_index_map,
+        postings,
+        documents,
     }
 }
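
Two smaller changes ride along in the rewritten loop: std's HashMap is replaced by fxhash's FxHashMap, a drop-in alias over a faster non-cryptographic hasher, and the contains_key-then-insert sequence becomes the standard library's Entry API, which checks and inserts through a single lookup. A tiny standalone illustration of the vacant-entry pattern (using std's HashMap, independent of this repository):

    use std::collections::hash_map::Entry;
    use std::collections::HashMap;

    fn main() {
        let mut index_of: HashMap<String, usize> = HashMap::new();
        // Insert only if the key is absent, without a separate
        // contains_key check beforehand.
        if let Entry::Vacant(e) = index_of.entry("rust".to_string()) {
            e.insert(0);
        }
        assert_eq!(index_of["rust"], 0);
    }
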
2 changes: 1 addition & 1 deletion search/src/main.rs
@@ -90,7 +90,7 @@ fn main() {
     let elapsed_time = start_time.elapsed();
 
     println!(
-        "Index built in {}",
+        "\nIndex built in {}",
         HumanDuration(Duration::from_secs(elapsed_time.as_secs()))
     );

