document paths on disk, adjusted tfidf scoring
tomfran committed Jan 18, 2024
1 parent 6484a9d commit 723488d
Showing 6 changed files with 88 additions and 46 deletions.
15 changes: 4 additions & 11 deletions src/disk/bits_reader.rs
@@ -73,10 +73,6 @@ impl BitsReader {
.collect()
}

pub fn read_vbyte_gamma_vector(&mut self) -> Vec<u32> {
(0..self.read_vbyte()).map(|_| self.read_gamma()).collect()
}

pub fn read_str(&mut self) -> String {
String::from_utf8(
(0..self.read_gamma())
@@ -150,12 +146,10 @@ mod tests {
w.write_gamma(i);
});

for _ in 0..2 {
w.write_vbyte(3);
(1..4).for_each(|i| {
w.write_gamma(i);
});
}
w.write_vbyte(3);
(1..4).for_each(|i| {
w.write_gamma(i);
});

w.write_str("hello");
w.write_str("");
@@ -167,7 +161,6 @@
(1..100).for_each(|i| assert_eq!(i, r.read_vbyte()));
(1..100).for_each(|i| assert_eq!(i, r.read_gamma()));

assert_eq!(r.read_vbyte_gamma_vector(), [1, 2, 3]);
assert_eq!(r.read_vbyte_gamma_gap_vector(), [1, 3, 6]);

assert_eq!(r.read_str(), "hello");
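
Side note on the helper that remains: read_vbyte_gamma_gap_vector reads a vbyte count, then interprets the following gamma codes as gaps and returns their running sums, which is why the values 1, 2, 3 written above read back as [1, 3, 6]. A minimal standalone sketch of that prefix-sum step (the free function below is illustrative, not the reader's actual API):

// Decode gap-encoded values into absolute values by accumulating a running sum.
fn decode_gaps(gaps: &[u32]) -> Vec<u32> {
    gaps.iter()
        .scan(0u32, |running, gap| {
            *running += gap;
            Some(*running)
        })
        .collect()
}

// decode_gaps(&[1, 2, 3]) == vec![1, 3, 6]
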
28 changes: 19 additions & 9 deletions src/index/builder.rs
@@ -10,7 +10,7 @@ use rust_stemmers::Stemmer;
use tokenizers::Tokenizer;

use super::{
documents::write_doc_lentghts,
documents::{write_documents, Document},
postings::{write_postings, PostingEntry, PostingList},
text,
vocabulary::write_vocabulary,
@@ -27,24 +27,28 @@ pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, st
let index: InMemoryIndex = build_in_memory(input_dir, tokenizer, stemmer);
write_postings(&index, output_path);
write_vocabulary(&index, output_path);
write_doc_lentghts(&index.document_lengths, output_path);
write_documents(&index.documents, output_path);
}

fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex {
let documents: Vec<fs::DirEntry> = fs::read_dir(input_dir)
let files: Vec<fs::DirEntry> = fs::read_dir(input_dir)
.expect("error while retrieving input directory content")
.map(|p| p.unwrap())
.collect();

// document counter
let doc_id_mutex = Mutex::new(0);
let term_index_map = Mutex::new(HashMap::new());

// postings list
let postings: Mutex<Vec<PostingList>> = Mutex::new(Vec::new());
// word to postings index
let term_index_map = Mutex::new(HashMap::new());
// per-word doc id to posting list index
let term_doc_map: Mutex<Vec<HashMap<u32, usize>>> = Mutex::new(Vec::new());
// documents data
let documents = Mutex::new(Vec::new());

let document_lengths = Mutex::new(Vec::new());

documents
files
.into_par_iter()
.progress_with_style(
ProgressStyle::with_template(PROGRESS_STYLE)
@@ -57,13 +61,18 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->

let mut doc_id = doc_id_mutex.lock().unwrap();

document_lengths.lock().unwrap().push(tokens.len() as u32);
// update documents array
documents.lock().unwrap().push(Document {
path: d.path().to_str().unwrap().to_string(),
lenght: tokens.len() as u32,
});

let mut l_term_index_map = term_index_map.lock().unwrap();
let mut l_postings = postings.lock().unwrap();
let mut l_term_doc_map = term_doc_map.lock().unwrap();

for (word_pos, t) in tokens.iter().enumerate() {
// obtain postings for this word and increment collection frequency
if !l_term_index_map.contains_key(t) {
let idx = l_term_index_map.len();
l_term_index_map.insert(t.clone(), idx);
@@ -75,6 +84,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
let postings_list = &mut l_postings[term_index];
postings_list.collection_frequency += 1;

// obtain document entry for this word and update it
if !l_term_doc_map[term_index].contains_key(&doc_id) {
let idx = postings_list.documents.len();
l_term_doc_map[term_index].insert(*doc_id, idx);
@@ -105,6 +115,6 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
InMemoryIndex {
term_index_map: sorted_term_index_map,
postings: final_postings,
document_lengths: document_lengths.into_inner().unwrap(),
documents: documents.into_inner().unwrap(),
}
}
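
For orientation, the builder keeps a two-level lookup while indexing: term_index_map maps a term to the index of its postings list, and term_doc_map[term_index] maps a doc id to the position of that document's entry inside the list, so repeated occurrences of a term in the same document update one entry. A simplified, self-contained sketch of that bookkeeping (the struct and function names are illustrative, not the crate's types, and the real code guards everything with mutexes and also records token positions):

use std::collections::HashMap;

struct DocEntry {
    doc_id: u32,
    frequency: u32,
}

fn record_occurrence(
    term_index_map: &mut HashMap<String, usize>,
    term_doc_map: &mut Vec<HashMap<u32, usize>>,
    postings: &mut Vec<Vec<DocEntry>>,
    term: &str,
    doc_id: u32,
) {
    // first time this term is seen: allocate a postings list and a doc map for it
    let term_index = *term_index_map.entry(term.to_string()).or_insert_with(|| {
        postings.push(Vec::new());
        term_doc_map.push(HashMap::new());
        postings.len() - 1
    });

    // first time this (term, doc) pair is seen: allocate a document entry
    let entry_index = *term_doc_map[term_index].entry(doc_id).or_insert_with(|| {
        postings[term_index].push(DocEntry { doc_id, frequency: 0 });
        postings[term_index].len() - 1
    });

    // every further occurrence bumps the within-document frequency
    postings[term_index][entry_index].frequency += 1;
}
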
31 changes: 22 additions & 9 deletions src/index/documents.rs
@@ -1,20 +1,33 @@
use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};

use super::DOCUMENT_LENGHTS_EXTENSION;
use super::DOCUMENTS_EXTENSION;

pub fn write_doc_lentghts(document_lenghts: &Vec<u32>, output_path: &str) {
let doc_path = output_path.to_string() + DOCUMENT_LENGHTS_EXTENSION;
#[derive(Clone)]
pub struct Document {
pub path: String,
pub lenght: u32,
}

pub fn write_documents(documents: &Vec<Document>, output_path: &str) {
let doc_path = output_path.to_string() + DOCUMENTS_EXTENSION;
let mut doc_writer = BitsWriter::new(&doc_path);

doc_writer.write_vbyte(document_lenghts.len() as u32);
document_lenghts.iter().for_each(|l| {
doc_writer.write_gamma(*l);
doc_writer.write_vbyte(documents.len() as u32);
documents.iter().for_each(|l| {
doc_writer.write_str(&l.path);
doc_writer.write_vbyte(l.lenght);
});

doc_writer.flush();
}

pub fn load_document_lenghts(input_path: &str) -> Vec<u32> {
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENT_LENGHTS_EXTENSION));
reader.read_vbyte_gamma_vector()
pub fn load_documents(input_path: &str) -> Vec<Document> {
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENTS_EXTENSION));

(0..reader.read_vbyte())
.map(|_| Document {
path: reader.read_str(),
lenght: reader.read_vbyte(),
})
.collect()
}
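
The on-disk layout is therefore a vbyte document count followed, for each document, by its path as a string and its length as a vbyte. A minimal round-trip sketch, assuming the documents module is publicly reachable from the caller and that the base path is writable (the "/tmp/example_index" prefix is purely illustrative):

use crate::index::documents::{load_documents, write_documents, Document};

fn documents_round_trip() {
    let docs = vec![
        // the lenght field name follows the crate's own spelling
        Document { path: "docs/a.txt".to_string(), lenght: 120 },
        Document { path: "docs/b.txt".to_string(), lenght: 87 },
    ];

    // writes "/tmp/example_index.docs" next to the other index files
    write_documents(&docs, "/tmp/example_index");
    let loaded = load_documents("/tmp/example_index");

    assert_eq!(loaded.len(), 2);
    assert_eq!(loaded[0].path, "docs/a.txt");
    assert_eq!(loaded[1].lenght, 87);
}
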
25 changes: 18 additions & 7 deletions src/index/mod.rs
@@ -12,26 +12,27 @@ use tokenizers::Tokenizer;

use crate::disk::bits_reader::BitsReader;

use self::documents::Document;
use self::postings::PostingList;

pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENT_LENGHTS_EXTENSION: &str = ".doc_lengths";
pub const DOCUMENTS_EXTENSION: &str = ".docs";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";

pub struct Index {
term_to_index: FxHashMap<String, usize>,
postings: BitsReader,
term_offsets: Vec<u64>,
doc_lenghts: Vec<u32>,
documents: Vec<Document>,
tokenizer: Tokenizer,
stemmer: Stemmer,
}

pub struct InMemoryIndex {
term_index_map: BTreeMap<String, usize>,
postings: Vec<PostingList>,
document_lengths: Vec<u32>,
documents: Vec<Document>,
}

impl Index {
@@ -46,14 +47,22 @@ impl Index {
term_to_index: vocabulary::load_vocabulary(input_path),
postings: postings::build_postings_reader(input_path),
term_offsets: postings::load_offsets(input_path),
doc_lenghts: documents::load_document_lenghts(input_path),
documents: documents::load_documents(input_path),
tokenizer: text::load_tokenizer(tokenizer_path, false),
stemmer: text::load_stemmer(),
}
}

pub fn get_num_documents(&self) -> u32 {
self.doc_lenghts.len() as u32
self.documents.len() as u32
}

pub fn get_document_len(&self, doc_id: u32) -> u32 {
self.documents[doc_id as usize].lenght
}

pub fn get_document_path(&self, doc_id: u32) -> String {
self.documents[doc_id as usize].path.clone()
}

pub fn get_term(&mut self, term: &str) -> Option<postings::PostingList> {
@@ -81,12 +90,14 @@ impl Display for Index {

#[cfg(test)]
mod test {
use crate::test_utils::utils::create_temporary_dir_path;

use super::*;
use crate::test_utils::utils::create_temporary_dir_path;
use rayon::ThreadPoolBuilder;

#[test]
fn test_build() {
let _ = ThreadPoolBuilder::new().num_threads(1).build_global();

let index_path = &create_temporary_dir_path();

Index::build_index(
19 changes: 12 additions & 7 deletions src/main.rs
@@ -8,17 +8,22 @@ use search::query::QueryProcessor;

use indicatif::HumanDuration;

const NUM_RESULTS: usize = 1000000;
const NUM_RESULTS: usize = 1_000_000;

fn print_results(results: &[u32], elapsed_time: Duration) {
// println!("\nSearch Results:");
fn print_results(results: &[String], elapsed_time: Duration) {
if results.is_empty() {
println!("\nNo documents found\n");
return;
}

// for (i, doc_id) in results.iter().enumerate() {
// println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
// }
println!("\nTop 10 results:\n");

for (i, doc_id) in results.iter().take(10).enumerate() {
println!("\t{:2}. {}", i + 1, doc_id);
}

println!(
"\nFetched {} documents in {} ms",
"\nFetched {} documents in {} ms\n",
results.len(),
elapsed_time.as_millis()
);
16 changes: 13 additions & 3 deletions src/query/mod.rs
@@ -32,7 +32,7 @@ impl QueryProcessor {
}
}

pub fn query(&mut self, query: &str, num_results: usize) -> Vec<u32> {
pub fn query(&mut self, query: &str, num_results: usize) -> Vec<String> {
let mut scores: HashMap<u32, DocumentScore> = HashMap::new();

let tokens = self.index.tokenize_and_stem_query(query);
@@ -41,6 +41,8 @@
if let Some(postings) = self.index.get_term(token) {
let idf = (self.num_documents as f32 / postings.collection_frequency as f32).log2();

// for each term-doc pair, increment the document tf-idf score
// and record token positions for window computation
for doc_posting in &postings.documents {
let td_idf_score = doc_posting.document_frequency as f32 * idf;

@@ -64,13 +66,21 @@

let mut selector = DocumentSelector::new(num_results);
let num_tokens = tokens.len();
scores.iter().for_each(|(id, score)| {
scores.iter_mut().for_each(|(id, score)| {
// tf-idf score must be divided by the document len
score.tf_idf /= self.index.get_document_len(*id) as f32;

selector.push(*id, QueryProcessor::compute_score(score, num_tokens));
});

selector.get_sorted_ids()
selector
.get_sorted_ids()
.iter()
.map(|i| self.index.get_document_path(*i))
.collect()
}

// score takes into consideration the window size and tf-idf scoring
fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f32 {
let mut window = u32::MAX;

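
In short, the adjusted scoring keeps idf = log2(num_documents / collection_frequency) per query term, accumulates term-frequency-times-idf per candidate document, and now divides that sum by the document length before the window-based ranking. A standalone sketch of the arithmetic in the hunks above (the function and its (collection_frequency, term_frequency) pair layout are illustrative, not the QueryProcessor API):

// Length-normalized tf-idf for a single document: each pair holds a query
// term's collection frequency and its frequency inside this document.
fn length_normalized_tf_idf(
    term_stats: &[(u32, f32)],
    num_documents: u32,
    document_len: u32,
) -> f32 {
    let mut score = 0.0;
    for &(collection_frequency, term_frequency) in term_stats {
        let idf = (num_documents as f32 / collection_frequency as f32).log2();
        score += term_frequency * idf;
    }
    score / document_len as f32
}

// e.g. two query terms in a 100-token document, 1000 documents indexed:
// length_normalized_tf_idf(&[(10, 2.0), (50, 1.0)], 1000, 100)
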
