simplified structure, bm25 instead of tfidf

tomfran · Jan 27, 2024 · 58ad091 · 58ad091
1 parent 5cc4bfe
commit 58ad091
Show file tree

Hide file tree

Showing 16 changed files with 293 additions and 328 deletions.
diff --git a/README.md b/README.md
@@ -16,10 +16,10 @@ Search engine written in Rust, based on an inverted index on disk.
 **Index construction**
 - [x] In-memory datasets index construction;
 - [x] Proper vocabulary and paths on disk;
-- [ ] Spelling correction index: in progress.
+- [x] Spelling correction index;
 
 **Queries**
-- [x] Tf-idf ranked retrieval;
+- [x] BM25 scoring;
 - [x] Window computation;
 
 **Evaluation**

diff --git a/search/src/index/builder.rs → search/src/engine/builder.rs b/search/src/index/builder.rs → search/src/engine/builder.rs
@@ -19,7 +19,7 @@ const PROGRESS_CHARS: &str = "=> ";
 
 const CUTOFF_THRESHOLD: f64 = 0.8;
 
-pub fn build_index(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
+pub fn build_engine(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
     let index: InMemory = build_in_memory(input_dir, preprocessor);
     Postings::write_postings(&index, output_path);
     Vocabulary::write_vocabulary(&index, output_path);
@@ -59,7 +59,7 @@ fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory {
             // update documents array
             documents.lock().unwrap().push(Document {
                 path: d.path().to_str().unwrap().to_string(),
-                lenght: tokens.len() as u32,
+                length: tokens.len() as u32,
             });
 
             let mut l_term_index_map = term_index_map.lock().unwrap();

diff --git a/search/src/index/documents.rs → search/src/engine/documents.rs b/search/src/index/documents.rs → search/src/engine/documents.rs
@@ -4,33 +4,39 @@ use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};
 #[derive(Clone)]
 pub struct Document {
     pub path: String,
-    pub lenght: u32,
+    pub length: u32,
 }
 
 pub struct Documents {
     docs: Vec<Document>,
+    avg_len: f64,
 }
 
 impl Documents {
     pub fn load_documents(input_path: &str) -> Documents {
         let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENTS_EXTENSION));
 
         let mut prev = String::new();
-        let docs = (0..reader.read_vbyte())
+
+        let mut length_sum = 0;
+
+        let docs: Vec<Document> = (0..reader.read_vbyte())
             .map(|_| {
                 let p_len = reader.read_gamma();
                 let prefix: String = prev.chars().take(p_len as usize).collect();
-                let s = prefix + &reader.read_str();
-                prev = s.clone();
+                let path = prefix + &reader.read_str();
+                prev = path.clone();
+
+                let length = reader.read_vbyte();
+                length_sum += length;
 
-                Document {
-                    path: s,
-                    lenght: reader.read_vbyte(),
-                }
+                Document { path, length }
             })
             .collect();
 
-        Documents { docs }
+        let avg_len = length_sum as f64 / docs.len() as f64;
+
+        Documents { docs, avg_len }
     }
 
     pub fn write_documents(documents: &Vec<Document>, output_path: &str) {
@@ -47,7 +53,7 @@ impl Documents {
             prev = &l.path;
 
             writer.write_str(&remaining);
-            writer.write_vbyte(l.lenght);
+            writer.write_vbyte(l.length);
         }
 
         writer.flush();
@@ -58,7 +64,11 @@ impl Documents {
     }
 
     pub fn get_doc_len(&self, doc_id: u32) -> u32 {
-        self.docs[doc_id as usize].lenght
+        self.docs[doc_id as usize].length
+    }
+
+    pub fn get_avg_doc_len(&self) -> f64 {
+        self.avg_len
     }
 
     pub fn get_doc_path(&self, doc_id: u32) -> String {
@@ -79,11 +89,11 @@ mod tests {
         let documents = vec![
             Document {
                 path: "document1.txt".to_string(),
-                lenght: 100,
+                length: 100,
             },
             Document {
                 path: "document2.txt".to_string(),
-                lenght: 150,
+                length: 150,
             },
         ];
 
@@ -94,7 +104,7 @@ mod tests {
 
         for (i, d) in documents.iter().enumerate() {
             assert_eq!(loaded_documents.get_doc_path(i as u32), d.path);
-            assert_eq!(loaded_documents.get_doc_len(i as u32), d.lenght);
+            assert_eq!(loaded_documents.get_doc_len(i as u32), d.length);
         }
     }
 
@@ -103,23 +113,24 @@ mod tests {
         let documents = vec![
             Document {
                 path: "document1.txt".to_string(),
-                lenght: 100,
+                length: 100,
             },
             Document {
                 path: "document2.txt".to_string(),
-                lenght: 150,
+                length: 150,
             },
         ];
 
         let doc_collection = Documents {
             docs: documents.clone(),
+            avg_len: 125.0,
         };
 
         assert_eq!(doc_collection.get_num_documents(), documents.len() as u32);
 
         for (i, d) in documents.iter().enumerate() {
             assert_eq!(doc_collection.get_doc_path(i as u32), d.path);
-            assert_eq!(doc_collection.get_doc_len(i as u32), d.lenght);
+            assert_eq!(doc_collection.get_doc_len(i as u32), d.length);
         }
     }
 }
diff --git a/search/src/query/document_selector.rs → search/src/engine/heap.rs b/search/src/query/document_selector.rs → search/src/engine/heap.rs
@@ -3,12 +3,12 @@ use std::{cmp::Ordering, collections::BinaryHeap};
 #[derive(Debug)]
 pub struct Entry {
     pub id: u32,
-    pub score: f64,
+    pub priority: f64,
 }
 
 impl PartialEq for Entry {
     fn eq(&self, other: &Self) -> bool {
-        self.score == other.score
+        self.priority == other.priority
     }
 }
 
@@ -23,35 +23,41 @@ impl PartialOrd for Entry {
 impl Ord for Entry {
     fn cmp(&self, other: &Self) -> Ordering {
         other
-            .score
-            .partial_cmp(&self.score)
+            .priority
+            .partial_cmp(&self.priority)
             .unwrap_or(Ordering::Equal)
     }
 }
 
-pub struct DocumentSelector {
+pub struct FixedMinHeap {
     heap: BinaryHeap<Entry>,
     capacity: usize,
 }
 
-impl DocumentSelector {
-    pub fn new(capacity: usize) -> DocumentSelector {
-        DocumentSelector {
+impl FixedMinHeap {
+    pub fn new(capacity: usize) -> FixedMinHeap {
+        FixedMinHeap {
             heap: BinaryHeap::new(),
             capacity,
         }
     }
 
     pub fn push(&mut self, id: u32, score: f64) {
-        self.heap.push(Entry { id, score });
+        self.heap.push(Entry {
+            id,
+            priority: score,
+        });
 
         if self.heap.len() > self.capacity {
             self.heap.pop();
         }
     }
 
-    pub fn get_sorted_entries(&mut self) -> Vec<Entry> {
-        let mut res: Vec<Entry> = (0..self.capacity).filter_map(|_| self.heap.pop()).collect();
+    pub fn get_sorted_id_priority_pairs(&mut self) -> Vec<(u32, f64)> {
+        let mut res: Vec<(u32, f64)> = (0..self.capacity)
+            .filter_map(|_| self.heap.pop().map(|e| (e.id, e.priority)))
+            .collect();
+
         res.reverse();
         res
     }
@@ -63,37 +69,29 @@ mod test {
 
     #[test]
     fn test_top_k() {
-        let mut selector = DocumentSelector::new(2);
+        let mut selector = FixedMinHeap::new(2);
 
         selector.push(2, 0.4);
         selector.push(3, 0.3);
         selector.push(1, 0.5);
         selector.push(4, 0.2);
 
         assert_eq!(
-            selector
-                .get_sorted_entries()
-                .iter()
-                .map(|e| e.id)
-                .collect::<Vec<_>>(),
-            [1, 2]
+            selector.get_sorted_id_priority_pairs(),
+            [(1, 0.5), (2, 0.4)]
         );
     }
 
     #[test]
     fn test_top_less_than_k() {
-        let mut selector = DocumentSelector::new(3);
+        let mut selector = FixedMinHeap::new(3);
 
         selector.push(1, 0.5);
         selector.push(2, 0.4);
 
         assert_eq!(
-            selector
-                .get_sorted_entries()
-                .iter()
-                .map(|e| e.id)
-                .collect::<Vec<_>>(),
-            [1, 2]
+            selector.get_sorted_id_priority_pairs(),
+            [(1, 0.5), (2, 0.4)]
         );
     }
 }