boolean queries and readme

tomfran · Jan 30, 2024 · 3772d96 · 3772d96
1 parent 4bfbbab
commit 3772d96
Show file tree

Hide file tree

Showing 10 changed files with 180 additions and 75 deletions.
diff --git a/.gitignore b/.gitignore
@@ -24,6 +24,4 @@ Cargo.lock
 # TODO
 todo.md
 
-/misc
-
 .DS_Store
diff --git a/README.md b/README.md
@@ -2,29 +2,53 @@
 
 Search engine written in Rust, based on an inverted index on disk.
 
-## Implementation status 
+## Commands
 
-**IO**
-- [x] Classes for writing and reading bit-streams;
-- [x] Proper strings writer and reader.
+**Index a new document collection**
 
-**Text preprocessing** 
-- [x] Tokenization;
-- [x] Stemming;
+```
+make cli folder=path/to/folder action=build min_f=1 max_p=0.99
+```
 
-**Index construction**
-- [x] In-memory datasets index construction;
-- [x] Proper vocabulary and paths on disk;
-- [x] Spelling correction index;
-- [x] Min and max frequency cutoffs.
+The `min_f` param filters out terms whose frequency is lower than `min_f`, while `max_p` filters out terms appearing
+in more than a `max_p` fraction of the documents (e.g. `0.99` means 99%).
 
-**Queries**
-- [x] BM25 scoring and query window; 
-- [ ] Boolean queries: in progress
+The folder param is a path to a folder with the following structure: 
+```
+├── docs
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── 3.txt
+└── index
+    ├── idx.alphas
+    ├── idx.docs
+    ├── idx.offsets
+    └── idx.postings
+```
 
-**Client**
-- [x] CLI;
-- [x] Web interface.
+The index folder will be created after the build command.
+
+**Load a document collection**
+
+You can load a pre-built index by running:
+
+```
+make web folder=path/to/folder
+```
+
+You can then visit `http://0.0.0.0:3000` to find a web interface to enter free text and boolean queries.
+
+![web.png](misc%2Fweb.png)
+
+**Query Syntax**
+
+You can perform Google-like free text queries; results will
+be ranked via [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) scoring.
+
+You can also specify boolean queries with the `"b: "` prefix, such as:
+```
+b: hello AND there OR NOT man
+```
 
 ## References
 [Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze

diff --git a/makefile b/makefile
@@ -1,8 +1,8 @@
 web:
-	cargo run --release --bin server $(index_name)
+	cargo run --release --bin server ${folder}
 
 cli:
-	cargo run --release --bin search $(index_name) ${action}
+	cargo run --release --bin search ${folder} ${action} ${min_f} ${max_p}
 
 test:
 	cargo test --release

diff --git a/misc/web.png b/misc/web.png
diff --git a/search/Cargo.toml b/search/Cargo.toml
@@ -13,3 +13,4 @@ fxhash = "0.2.1"
 tempdir = "0.3.7"
 regex = "1"
 argparse = "0.2.2"
+phf = { version = "0.11.2", features = ["macros"] }
diff --git a/search/src/engine/mod.rs b/search/src/engine/mod.rs
@@ -11,6 +11,7 @@ use self::heap::FixedMinHeap;
 use self::postings::{DocumentIdsList, Postings, PostingsList};
 use self::preprocessor::Preprocessor;
 use self::vocabulary::Vocabulary;
+use phf::phf_map;
 use std::cmp::min;
 use std::collections::{BTreeMap, HashMap};
 use std::time::Instant;
@@ -26,6 +27,14 @@ const BM25_SCORE_MULTIPLIER: f64 = 1.0;
 const BM25_KL: f64 = 1.2;
 const BM25_B: f64 = 0.75;
 
+static BOOLEAN_PRECEDENCE: phf::Map<&'static str, &u8> = phf_map! {
+    "NOT" => &3,
+    "AND" => &2,
+    "OR" => &1,
+    "(" => &0,
+    ")" => &0,
+};
+
 pub struct Engine {
     vocabulary: Vocabulary,
     postings: Postings,
@@ -39,19 +48,13 @@ pub struct InMemory {
     documents: Vec<Document>,
 }
 
-pub struct BooleanQueryResult {
-    pub postfix_query: Vec<String>,
-    pub documents_ids: DocumentIdsList,
-    pub time_ms: u128,
-}
-
-pub struct RankedQueryResult {
-    pub tokens: Vec<String>,
-    pub documents: Vec<RankedDocumentResult>,
+pub struct QueryResult {
+    pub query: Vec<String>,
+    pub documents: Vec<DocumentResult>,
     pub time_ms: u128,
 }
 
-pub struct RankedDocumentResult {
+pub struct DocumentResult {
     pub id: u32,
     pub path: String,
     pub score: f64,
@@ -88,15 +91,16 @@ impl Engine {
         }
     }
 
-    pub fn boolean_query(&mut self, postfix_expression: Vec<&str>) -> BooleanQueryResult {
+    pub fn boolean_query(&mut self, query: &str) -> QueryResult {
         let start_time = Instant::now();
 
         let mut stack = Vec::new();
         let mut intermediate_result;
         let num_docs = self.documents.get_num_documents();
 
-        for p in postfix_expression.clone() {
-            match p {
+        let query = Self::infix_to_postfix_boolean(query);
+        for p in query.clone() {
+            match p.as_str() {
                 "AND" => {
                     intermediate_result =
                         Postings::and_operator(stack.pop().unwrap(), stack.pop().unwrap());
@@ -111,7 +115,7 @@ impl Engine {
                 _ => {
                     intermediate_result = self
                         .vocabulary
-                        .spellcheck_term(p)
+                        .spellcheck_term(&p)
                         .and_then(|t| self.get_term_doc_ids(&t))
                         .unwrap_or_default();
                 }
@@ -120,16 +124,27 @@ impl Engine {
             stack.push(intermediate_result);
         }
 
+        let documents = stack
+            .pop()
+            .unwrap()
+            .iter()
+            .map(|i| DocumentResult {
+                id: *i,
+                path: self.documents.get_doc_path(*i),
+                score: 1.0,
+            })
+            .collect();
+
         let time_ms = start_time.elapsed().as_millis();
 
-        BooleanQueryResult {
-            postfix_query: postfix_expression.iter().map(|s| s.to_string()).collect(),
-            documents_ids: stack.pop().unwrap(),
+        QueryResult {
+            query,
+            documents,
             time_ms,
         }
     }
 
-    pub fn free_query(&mut self, query: &str, num_results: usize) -> RankedQueryResult {
+    pub fn free_query(&mut self, query: &str, num_results: usize) -> QueryResult {
         let start_time = Instant::now();
 
         let tokens: Vec<String> = self
@@ -184,7 +199,7 @@ impl Engine {
         let documents = selector
             .get_sorted_id_priority_pairs()
             .iter()
-            .map(|(id, score)| RankedDocumentResult {
+            .map(|(id, score)| DocumentResult {
                 id: *id,
                 score: *score,
                 path: self.documents.get_doc_path(*id),
@@ -193,23 +208,57 @@ impl Engine {
 
         let time_ms = start_time.elapsed().as_millis();
 
-        RankedQueryResult {
-            tokens,
+        QueryResult {
+            query: tokens,
             documents,
             time_ms,
         }
     }
 
-    fn get_term_postings(&mut self, term: &str) -> Option<PostingsList> {
+    fn get_term_doc_ids(&mut self, term: &str) -> Option<DocumentIdsList> {
         self.vocabulary
             .get_term_index(term)
-            .map(|i| self.postings.load_postings_list(i))
+            .map(|i| self.postings.load_doc_ids_list(i))
     }
 
-    fn get_term_doc_ids(&mut self, term: &str) -> Option<DocumentIdsList> {
+    fn infix_to_postfix_boolean(query: &str) -> Vec<String> {
+        let mut res = Vec::new();
+        let mut stack = Vec::new();
+
+        let sanitized_query = query.replace('(', " ( ").replace(')', " ) ");
+
+        for t in sanitized_query.split_ascii_whitespace() {
+            if t == "(" {
+                stack.push(t);
+            } else if t == ")" {
+                let mut last = stack.pop().unwrap();
+                while last != "(" {
+                    res.push(last);
+                    last = stack.pop().unwrap();
+                }
+            } else if let Some(current_precedence) = BOOLEAN_PRECEDENCE.get(t) {
+                while !stack.is_empty() {
+                    let last = stack.last().unwrap();
+                    if BOOLEAN_PRECEDENCE.get(last).unwrap() > current_precedence {
+                        res.push(stack.pop().unwrap());
+                    } else {
+                        break;
+                    }
+                }
+                stack.push(t);
+            } else {
+                res.push(t);
+            }
+        }
+
+        stack.iter().rev().for_each(|e| res.push(e));
+        res.iter().map(|s| (*s).to_string()).collect()
+    }
+
+    fn get_term_postings(&mut self, term: &str) -> Option<PostingsList> {
         self.vocabulary
             .get_term_index(term)
-            .map(|i| self.postings.load_doc_ids_list(i))
+            .map(|i| self.postings.load_postings_list(i))
     }
 
     fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f64 {
@@ -254,37 +303,59 @@ mod test {
     #[test]
     fn test_build() {
         let index_path = &create_temporary_dir_path();
-
         Engine::build_engine("test_data/docs", index_path, 1.0, 0);
-
         let mut idx = Engine::load_index(index_path);
 
         for ele in ["hello", "man", "world"] {
             assert!(idx.vocabulary.get_term_index(ele).is_some());
         }
 
-        let mut query: Vec<String> = idx
+        let mut free_query: Vec<String> = idx
             .free_query("hello", 10)
             .documents
             .iter()
             .map(|d| d.path.clone())
             .collect();
+        free_query.sort();
+
+        assert_eq!(free_query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]);
+
+        let mut boolean_query: Vec<String> = idx
+            .boolean_query("hello AND NOT world")
+            .documents
+            .iter()
+            .map(|d| d.path.clone())
+            .collect();
+        boolean_query.sort();
 
-        query.sort();
-
-        assert_eq!(query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]);
-
-        // println!(
-        //     "{:?}",
-        //     idx.boolean_query(vec!["hello", "man", "OR"]).documents_ids
-        // );
-        // println!(
-        //     "{:?}",
-        //     idx.boolean_query(vec!["hello", "man", "AND"]).documents_ids
-        // );
-        // println!(
-        //     "{:?}",
-        //     idx.boolean_query(vec!["man", "NOT"]).documents_ids[0]
-        // );
+        assert_eq!(boolean_query, ["test_data/docs/2.txt"]);
+    }
+
+    #[test]
+    fn test_infix_postfix() {
+        assert_eq!(
+            Engine::infix_to_postfix_boolean("a AND (b OR NOT c)"),
+            ["a", "b", "c", "NOT", "OR", "AND"]
+        );
+
+        assert_eq!(
+            Engine::infix_to_postfix_boolean("a AND b OR NOT c"),
+            ["a", "b", "AND", "c", "NOT", "OR"]
+        );
+
+        assert_eq!(
+            Engine::infix_to_postfix_boolean("NOT (a AND b) OR NOT (c OR d)"),
+            ["a", "b", "AND", "NOT", "c", "d", "OR", "NOT", "OR"]
+        );
+
+        assert_eq!(
+            Engine::infix_to_postfix_boolean("a AND b AND c OR d OR e"),
+            ["a", "b", "c", "AND", "AND", "d", "e", "OR", "OR"]
+        );
+
+        assert_eq!(
+            Engine::infix_to_postfix_boolean("a AND (b OR c)"),
+            ["a", "b", "c", "OR", "AND"]
+        );
     }
 }
diff --git a/search/src/engine/postings.rs b/search/src/engine/postings.rs
@@ -241,5 +241,8 @@ mod tests {
 
         let result_empty = Postings::not_operator(vec![], n);
         assert_eq!(result_empty, (1..=n).collect::<Vec<u32>>());
+
+        let result_full = Postings::not_operator(vec![0, 1, 2], 3);
+        assert_eq!(result_full, []);
     }
 }
diff --git a/search/src/main.rs b/search/src/main.rs
@@ -1,5 +1,5 @@
 use indicatif::HumanDuration;
-use search::engine::{Engine, RankedQueryResult};
+use search::engine::{Engine, QueryResult};
 use std::cmp::min;
 use std::env;
 use std::io::{self, Write};
@@ -9,8 +9,8 @@ use std::time::{Duration, Instant};
 const NUM_TOP_RESULTS: usize = 10;
 const NUM_RESULTS: usize = 100;
 
-fn print_results(result: &RankedQueryResult) {
-    println!("Search tokens: {:?}", result.tokens);
+fn print_results(result: &QueryResult) {
+    println!("Search tokens: {:?}", result.query);
 
     if result.documents.is_empty() {
         println!("\nNo documents found\n");
@@ -107,7 +107,11 @@ fn main() {
     loop {
         let query = read_line("> ");
 
-        let result = e.free_query(&query, NUM_RESULTS);
+        let result = if query.starts_with("b: ") {
+            e.boolean_query(&query.replace("b: ", ""))
+        } else {
+            e.free_query(&query, NUM_RESULTS)
+        };
 
         print_results(&result);
     }