Skip to content

Commit

Permalink
boolean queries and readme
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Jan 30, 2024
1 parent 4bfbbab commit 3772d96
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 75 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,4 @@ Cargo.lock
# TODO
todo.md

/misc

.DS_Store
60 changes: 42 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,53 @@

Search engine written in Rust, based on an inverted index on disk.

## Implementation status
## Commands

**IO**
- [x] Classes for writing and reading bit-streams;
- [x] Proper strings writer and reader.
**Index a new document collection**

**Text preprocessing**
- [x] Tokenization;
- [x] Stemming;
```
make cli folder=path/to/folder action=build min_f=1 max_p=0.99
```

**Index construction**
- [x] In-memory datasets index construction;
- [x] Proper vocabulary and paths on disk;
- [x] Spelling correction index;
- [x] Min and max frequency cutoffs.
The `min_f` param filters out terms with a frequency lower than `min_f`, while `max_p` filters out terms appearing in more than
`max_p` percentage of the documents.

**Queries**
- [x] BM25 scoring and query window;
- [ ] Boolean queries: in progress
The folder param is a path to a folder with the following structure:
```
├── docs
│ ├── 1.txt
│ ├── 2.txt
│ └── 3.txt
└── index
├── idx.alphas
├── idx.docs
├── idx.offsets
└── idx.postings
```

**Client**
- [x] CLI;
- [x] Web interface.
The index folder will be created after the build command.

**Load a document collection**

You can load a pre-built index by running:

```
make web folder=path/to/folder
```

You can then visit `http://0.0.0.0:3000` to find a web interface to enter free text and boolean queries.

![web.png](misc%2Fweb.png)

**Query Syntax**

You can perform Google-like free text queries; results will
be ranked via [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) scoring.

You can also specify boolean queries with the `"b: "` prefix, such as:
```
b: hello AND there OR NOT man
```

## References
[Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze
Expand Down
4 changes: 2 additions & 2 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
web:
cargo run --release --bin server $(index_name)
cargo run --release --bin server ${folder}

cli:
cargo run --release --bin search $(index_name) ${action}
cargo run --release --bin search ${folder} ${action} ${min_f} ${max_p}

test:
cargo test --release
Expand Down
Binary file added misc/web.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ fxhash = "0.2.1"
tempdir = "0.3.7"
regex = "1"
argparse = "0.2.2"
phf = { version = "0.11.2", features = ["macros"] }
159 changes: 115 additions & 44 deletions search/src/engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use self::heap::FixedMinHeap;
use self::postings::{DocumentIdsList, Postings, PostingsList};
use self::preprocessor::Preprocessor;
use self::vocabulary::Vocabulary;
use phf::phf_map;
use std::cmp::min;
use std::collections::{BTreeMap, HashMap};
use std::time::Instant;
Expand All @@ -26,6 +27,14 @@ const BM25_SCORE_MULTIPLIER: f64 = 1.0;
const BM25_KL: f64 = 1.2;
const BM25_B: f64 = 0.75;

// Compile-time map from boolean-query tokens to their precedence.
// A larger number binds tighter: NOT (3) > AND (2) > OR (1).
// Both parentheses map to 0, lower than every operator, so an operator
// compared against a "(" on the conversion stack never pops it.
// Consulted by `infix_to_postfix_boolean` to tell operators from operands
// and to decide when stacked operators are moved to the output.
static BOOLEAN_PRECEDENCE: phf::Map<&'static str, &u8> = phf_map! {
"NOT" => &3,
"AND" => &2,
"OR" => &1,
"(" => &0,
")" => &0,
};

pub struct Engine {
vocabulary: Vocabulary,
postings: Postings,
Expand All @@ -39,19 +48,13 @@ pub struct InMemory {
documents: Vec<Document>,
}

pub struct BooleanQueryResult {
pub postfix_query: Vec<String>,
pub documents_ids: DocumentIdsList,
pub time_ms: u128,
}

pub struct RankedQueryResult {
pub tokens: Vec<String>,
pub documents: Vec<RankedDocumentResult>,
pub struct QueryResult {
pub query: Vec<String>,
pub documents: Vec<DocumentResult>,
pub time_ms: u128,
}

pub struct RankedDocumentResult {
pub struct DocumentResult {
pub id: u32,
pub path: String,
pub score: f64,
Expand Down Expand Up @@ -88,15 +91,16 @@ impl Engine {
}
}

pub fn boolean_query(&mut self, postfix_expression: Vec<&str>) -> BooleanQueryResult {
pub fn boolean_query(&mut self, query: &str) -> QueryResult {
let start_time = Instant::now();

let mut stack = Vec::new();
let mut intermediate_result;
let num_docs = self.documents.get_num_documents();

for p in postfix_expression.clone() {
match p {
let query = Self::infix_to_postfix_boolean(query);
for p in query.clone() {
match p.as_str() {
"AND" => {
intermediate_result =
Postings::and_operator(stack.pop().unwrap(), stack.pop().unwrap());
Expand All @@ -111,7 +115,7 @@ impl Engine {
_ => {
intermediate_result = self
.vocabulary
.spellcheck_term(p)
.spellcheck_term(&p)
.and_then(|t| self.get_term_doc_ids(&t))
.unwrap_or_default();
}
Expand All @@ -120,16 +124,27 @@ impl Engine {
stack.push(intermediate_result);
}

let documents = stack
.pop()
.unwrap()
.iter()
.map(|i| DocumentResult {
id: *i,
path: self.documents.get_doc_path(*i),
score: 1.0,
})
.collect();

let time_ms = start_time.elapsed().as_millis();

BooleanQueryResult {
postfix_query: postfix_expression.iter().map(|s| s.to_string()).collect(),
documents_ids: stack.pop().unwrap(),
QueryResult {
query,
documents,
time_ms,
}
}

pub fn free_query(&mut self, query: &str, num_results: usize) -> RankedQueryResult {
pub fn free_query(&mut self, query: &str, num_results: usize) -> QueryResult {
let start_time = Instant::now();

let tokens: Vec<String> = self
Expand Down Expand Up @@ -184,7 +199,7 @@ impl Engine {
let documents = selector
.get_sorted_id_priority_pairs()
.iter()
.map(|(id, score)| RankedDocumentResult {
.map(|(id, score)| DocumentResult {
id: *id,
score: *score,
path: self.documents.get_doc_path(*id),
Expand All @@ -193,23 +208,57 @@ impl Engine {

let time_ms = start_time.elapsed().as_millis();

RankedQueryResult {
tokens,
QueryResult {
query: tokens,
documents,
time_ms,
}
}

fn get_term_postings(&mut self, term: &str) -> Option<PostingsList> {
fn get_term_doc_ids(&mut self, term: &str) -> Option<DocumentIdsList> {
self.vocabulary
.get_term_index(term)
.map(|i| self.postings.load_postings_list(i))
.map(|i| self.postings.load_doc_ids_list(i))
}

fn get_term_doc_ids(&mut self, term: &str) -> Option<DocumentIdsList> {
/// Converts an infix boolean query into postfix (reverse Polish) notation.
///
/// Parentheses are padded with spaces before splitting on ASCII whitespace,
/// so `"(a OR b)"` and `"( a OR b )"` tokenize identically. Tokens found in
/// `BOOLEAN_PRECEDENCE` are treated as operators; every other token is an
/// operand and is copied to the output in the order it appears.
///
/// A stacked operator is moved to the output only when its precedence is
/// strictly greater than the incoming operator's, so operators of equal
/// precedence stay stacked (e.g. `a AND b AND c` yields `a b c AND AND`, and
/// chained prefix `NOT`s remain behind their operand).
///
/// The query comes from user input, so malformed parentheses never panic:
/// a `)` without a matching `(` flushes the pending operators and is
/// otherwise ignored, and an unclosed `(` is dropped from the output.
fn infix_to_postfix_boolean(query: &str) -> Vec<String> {
    let mut postfix: Vec<String> = Vec::new();
    let mut operators: Vec<&str> = Vec::new();

    let sanitized_query = query.replace('(', " ( ").replace(')', " ) ");

    for t in sanitized_query.split_ascii_whitespace() {
        if t == "(" {
            operators.push(t);
        } else if t == ")" {
            // Unwind to the matching "(" (discarding it). If there is none,
            // the loop simply ends once the stack is empty.
            while let Some(op) = operators.pop() {
                if op == "(" {
                    break;
                }
                postfix.push(op.to_string());
            }
        } else if let Some(current_precedence) = BOOLEAN_PRECEDENCE.get(t) {
            while let Some(last) = operators.last() {
                // Everything on the stack is an operator or "(", so the
                // lookup is expected to succeed; treat a miss as "do not pop".
                let outranks = BOOLEAN_PRECEDENCE
                    .get(last)
                    .map_or(false, |p| p > current_precedence);
                if !outranks {
                    break;
                }
                if let Some(op) = operators.pop() {
                    postfix.push(op.to_string());
                }
            }
            operators.push(t);
        } else {
            postfix.push(t.to_string());
        }
    }

    // Flush the remaining operators, top of stack first; any "(" still
    // present was never closed and is not a valid postfix token.
    postfix.extend(
        operators
            .into_iter()
            .rev()
            .filter(|op| *op != "(")
            .map(String::from),
    );

    postfix
}

fn get_term_postings(&mut self, term: &str) -> Option<PostingsList> {
self.vocabulary
.get_term_index(term)
.map(|i| self.postings.load_doc_ids_list(i))
.map(|i| self.postings.load_postings_list(i))
}

fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f64 {
Expand Down Expand Up @@ -254,37 +303,59 @@ mod test {
#[test]
fn test_build() {
let index_path = &create_temporary_dir_path();

Engine::build_engine("test_data/docs", index_path, 1.0, 0);

let mut idx = Engine::load_index(index_path);

for ele in ["hello", "man", "world"] {
assert!(idx.vocabulary.get_term_index(ele).is_some());
}

let mut query: Vec<String> = idx
let mut free_query: Vec<String> = idx
.free_query("hello", 10)
.documents
.iter()
.map(|d| d.path.clone())
.collect();
free_query.sort();

assert_eq!(free_query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]);

let mut boolean_query: Vec<String> = idx
.boolean_query("hello AND NOT world")
.documents
.iter()
.map(|d| d.path.clone())
.collect();
boolean_query.sort();

query.sort();

assert_eq!(query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]);

// println!(
// "{:?}",
// idx.boolean_query(vec!["hello", "man", "OR"]).documents_ids
// );
// println!(
// "{:?}",
// idx.boolean_query(vec!["hello", "man", "AND"]).documents_ids
// );
// println!(
// "{:?}",
// idx.boolean_query(vec!["man", "NOT"]).documents_ids[0]
// );
assert_eq!(boolean_query, ["test_data/docs/2.txt"]);
}

#[test]
fn test_infix_postfix() {
    // Checks that one infix query converts to the expected postfix tokens.
    fn assert_postfix(infix: &str, expected: &[&str]) {
        assert_eq!(Engine::infix_to_postfix_boolean(infix), expected);
    }

    // Parenthesized sub-expression containing a unary NOT.
    assert_postfix("a AND (b OR NOT c)", &["a", "b", "c", "NOT", "OR", "AND"]);

    // Same operators without parentheses: precedence alone decides.
    assert_postfix("a AND b OR NOT c", &["a", "b", "AND", "c", "NOT", "OR"]);

    // NOT applied to whole parenthesized groups.
    assert_postfix(
        "NOT (a AND b) OR NOT (c OR d)",
        &["a", "b", "AND", "NOT", "c", "d", "OR", "NOT", "OR"],
    );

    // Chains of equal-precedence operators.
    assert_postfix(
        "a AND b AND c OR d OR e",
        &["a", "b", "c", "AND", "AND", "d", "e", "OR", "OR"],
    );

    // Parentheses overriding the default AND-before-OR order.
    assert_postfix("a AND (b OR c)", &["a", "b", "c", "OR", "AND"]);
}
}
3 changes: 3 additions & 0 deletions search/src/engine/postings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,5 +241,8 @@ mod tests {

let result_empty = Postings::not_operator(vec![], n);
assert_eq!(result_empty, (1..=n).collect::<Vec<u32>>());

let result_full = Postings::not_operator(vec![0, 1, 2], 3);
assert_eq!(result_full, []);
}
}
12 changes: 8 additions & 4 deletions search/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use indicatif::HumanDuration;
use search::engine::{Engine, RankedQueryResult};
use search::engine::{Engine, QueryResult};
use std::cmp::min;
use std::env;
use std::io::{self, Write};
Expand All @@ -9,8 +9,8 @@ use std::time::{Duration, Instant};
const NUM_TOP_RESULTS: usize = 10;
const NUM_RESULTS: usize = 100;

fn print_results(result: &RankedQueryResult) {
println!("Search tokens: {:?}", result.tokens);
fn print_results(result: &QueryResult) {
println!("Search tokens: {:?}", result.query);

if result.documents.is_empty() {
println!("\nNo documents found\n");
Expand Down Expand Up @@ -107,7 +107,11 @@ fn main() {
loop {
let query = read_line("> ");

let result = e.free_query(&query, NUM_RESULTS);
let result = if query.starts_with("b: ") {
e.boolean_query(&query.replace("b: ", ""))
} else {
e.free_query(&query, NUM_RESULTS)
};

print_results(&result);
}
Expand Down
Loading

0 comments on commit 3772d96

Please sign in to comment.