Skip to content

Commit

Permalink
boolean queries start
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Jan 30, 2024
1 parent f7ea00c commit 4bfbbab
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 71 deletions.
9 changes: 2 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,8 @@ Search engine written in Rust, based on an inverted index on disk.
- [x] Min and max frequency cutoffs.

**Queries**
- [x] BM25 scoring;
- [x] Query window

**Evaluation**
- [ ] Query speed;
- [ ] Query quality;
- [ ] Disk overhead.
- [x] BM25 scoring and query window;
- [ ] Boolean queries: in progress

**Client**
- [x] CLI;
Expand Down
18 changes: 8 additions & 10 deletions search/src/engine/builder.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::{
documents::{Document, Documents},
postings::{PostingEntry, PostingList, Postings},
postings::{Posting, Postings, PostingsList},
preprocessor::Preprocessor,
vocabulary::Vocabulary,
InMemory,
Expand Down Expand Up @@ -49,7 +49,7 @@ fn build_in_memory(
// document counter
let doc_id_mutex = Mutex::new(0);
// postings list
let postings: Mutex<Vec<PostingList>> = Mutex::new(Vec::new());
let postings: Mutex<Vec<PostingsList>> = Mutex::new(Vec::new());
// word to postings index
let term_index_map = Mutex::new(HashMap::new());
// per-word doc id to posting list index
Expand Down Expand Up @@ -85,23 +85,21 @@ fn build_in_memory(
if !l_term_index_map.contains_key(t) {
let idx = l_term_index_map.len();
l_term_index_map.insert(t.clone(), idx);
l_postings.push(PostingList::default());
l_postings.push(PostingsList::new());
l_term_doc_map.push(HashMap::new());
}
let term_index = *l_term_index_map.get(t).unwrap();

let postings_list = &mut l_postings[term_index];
postings_list.collection_frequency += 1;

// obtain document entry for this word and update it
let postings_list = &mut l_postings[term_index];
if !l_term_doc_map[term_index].contains_key(&doc_id) {
let idx = postings_list.documents.len();
let idx = postings_list.len();
l_term_doc_map[term_index].insert(*doc_id, idx);
postings_list.documents.push(PostingEntry::default());
postings_list.push(Posting::default());
}
let posting_entry_index = *l_term_doc_map[term_index].get(&doc_id).unwrap();

let posting_entry = &mut postings_list.documents[posting_entry_index];
let posting_entry = &mut postings_list[posting_entry_index];

posting_entry.document_frequency += 1;
posting_entry.document_id = *doc_id;
Expand All @@ -120,7 +118,7 @@ fn build_in_memory(
.unwrap()
.into_iter()
.filter(|(_, v)| {
let f = final_postings[*v].collection_frequency;
let f = final_postings[*v].len() as u32;
f <= frequency_threshold && f > min_freq_threshold
})
.collect();
Expand Down
90 changes: 78 additions & 12 deletions search/src/engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mod vocabulary;

use self::documents::{Document, Documents};
use self::heap::FixedMinHeap;
use self::postings::{PostingList, Postings};
use self::postings::{DocumentIdsList, Postings, PostingsList};
use self::preprocessor::Preprocessor;
use self::vocabulary::Vocabulary;
use std::cmp::min;
Expand All @@ -35,17 +35,23 @@ pub struct Engine {

pub struct InMemory {
term_index_map: BTreeMap<String, usize>,
postings: Vec<PostingList>,
postings: Vec<PostingsList>,
documents: Vec<Document>,
}

pub struct QueryResult {
pub struct BooleanQueryResult {
pub postfix_query: Vec<String>,
pub documents_ids: DocumentIdsList,
pub time_ms: u128,
}

pub struct RankedQueryResult {
pub tokens: Vec<String>,
pub documents: Vec<DocumentResult>,
pub documents: Vec<RankedDocumentResult>,
pub time_ms: u128,
}

pub struct DocumentResult {
pub struct RankedDocumentResult {
pub id: u32,
pub path: String,
pub score: f64,
Expand Down Expand Up @@ -82,7 +88,48 @@ impl Engine {
}
}

pub fn query(&mut self, query: &str, num_results: usize) -> QueryResult {
/// Evaluates a boolean query expressed in postfix (reverse Polish) notation.
///
/// Operand tokens are spellchecked against the vocabulary and resolved to
/// their document-id lists; the operators "AND", "OR" and "NOT" pop their
/// operand(s) off an evaluation stack and push the combined result back.
/// Unknown terms resolve to an empty id list via `unwrap_or_default`.
///
/// # Panics
/// Panics if the expression is malformed: an operator finds too few
/// operands on the stack, or the expression leaves the stack empty.
pub fn boolean_query(&mut self, postfix_expression: Vec<&str>) -> BooleanQueryResult {
    let start_time = Instant::now();

    let mut stack = Vec::new();
    // needed by NOT to complement a doc-id list against the whole collection
    let num_docs = self.documents.get_num_documents();

    // Standard postfix evaluation; borrow the expression instead of cloning
    // it, since it is still needed below to build the result.
    for p in &postfix_expression {
        let result = match *p {
            "AND" => Postings::and_operator(stack.pop().unwrap(), stack.pop().unwrap()),
            "OR" => Postings::or_operator(stack.pop().unwrap(), stack.pop().unwrap()),
            "NOT" => Postings::not_operator(stack.pop().unwrap(), num_docs),
            term => self
                .vocabulary
                .spellcheck_term(term)
                .and_then(|t| self.get_term_doc_ids(&t))
                .unwrap_or_default(),
        };
        stack.push(result);
    }

    let time_ms = start_time.elapsed().as_millis();

    BooleanQueryResult {
        postfix_query: postfix_expression.iter().map(|s| s.to_string()).collect(),
        documents_ids: stack.pop().unwrap(),
        time_ms,
    }
}

pub fn free_query(&mut self, query: &str, num_results: usize) -> RankedQueryResult {
let start_time = Instant::now();

let tokens: Vec<String> = self
Expand All @@ -102,10 +149,10 @@ impl Engine {
// compute idf where n is the number of documents and
// nq the number of documents containing query term

let nq = postings.collection_frequency as f64;
let nq = self.vocabulary.get_term_frequency(token).unwrap() as f64;
let idf = ((n - nq + 0.5) / (nq + 0.5) + 1.0).ln();

for doc_posting in &postings.documents {
for doc_posting in &postings {
// compute the BM25 score, where fq is the frequency of the term in this document,
// dl is the document length, and avgdl is the average document length across the collection

Expand Down Expand Up @@ -137,7 +184,7 @@ impl Engine {
let documents = selector
.get_sorted_id_priority_pairs()
.iter()
.map(|(id, score)| DocumentResult {
.map(|(id, score)| RankedDocumentResult {
id: *id,
score: *score,
path: self.documents.get_doc_path(*id),
Expand All @@ -146,19 +193,25 @@ impl Engine {

let time_ms = start_time.elapsed().as_millis();

QueryResult {
RankedQueryResult {
tokens,
documents,
time_ms,
}
}

fn get_term_postings(&mut self, term: &str) -> Option<PostingList> {
fn get_term_postings(&mut self, term: &str) -> Option<PostingsList> {
self.vocabulary
.get_term_index(term)
.map(|i| self.postings.load_postings_list(i))
}

/// Looks `term` up in the vocabulary and, when it is present, loads the
/// list of document ids from its on-disk postings; `None` for unknown terms.
fn get_term_doc_ids(&mut self, term: &str) -> Option<DocumentIdsList> {
    let term_index = self.vocabulary.get_term_index(term)?;
    Some(self.postings.load_doc_ids_list(term_index))
}

fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f64 {
let mut window = u32::MAX;

Expand Down Expand Up @@ -211,7 +264,7 @@ mod test {
}

let mut query: Vec<String> = idx
.query("hello", 10)
.free_query("hello", 10)
.documents
.iter()
.map(|d| d.path.clone())
Expand All @@ -220,5 +273,18 @@ mod test {
query.sort();

assert_eq!(query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]);

// println!(
// "{:?}",
// idx.boolean_query(vec!["hello", "man", "OR"]).documents_ids
// );
// println!(
// "{:?}",
// idx.boolean_query(vec!["hello", "man", "AND"]).documents_ids
// );
// println!(
// "{:?}",
// idx.boolean_query(vec!["man", "NOT"]).documents_ids[0]
// );
}
}
Loading

0 comments on commit 4bfbbab

Please sign in to comment.