Skip to content

Commit

Permalink
simplified structure, bm25 instead of tfidf
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Jan 27, 2024
1 parent 5cc4bfe commit 58ad091
Show file tree
Hide file tree
Showing 16 changed files with 293 additions and 328 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ Search engine written in Rust, based on an inverted index on disk.
**Index construction**
- [x] In-memory datasets index construction;
- [x] Proper vocabulary and paths on disk;
- [ ] Spelling correction index: in progress.
- [x] Spelling correction index;

**Queries**
- [x] Tf-idf ranked retrieval;
- [x] BM25 scoring;
- [x] Window computation;

**Evaluation**
Expand Down
4 changes: 2 additions & 2 deletions search/src/index/builder.rs → search/src/engine/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const PROGRESS_CHARS: &str = "=> ";

const CUTOFF_THRESHOLD: f64 = 0.8;

pub fn build_index(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
pub fn build_engine(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
let index: InMemory = build_in_memory(input_dir, preprocessor);
Postings::write_postings(&index, output_path);
Vocabulary::write_vocabulary(&index, output_path);
Expand Down Expand Up @@ -59,7 +59,7 @@ fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory {
// update documents array
documents.lock().unwrap().push(Document {
path: d.path().to_str().unwrap().to_string(),
lenght: tokens.len() as u32,
length: tokens.len() as u32,
});

let mut l_term_index_map = term_index_map.lock().unwrap();
Expand Down
45 changes: 28 additions & 17 deletions search/src/index/documents.rs → search/src/engine/documents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,39 @@ use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};
#[derive(Clone)]
pub struct Document {
pub path: String,
pub lenght: u32,
pub length: u32,
}

pub struct Documents {
docs: Vec<Document>,
avg_len: f64,
}

impl Documents {
pub fn load_documents(input_path: &str) -> Documents {
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENTS_EXTENSION));

let mut prev = String::new();
let docs = (0..reader.read_vbyte())

let mut length_sum = 0;

let docs: Vec<Document> = (0..reader.read_vbyte())
.map(|_| {
let p_len = reader.read_gamma();
let prefix: String = prev.chars().take(p_len as usize).collect();
let s = prefix + &reader.read_str();
prev = s.clone();
let path = prefix + &reader.read_str();
prev = path.clone();

let length = reader.read_vbyte();
length_sum += length;

Document {
path: s,
lenght: reader.read_vbyte(),
}
Document { path, length }
})
.collect();

Documents { docs }
let avg_len = length_sum as f64 / docs.len() as f64;

Documents { docs, avg_len }
}

pub fn write_documents(documents: &Vec<Document>, output_path: &str) {
Expand All @@ -47,7 +53,7 @@ impl Documents {
prev = &l.path;

writer.write_str(&remaining);
writer.write_vbyte(l.lenght);
writer.write_vbyte(l.length);
}

writer.flush();
Expand All @@ -58,7 +64,11 @@ impl Documents {
}

pub fn get_doc_len(&self, doc_id: u32) -> u32 {
self.docs[doc_id as usize].lenght
self.docs[doc_id as usize].length
}

pub fn get_avg_doc_len(&self) -> f64 {
self.avg_len
}

pub fn get_doc_path(&self, doc_id: u32) -> String {
Expand All @@ -79,11 +89,11 @@ mod tests {
let documents = vec![
Document {
path: "document1.txt".to_string(),
lenght: 100,
length: 100,
},
Document {
path: "document2.txt".to_string(),
lenght: 150,
length: 150,
},
];

Expand All @@ -94,7 +104,7 @@ mod tests {

for (i, d) in documents.iter().enumerate() {
assert_eq!(loaded_documents.get_doc_path(i as u32), d.path);
assert_eq!(loaded_documents.get_doc_len(i as u32), d.lenght);
assert_eq!(loaded_documents.get_doc_len(i as u32), d.length);
}
}

Expand All @@ -103,23 +113,24 @@ mod tests {
let documents = vec![
Document {
path: "document1.txt".to_string(),
lenght: 100,
length: 100,
},
Document {
path: "document2.txt".to_string(),
lenght: 150,
length: 150,
},
];

let doc_collection = Documents {
docs: documents.clone(),
avg_len: 125.0,
};

assert_eq!(doc_collection.get_num_documents(), documents.len() as u32);

for (i, d) in documents.iter().enumerate() {
assert_eq!(doc_collection.get_doc_path(i as u32), d.path);
assert_eq!(doc_collection.get_doc_len(i as u32), d.lenght);
assert_eq!(doc_collection.get_doc_len(i as u32), d.length);
}
}
}
48 changes: 23 additions & 25 deletions search/src/query/document_selector.rs → search/src/engine/heap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ use std::{cmp::Ordering, collections::BinaryHeap};
#[derive(Debug)]
pub struct Entry {
pub id: u32,
pub score: f64,
pub priority: f64,
}

impl PartialEq for Entry {
fn eq(&self, other: &Self) -> bool {
self.score == other.score
self.priority == other.priority
}
}

Expand All @@ -23,35 +23,41 @@ impl PartialOrd for Entry {
impl Ord for Entry {
fn cmp(&self, other: &Self) -> Ordering {
other
.score
.partial_cmp(&self.score)
.priority
.partial_cmp(&self.priority)
.unwrap_or(Ordering::Equal)
}
}

pub struct DocumentSelector {
pub struct FixedMinHeap {
heap: BinaryHeap<Entry>,
capacity: usize,
}

impl DocumentSelector {
pub fn new(capacity: usize) -> DocumentSelector {
DocumentSelector {
impl FixedMinHeap {
pub fn new(capacity: usize) -> FixedMinHeap {
FixedMinHeap {
heap: BinaryHeap::new(),
capacity,
}
}

pub fn push(&mut self, id: u32, score: f64) {
self.heap.push(Entry { id, score });
self.heap.push(Entry {
id,
priority: score,
});

if self.heap.len() > self.capacity {
self.heap.pop();
}
}

pub fn get_sorted_entries(&mut self) -> Vec<Entry> {
let mut res: Vec<Entry> = (0..self.capacity).filter_map(|_| self.heap.pop()).collect();
pub fn get_sorted_id_priority_pairs(&mut self) -> Vec<(u32, f64)> {
let mut res: Vec<(u32, f64)> = (0..self.capacity)
.filter_map(|_| self.heap.pop().map(|e| (e.id, e.priority)))
.collect();

res.reverse();
res
}
Expand All @@ -63,37 +69,29 @@ mod test {

#[test]
fn test_top_k() {
let mut selector = DocumentSelector::new(2);
let mut selector = FixedMinHeap::new(2);

selector.push(2, 0.4);
selector.push(3, 0.3);
selector.push(1, 0.5);
selector.push(4, 0.2);

assert_eq!(
selector
.get_sorted_entries()
.iter()
.map(|e| e.id)
.collect::<Vec<_>>(),
[1, 2]
selector.get_sorted_id_priority_pairs(),
[(1, 0.5), (2, 0.4)]
);
}

#[test]
fn test_top_less_than_k() {
let mut selector = DocumentSelector::new(3);
let mut selector = FixedMinHeap::new(3);

selector.push(1, 0.5);
selector.push(2, 0.4);

assert_eq!(
selector
.get_sorted_entries()
.iter()
.map(|e| e.id)
.collect::<Vec<_>>(),
[1, 2]
selector.get_sorted_id_priority_pairs(),
[(1, 0.5), (2, 0.4)]
);
}
}
Loading

0 comments on commit 58ad091

Please sign in to comment.