document paths on disk, adjusted tfidf scoring
tomfran committed Jan 18, 2024
1 parent 6484a9d commit 723488d
Showing 6 changed files with 88 additions and 46 deletions.
15 changes: 4 additions & 11 deletions src/disk/bits_reader.rs
@@ -73,10 +73,6 @@ impl BitsReader {
.collect()
}

pub fn read_vbyte_gamma_vector(&mut self) -> Vec<u32> {
(0..self.read_vbyte()).map(|_| self.read_gamma()).collect()
}

pub fn read_str(&mut self) -> String {
String::from_utf8(
(0..self.read_gamma())
@@ -150,12 +146,10 @@ mod tests {
w.write_gamma(i);
});

for _ in 0..2 {
w.write_vbyte(3);
(1..4).for_each(|i| {
w.write_gamma(i);
});
}
w.write_vbyte(3);
(1..4).for_each(|i| {
w.write_gamma(i);
});

w.write_str("hello");
w.write_str("");
@@ -167,7 +161,6 @@
(1..100).for_each(|i| assert_eq!(i, r.read_vbyte()));
(1..100).for_each(|i| assert_eq!(i, r.read_gamma()));

assert_eq!(r.read_vbyte_gamma_vector(), [1, 2, 3]);
assert_eq!(r.read_vbyte_gamma_gap_vector(), [1, 3, 6]);

assert_eq!(r.read_str(), "hello");
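
Side note on the helper that remains: read_vbyte_gamma_gap_vector reads a vbyte count, then interprets the following gamma codes as gaps and returns their running sums, which is why the values 1, 2, 3 written above read back as [1, 3, 6]. A minimal standalone sketch of that prefix-sum step (the free function below is illustrative, not the reader's actual API):

// Decode gap-encoded values into absolute values by accumulating a running sum.
fn decode_gaps(gaps: &[u32]) -> Vec<u32> {
    gaps.iter()
        .scan(0u32, |running, gap| {
            *running += gap;
            Some(*running)
        })
        .collect()
}

// decode_gaps(&[1, 2, 3]) == vec![1, 3, 6]
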
28 changes: 19 additions & 9 deletions src/index/builder.rs
@@ -10,7 +10,7 @@ use rust_stemmers::Stemmer;
use tokenizers::Tokenizer;

use super::{
documents::write_doc_lentghts,
documents::{write_documents, Document},
postings::{write_postings, PostingEntry, PostingList},
text,
vocabulary::write_vocabulary,
@@ -27,24 +27,28 @@ pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, st
let index: InMemoryIndex = build_in_memory(input_dir, tokenizer, stemmer);
write_postings(&index, output_path);
write_vocabulary(&index, output_path);
write_doc_lentghts(&index.document_lengths, output_path);
write_documents(&index.documents, output_path);
}

fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex {
let documents: Vec<fs::DirEntry> = fs::read_dir(input_dir)
let files: Vec<fs::DirEntry> = fs::read_dir(input_dir)
.expect("error while retrieving input directory content")
.map(|p| p.unwrap())
.collect();

// document counter
let doc_id_mutex = Mutex::new(0);
let term_index_map = Mutex::new(HashMap::new());

// postings list
let postings: Mutex<Vec<PostingList>> = Mutex::new(Vec::new());
// word to postings index
let term_index_map = Mutex::new(HashMap::new());
// per-word doc id to posting list index
let term_doc_map: Mutex<Vec<HashMap<u32, usize>>> = Mutex::new(Vec::new());
// documents data
let documents = Mutex::new(Vec::new());

let document_lengths = Mutex::new(Vec::new());

documents
files
.into_par_iter()
.progress_with_style(
ProgressStyle::with_template(PROGRESS_STYLE)
@@ -57,13 +61,18 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->

let mut doc_id = doc_id_mutex.lock().unwrap();

document_lengths.lock().unwrap().push(tokens.len() as u32);
// update documents array
documents.lock().unwrap().push(Document {
path: d.path().to_str().unwrap().to_string(),
lenght: tokens.len() as u32,
});

let mut l_term_index_map = term_index_map.lock().unwrap();
let mut l_postings = postings.lock().unwrap();
let mut l_term_doc_map = term_doc_map.lock().unwrap();

for (word_pos, t) in tokens.iter().enumerate() {
// obtain postings for this word and increment collection frequency
if !l_term_index_map.contains_key(t) {
let idx = l_term_index_map.len();
l_term_index_map.insert(t.clone(), idx);
@@ -75,6 +84,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
let postings_list = &mut l_postings[term_index];
postings_list.collection_frequency += 1;

// obtain document entry for this word and update it
if !l_term_doc_map[term_index].contains_key(&doc_id) {
let idx = postings_list.documents.len();
l_term_doc_map[term_index].insert(*doc_id, idx);
@@ -105,6 +115,6 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
InMemoryIndex {
term_index_map: sorted_term_index_map,
postings: final_postings,
document_lengths: document_lengths.into_inner().unwrap(),
documents: documents.into_inner().unwrap(),
}
}
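
For orientation, the builder keeps a two-level lookup while indexing: term_index_map maps a term to the index of its postings list, and term_doc_map[term_index] maps a doc id to the position of that document's entry inside the list, so repeated occurrences of a term in the same document update one entry. A simplified, self-contained sketch of that bookkeeping (the struct and function names are illustrative, not the crate's types, and the real code guards everything with mutexes and also records token positions):

use std::collections::HashMap;

struct DocEntry {
    doc_id: u32,
    frequency: u32,
}

fn record_occurrence(
    term_index_map: &mut HashMap<String, usize>,
    term_doc_map: &mut Vec<HashMap<u32, usize>>,
    postings: &mut Vec<Vec<DocEntry>>,
    term: &str,
    doc_id: u32,
) {
    // first time this term is seen: allocate a postings list and a doc map for it
    let term_index = *term_index_map.entry(term.to_string()).or_insert_with(|| {
        postings.push(Vec::new());
        term_doc_map.push(HashMap::new());
        postings.len() - 1
    });

    // first time this (term, doc) pair is seen: allocate a document entry
    let entry_index = *term_doc_map[term_index].entry(doc_id).or_insert_with(|| {
        postings[term_index].push(DocEntry { doc_id, frequency: 0 });
        postings[term_index].len() - 1
    });

    // every further occurrence bumps the within-document frequency
    postings[term_index][entry_index].frequency += 1;
}
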
31 changes: 22 additions & 9 deletions src/index/documents.rs
@@ -1,20 +1,33 @@
use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};

use super::DOCUMENT_LENGHTS_EXTENSION;
use super::DOCUMENTS_EXTENSION;

pub fn write_doc_lentghts(document_lenghts: &Vec<u32>, output_path: &str) {
let doc_path = output_path.to_string() + DOCUMENT_LENGHTS_EXTENSION;
#[derive(Clone)]
pub struct Document {
pub path: String,
pub lenght: u32,
}

pub fn write_documents(documents: &Vec<Document>, output_path: &str) {
let doc_path = output_path.to_string() + DOCUMENTS_EXTENSION;
let mut doc_writer = BitsWriter::new(&doc_path);

doc_writer.write_vbyte(document_lenghts.len() as u32);
document_lenghts.iter().for_each(|l| {
doc_writer.write_gamma(*l);
doc_writer.write_vbyte(documents.len() as u32);
documents.iter().for_each(|l| {
doc_writer.write_str(&l.path);
doc_writer.write_vbyte(l.lenght);
});

doc_writer.flush();
}

pub fn load_document_lenghts(input_path: &str) -> Vec<u32> {
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENT_LENGHTS_EXTENSION));
reader.read_vbyte_gamma_vector()
pub fn load_documents(input_path: &str) -> Vec<Document> {
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENTS_EXTENSION));

(0..reader.read_vbyte())
.map(|_| Document {
path: reader.read_str(),
lenght: reader.read_vbyte(),
})
.collect()
}
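
The on-disk layout is therefore a vbyte document count followed, for each document, by its path as a string and its length as a vbyte. A minimal round-trip sketch, assuming the documents module is publicly reachable from the caller and that the base path is writable (the "/tmp/example_index" prefix is purely illustrative):

use crate::index::documents::{load_documents, write_documents, Document};

fn documents_round_trip() {
    let docs = vec![
        // the lenght field name follows the crate's own spelling
        Document { path: "docs/a.txt".to_string(), lenght: 120 },
        Document { path: "docs/b.txt".to_string(), lenght: 87 },
    ];

    // writes "/tmp/example_index.docs" next to the other index files
    write_documents(&docs, "/tmp/example_index");
    let loaded = load_documents("/tmp/example_index");

    assert_eq!(loaded.len(), 2);
    assert_eq!(loaded[0].path, "docs/a.txt");
    assert_eq!(loaded[1].lenght, 87);
}
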
25 changes: 18 additions & 7 deletions src/index/mod.rs
@@ -12,26 +12,27 @@ use tokenizers::Tokenizer;

use crate::disk::bits_reader::BitsReader;

use self::documents::Document;
use self::postings::PostingList;

pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENT_LENGHTS_EXTENSION: &str = ".doc_lengths";
pub const DOCUMENTS_EXTENSION: &str = ".docs";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";

pub struct Index {
term_to_index: FxHashMap<String, usize>,
postings: BitsReader,
term_offsets: Vec<u64>,
doc_lenghts: Vec<u32>,
documents: Vec<Document>,
tokenizer: Tokenizer,
stemmer: Stemmer,
}

pub struct InMemoryIndex {
term_index_map: BTreeMap<String, usize>,
postings: Vec<PostingList>,
document_lengths: Vec<u32>,
documents: Vec<Document>,
}

impl Index {
@@ -46,14 +47,22 @@ impl Index {
term_to_index: vocabulary::load_vocabulary(input_path),
postings: postings::build_postings_reader(input_path),
term_offsets: postings::load_offsets(input_path),
doc_lenghts: documents::load_document_lenghts(input_path),
documents: documents::load_documents(input_path),
tokenizer: text::load_tokenizer(tokenizer_path, false),
stemmer: text::load_stemmer(),
}
}

pub fn get_num_documents(&self) -> u32 {
self.doc_lenghts.len() as u32
self.documents.len() as u32
}

pub fn get_document_len(&self, doc_id: u32) -> u32 {
self.documents[doc_id as usize].lenght
}

pub fn get_document_path(&self, doc_id: u32) -> String {
self.documents[doc_id as usize].path.clone()
}

pub fn get_term(&mut self, term: &str) -> Option<postings::PostingList> {
@@ -81,12 +90,14 @@ impl Display for Index {

#[cfg(test)]
mod test {
use crate::test_utils::utils::create_temporary_dir_path;

use super::*;
use crate::test_utils::utils::create_temporary_dir_path;
use rayon::ThreadPoolBuilder;

#[test]
fn test_build() {
let _ = ThreadPoolBuilder::new().num_threads(1).build_global();

let index_path = &create_temporary_dir_path();

Index::build_index(
19 changes: 12 additions & 7 deletions src/main.rs
@@ -8,17 +8,22 @@ use search::query::QueryProcessor;

use indicatif::HumanDuration;

const NUM_RESULTS: usize = 1000000;
const NUM_RESULTS: usize = 1_000_000;

fn print_results(results: &[u32], elapsed_time: Duration) {
// println!("\nSearch Results:");
fn print_results(results: &[String], elapsed_time: Duration) {
if results.is_empty() {
println!("\nNo documents found\n");
return;
}

// for (i, doc_id) in results.iter().enumerate() {
// println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
// }
println!("\nTop 10 results:\n");

for (i, doc_id) in results.iter().take(10).enumerate() {
println!("\t{:2}. {}", i + 1, doc_id);
}

println!(
"\nFetched {} documents in {} ms",
"\nFetched {} documents in {} ms\n",
results.len(),
elapsed_time.as_millis()
);
16 changes: 13 additions & 3 deletions src/query/mod.rs
@@ -32,7 +32,7 @@ impl QueryProcessor {
}
}

pub fn query(&mut self, query: &str, num_results: usize) -> Vec<u32> {
pub fn query(&mut self, query: &str, num_results: usize) -> Vec<String> {
let mut scores: HashMap<u32, DocumentScore> = HashMap::new();

let tokens = self.index.tokenize_and_stem_query(query);
@@ -41,6 +41,8 @@
if let Some(postings) = self.index.get_term(token) {
let idf = (self.num_documents as f32 / postings.collection_frequency as f32).log2();

// for each term-doc pair, increment the document tf-idf score
// and record token positions for window computation
for doc_posting in &postings.documents {
let td_idf_score = doc_posting.document_frequency as f32 * idf;

@@ -64,13 +66,21 @@

let mut selector = DocumentSelector::new(num_results);
let num_tokens = tokens.len();
scores.iter().for_each(|(id, score)| {
scores.iter_mut().for_each(|(id, score)| {
// tf-idf score must be divided by the document len
score.tf_idf /= self.index.get_document_len(*id) as f32;

selector.push(*id, QueryProcessor::compute_score(score, num_tokens));
});

selector.get_sorted_ids()
selector
.get_sorted_ids()
.iter()
.map(|i| self.index.get_document_path(*i))
.collect()
}

// score takes into consideration the window size and tf-idf scoring
fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f32 {
let mut window = u32::MAX;

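
In short, the adjusted scoring keeps idf = log2(num_documents / collection_frequency) per query term, accumulates term-frequency-times-idf per candidate document, and now divides that sum by the document length before the window-based ranking. A standalone sketch of the arithmetic in the hunks above (the function and its (collection_frequency, term_frequency) pair layout are illustrative, not the QueryProcessor API):

// Length-normalized tf-idf for a single document: each pair holds a query
// term's collection frequency and its frequency inside this document.
fn length_normalized_tf_idf(
    term_stats: &[(u32, f32)],
    num_documents: u32,
    document_len: u32,
) -> f32 {
    let mut score = 0.0;
    for &(collection_frequency, term_frequency) in term_stats {
        let idf = (num_documents as f32 / collection_frequency as f32).log2();
        score += term_frequency * idf;
    }
    score / document_len as f32
}

// e.g. two query terms in a 100-token document, 1000 documents indexed:
// length_normalized_tf_idf(&[(10, 2.0), (50, 1.0)], 1000, 100)
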
