stemmer
tomfran committed Dec 27, 2023
1 parent 214ab2e commit 0d4d6fa
Showing 6 changed files with 52 additions and 40 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -9,4 +9,5 @@ path = "src/lib.rs"

[dependencies]
rand = "0.8"
-tokenizers = { version = "0.15.0", features = ["http"] }
+tokenizers = { version = "0.15.0", features = ["http"] }
+rust-stemmers = "1.2.0"
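For reference, rust-stemmers wraps the Snowball stemmers behind a small API. A minimal, standalone sketch of the two calls this commit relies on (not code from the repository):

```rust
use rust_stemmers::{Algorithm, Stemmer};

fn main() {
    // Build a Snowball stemmer for English; other Algorithm variants
    // cover additional languages.
    let stemmer = Stemmer::create(Algorithm::English);

    // stem() returns a Cow<str>: borrowed when the word is already a stem,
    // owned when a suffix was stripped.
    assert_eq!(stemmer.stem("fruitlessly"), "fruitless");
    println!("{}", stemmer.stem("searching")); // prints "search"
}
```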
13 changes: 8 additions & 5 deletions README.md
@@ -2,19 +2,22 @@

Search engine written in Rust, based on an inverted index on disk.

-**Implementation status**
+### Implementation status
- [x] IO classes for writing and reading bit-streams;
- [ ] Text preprocessing:
  - [x] Tokenization;
-  - [ ] Stemming.
+  - [x] Stemming.
- [ ] Index construction:
  - [x] In-memory datasets index construction;
  - [ ] Disk-based partial index construction and merging;
  - [ ] Additional indexes to support things such as spelling correction.
- [ ] Index queries:
  - [ ] Boolean queries;
-  - [ ] Tf-idf ranked retrieval.
+  - [x] Tf-idf ranked retrieval.

-**References**
+### Crates in use
+- [lise-henry/stemmer-rs](https://github.com/lise-henry/stemmer-rs)
+- [huggingface/tokenizers](https://github.com/huggingface/tokenizers)

-[*Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze*](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
+### References
+[Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
9 changes: 5 additions & 4 deletions src/index/builder.rs
@@ -1,3 +1,4 @@
+use rust_stemmers::Stemmer;
use std::{collections::BTreeMap, fs};
use tokenizers::Tokenizer;

@@ -17,22 +18,22 @@ struct InMemoryIndex {
    document_lenghts: Vec<u32>,
}

-pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer) {
-    let index = build_in_memory(input_dir, tokenizer);
+pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) {
+    let index = build_in_memory(input_dir, tokenizer, stemmer);
    write_postings(&index, output_path);
    write_vocabulary(&index.term_index_map, output_path);
    write_doc_lentghts(&index.document_lenghts, output_path);
}

-fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer) -> InMemoryIndex {
+fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex {
    let documents =
        fs::read_dir(input_dir).expect("error while retrieving input directory content");

    let tokenized_docs_iter = documents
        .into_iter()
        .map(|p| p.unwrap())
        .map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
-        .map(|s| text_utils::tokenize(tokenizer, &s));
+        .map(|s| text_utils::tokenize_and_stem(tokenizer, stemmer, &s));

    let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new();
    let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new();
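The tail of build_in_memory is folded away above. A minimal sketch of how tokenized documents are typically accumulated into a vocabulary map and postings lists — the names mirror the fields above, but this is an illustration, not the hidden code:

```rust
use std::collections::BTreeMap;

// Illustrative only: fold per-document token lists into a term-id map
// and per-term postings (doc id -> term frequency).
fn accumulate(docs: Vec<Vec<String>>) -> (BTreeMap<String, usize>, Vec<BTreeMap<u32, u32>>) {
    let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new();
    let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new();

    for (doc_id, tokens) in docs.into_iter().enumerate() {
        for token in tokens {
            // Assign the next free id to unseen terms.
            let next_id = term_index_map.len();
            let term_id = *term_index_map.entry(token).or_insert(next_id);
            if term_id == postings.len() {
                postings.push(BTreeMap::new());
            }
            // Bump the term frequency for this document.
            *postings[term_id].entry(doc_id as u32).or_insert(0) += 1;
        }
    }

    (term_index_map, postings)
}
```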
22 changes: 19 additions & 3 deletions src/index/mod.rs
@@ -2,7 +2,9 @@ mod builder;
mod loader;
mod text_utils;

+use rust_stemmers::Stemmer;
use std::collections::BTreeMap;
+use std::fmt::Display;
use tokenizers::Tokenizer;

use crate::disk::bits_reader::BitsReader;
@@ -18,6 +20,7 @@ pub struct Index {
    term_offset_map: BTreeMap<String, u64>,
    doc_lenghts: Vec<u32>,
    tokenizer: Tokenizer,
+    stemmer: Stemmer,
}

#[derive(Debug)]
@@ -35,7 +38,8 @@ pub struct PostingEntry {
impl Index {
    pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
        let tokenizer = text_utils::load_tokenizer(tokenizer_path, false);
-        builder::build_index(input_path, output_path, &tokenizer);
+        let stemmer = text_utils::load_stemmer();
+        builder::build_index(input_path, output_path, &tokenizer, &stemmer);
    }

    pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index {
@@ -44,6 +48,7 @@ impl Index {
            term_offset_map: loader::load_terms_to_offsets_map(input_path),
            doc_lenghts: loader::load_document_lenghts(input_path),
            tokenizer: text_utils::load_tokenizer(tokenizer_path, false),
+            stemmer: text_utils::load_stemmer(),
        }
    }

@@ -79,8 +84,19 @@
        })
    }

-    pub fn tokenize_query(&self, query: &str) -> Vec<String> {
-        text_utils::tokenize(&self.tokenizer, query)
+    pub fn tokenize_and_stem_query(&self, query: &str) -> Vec<String> {
+        text_utils::tokenize_and_stem(&self.tokenizer, &self.stemmer, query)
    }
}

+impl Display for Index {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Index:\n- vocab size: {}\n- num. documents: {}",
+            self.term_offset_map.len(),
+            self.get_num_documents()
+        )
+    }
+}
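Put together, a typical call sequence for the updated API might look as follows; the paths are illustrative placeholders, not paths from the repository:

```rust
// Hypothetical paths: build writes the index to disk, load reads it back.
let docs_dir = "data/docs";
let index_path = "data/index/demo";
let tokenizer_path = "data/bert-base-uncased";

Index::build_index(docs_dir, index_path, tokenizer_path);
let index = Index::load_index(index_path, tokenizer_path);

// The new Display impl reports vocabulary size and document count.
println!("{}", index);

// Queries now go through the same tokenize-then-stem pipeline as documents.
let terms = index.tokenize_and_stem_query("searching documents");
```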
18 changes: 14 additions & 4 deletions src/index/text_utils.rs
@@ -1,5 +1,6 @@
use std::{fs::create_dir_all, path::Path};

+use rust_stemmers::{Algorithm, Stemmer};
use tokenizers::Tokenizer;

pub fn load_tokenizer(filename: &str, force_download: bool) -> Tokenizer {
@@ -19,10 +20,19 @@ pub fn load_tokenizer(filename: &str, force_download: bool) -> Tokenizer {
    Tokenizer::from_file(filename).expect("error while loading tokenizer from file")
}

-pub fn tokenize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
-    tokenizer
+pub fn load_stemmer() -> Stemmer {
+    Stemmer::create(Algorithm::English)
+}
+
+pub fn tokenize_and_stem(tokenizer: &Tokenizer, stemmer: &Stemmer, text: &str) -> Vec<String> {
+    let tokenized_text = tokenizer
        .encode(text, false)
-        .expect("error while tokenizing text")
+        .expect("error while tokenizing text");
+
+    tokenized_text
        .get_tokens()
        .to_vec()
+        .iter()
+        .map(|t| t.to_lowercase())
+        .map(|t| stemmer.stem(&t).to_string())
+        .collect()
}
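With this change, tokens are lowercased before stemming, so queries and documents normalize identically. An illustrative call — the tokenizer path is hypothetical, and the exact tokens depend on the tokenizer's vocabulary:

```rust
let tokenizer = load_tokenizer("data/bert-base-uncased", false);
let stemmer = load_stemmer();

// With word-level tokenization, "Searching", "documents", "quickly"
// would come out as the stems "search", "document", "quick".
let terms = tokenize_and_stem(&tokenizer, &stemmer, "Searching documents quickly");
```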
27 changes: 4 additions & 23 deletions src/query/mod.rs
@@ -25,11 +25,9 @@ impl QueryProcessor {
    }

    pub fn query(&mut self, query: &str) -> Vec<u32> {
-        println!("\nQuery: {:?}", query);
-
        let mut scores: HashMap<u32, f32> = HashMap::new();

-        for token in self.index.tokenize_query(query) {
+        for token in self.index.tokenize_and_stem_query(query) {
            if let Some(postings) = self.index.get_term(&token) {
                let idf = (self.num_documents as f32 / postings.collection_frequency as f32).log2();

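The idf here is the base-2 log of the document count over the term's document frequency. A small worked example, assuming (as is standard for tf-idf) that the folded-away loop body adds tf × idf to each matching document's score:

```rust
let num_documents = 8_f32;
let collection_frequency = 2_f32; // the term appears in 2 of the 8 documents
let idf = (num_documents / collection_frequency).log2(); // log2(4.0) = 2.0

let tf = 3_f32; // the term occurs 3 times in one document
let contribution = tf * idf; // 3.0 * 2.0 = 6.0 added to that document's score
```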
@@ -44,27 +42,10 @@
        }

        let mut selector = DocumentSelector::new(3);
-        scores.iter().for_each(|(id, score)| {
-            println!("- document: {:?}, score: {:?}", id, score);
-            selector.push(*id, *score)
-        });
+        scores
+            .iter()
+            .for_each(|(id, score)| selector.push(*id, *score));

        selector.get_sorted_ids()
    }
}
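DocumentSelector's implementation is not part of this diff. One plausible shape for it, sketched under the assumption that it keeps the k best scores in a bounded min-heap — the real implementation may differ:

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Hypothetical top-k selector matching the calls above: push(id, score),
// then get_sorted_ids() returns ids ordered by descending score.
struct DocumentSelector {
    capacity: usize,
    heap: BinaryHeap<Entry>, // behaves as a min-heap via the reversed Ord below
}

struct Entry {
    id: u32,
    score: f32,
}

impl PartialEq for Entry {
    fn eq(&self, other: &Self) -> bool {
        self.score == other.score
    }
}
impl Eq for Entry {}
impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Entry {
    // Reversed comparison: the heap's "max" is the lowest score,
    // so pop() evicts the weakest candidate.
    fn cmp(&self, other: &Self) -> Ordering {
        other.score.total_cmp(&self.score)
    }
}

impl DocumentSelector {
    fn new(capacity: usize) -> Self {
        Self { capacity, heap: BinaryHeap::new() }
    }

    fn push(&mut self, id: u32, score: f32) {
        self.heap.push(Entry { id, score });
        if self.heap.len() > self.capacity {
            self.heap.pop(); // drop the current minimum
        }
    }

    fn get_sorted_ids(mut self) -> Vec<u32> {
        let mut ids = Vec::with_capacity(self.heap.len());
        while let Some(entry) = self.heap.pop() {
            ids.push(entry.id); // pops in ascending score order
        }
        ids.reverse(); // highest-scoring document first
        ids
    }
}
```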

-// #[cfg(test)]
-// mod test {
-//     use super::*;
-
-//     #[test]
-//     fn test_build() {
-//         let mut q = QueryProcessor::build_query_processor(
-//             "data/small/index/small",
-//             "data/small/bert-base-uncased",
-//         );
-//         q.query("google");
-//         q.query("apple");
-//         q.query("microsoft");
-//     }
-// }
