
Commit

huggingface tokenizers
tomfran committed Dec 22, 2023
1 parent 44d7029 commit 7a9b503
Showing 10 changed files with 50 additions and 46 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -9,4 +9,4 @@ path = "src/lib.rs"

 [dependencies]
 rand = "0.8"
-regex = "1"
+tokenizers = { version = "0.15.0", features = ["http"] }
5 changes: 4 additions & 1 deletion README.md
@@ -8,9 +8,12 @@ Search engine written in Rust, based on an inverted index on disk.
- [x] Tokenization;
- [ ] Stemming.
- [ ] Index construction:
  - [ ] [In progress] In-memory datasets index construction;
  - [ ] In-memory datasets index construction;
  - [ ] Disk-based partial index construction and merging;
  - [ ] Additional indexes to support things such as spelling correction.
- [ ] Index queries:
  - [ ] Boolean queries;
  - [ ] Tf-idf ranked retrieval.

**References**

1 change: 1 addition & 0 deletions data/index_unit_test/test_tokenizer

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions src/indexer/disk_utils.rs
@@ -1,12 +1,13 @@
-use crate::{
-    disk::{
-        bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
-        terms_writer::TermsWriter,
-    },
-    text::tokens,
+use tokenizers::Tokenizer;
+
+use crate::disk::{
+    bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
+    terms_writer::TermsWriter,
 };
 use std::{collections::BTreeMap, fs};

+use super::text_utils;
+
 const POSTINGS_EXTENSION: &str = ".postings";
 const OFFSETS_EXTENSION: &str = ".offsets";

@@ -15,16 +16,16 @@ const VOCABULARY_LENGHTS_EXTENSION: &str = ".lengths";

 pub fn build_in_memory_postings(
     input_dir: &str,
+    tokenizer: &Tokenizer,
 ) -> (BTreeMap<String, usize>, Vec<BTreeMap<u32, u32>>) {
     let documents =
         fs::read_dir(input_dir).expect("error while retrieving input directory content");

-    let tokens_regex = tokens::build_tokenization_regex();
     let tokenized_docs_iter = documents
         .into_iter()
         .map(|p| p.unwrap())
         .map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
-        .map(|s| tokens::tokenize(&s, &tokens_regex));
+        .map(|s| text_utils::tokenize(tokenizer, &s));

     let mut words: BTreeMap<String, usize> = BTreeMap::new();
     let mut in_memory_postings: Vec<BTreeMap<u32, u32>> = Vec::new();
9 changes: 6 additions & 3 deletions src/indexer/index.rs
@@ -1,4 +1,4 @@
-use super::disk_utils;
+use super::{disk_utils, text_utils::load_tokenizer};
 use crate::disk::bits_reader::BitsReader;
 use std::collections::BTreeMap;

@@ -8,8 +8,10 @@ pub struct Index {
 }

 impl Index {
-    pub fn build_index(input_dir: &str, output_path: &str) {
-        let (words, postings) = disk_utils::build_in_memory_postings(input_dir);
+    pub fn build_index(input_dir: &str, output_path: &str, tokenizer_path: &str) {
+        let tokenizer = load_tokenizer(tokenizer_path, false);
+        let (words, postings) = disk_utils::build_in_memory_postings(input_dir, &tokenizer);
+
         disk_utils::write_postings(&words, &postings, output_path);
         disk_utils::write_vocabulary(&words, output_path);
     }
@@ -48,6 +50,7 @@ mod test {
         Index::build_index(
             "data/index_unit_test/docs",
             "data/index_unit_test/index/test",
+            "data/index_unit_test/test_tokenizer",
         );

         let mut idx = Index::load_index("data/index_unit_test/index/test");
1 change: 1 addition & 0 deletions src/indexer/mod.rs
@@ -1,2 +1,3 @@
 mod disk_utils;
 pub mod index;
+mod text_utils;
28 changes: 28 additions & 0 deletions src/indexer/text_utils.rs
@@ -0,0 +1,28 @@
+use std::{fs::create_dir_all, path::Path};
+
+use tokenizers::Tokenizer;
+
+pub fn load_tokenizer(filename: &str, force_download: bool) -> Tokenizer {
+    let path = Path::new(filename);
+
+    if !path.exists() || force_download {
+        path.parent().map(create_dir_all);
+
+        let identifier = path.file_name().unwrap().to_str().unwrap();
+
+        Tokenizer::from_pretrained(identifier, None)
+            .expect("error while retrieving tokenizer from the web")
+            .save(filename, false)
+            .expect("error while saving tokenizer to file");
+    }
+
+    Tokenizer::from_file(filename).expect("error while loading tokenizer from file")
+}
+
+pub fn tokenize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
+    tokenizer
+        .encode(text, false)
+        .expect("error while tokenizing text")
+        .get_tokens()
+        .to_vec()
+}
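
In short, load_tokenizer treats the path's file name as a Hugging Face model identifier: if the file is missing (or force_download is true) it fetches the tokenizer from the hub via the crate's "http" feature and saves it, then loads it back with Tokenizer::from_file; tokenize encodes text without special tokens and returns the token strings. A minimal sketch of how the two helpers could be exercised together, written as a hypothetical unit test (the sample text and assertion are assumptions for illustration, not part of this commit):

// Hypothetical test module for src/indexer/text_utils.rs.
#[cfg(test)]
mod tests {
    use super::{load_tokenizer, tokenize};

    #[test]
    fn tokenizes_sample_text() {
        // The tokenizer file committed under data/index_unit_test is loaded
        // directly; only if it were missing would the file name
        // ("test_tokenizer") be used as a hub identifier and a download attempted.
        let tokenizer = load_tokenizer("data/index_unit_test/test_tokenizer", false);

        // encode(text, false) skips special tokens such as [CLS]/[SEP],
        // so only the subword tokens of the input are returned.
        let tokens = tokenize(&tokenizer, "search engines rank documents");

        assert!(!tokens.is_empty());
    }
}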
1 change: 0 additions & 1 deletion src/lib.rs
@@ -1,3 +1,2 @@
 pub mod disk;
 pub mod indexer;
-pub mod text;
1 change: 0 additions & 1 deletion src/text/mod.rs

This file was deleted.

31 changes: 0 additions & 31 deletions src/text/tokens.rs

This file was deleted.
