diff --git a/makefile b/makefile
index 15f5363..f9d463d 100644
--- a/makefile
+++ b/makefile
@@ -6,3 +6,9 @@ cli:
 
 test:
 	cargo test --release
+
+clippy:
+	cargo clippy
+
+clippy-pedantic:
+	cargo clippy -- -W clippy::pedantic
\ No newline at end of file
diff --git a/search/src/index/builder.rs b/search/src/index/builder.rs
index 01d70d3..d42b8aa 100644
--- a/search/src/index/builder.rs
+++ b/search/src/index/builder.rs
@@ -3,7 +3,7 @@ use super::{
     postings::{PostingEntry, PostingList, Postings},
     preprocessor::Preprocessor,
     vocabulary::Vocabulary,
-    InMemoryIndex,
+    InMemory,
 };
 use indicatif::{ParallelProgressIterator, ProgressStyle};
 use rayon::prelude::*;
@@ -20,16 +20,16 @@ const PROGRESS_CHARS: &str = "=> ";
 const CUTOFF_THRESHOLD: f64 = 0.8;
 
 pub fn build_index(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) {
-    let index: InMemoryIndex = build_in_memory(input_dir, preprocessor);
+    let index: InMemory = build_in_memory(input_dir, preprocessor);
     Postings::write_postings(&index, output_path);
     Vocabulary::write_vocabulary(&index, output_path);
     Documents::write_documents(&index.documents, output_path);
 }
 
-fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemoryIndex {
+fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory {
     let files: Vec<fs::DirEntry> = fs::read_dir(input_dir)
         .expect("error while retrieving input directory content")
-        .map(|p| p.unwrap())
+        .map(std::result::Result::unwrap)
         .collect();
 
     // document counter
@@ -107,7 +107,7 @@ fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemoryInde
         .filter(|(_, v)| final_postings[*v].collection_frequency <= frequency_threshold)
         .collect();
 
-    InMemoryIndex {
+    InMemory {
         term_index_map: sorted_term_index_map,
         postings: final_postings,
         documents: documents.into_inner().unwrap(),
diff --git a/search/src/index/documents.rs b/search/src/index/documents.rs
index 66264ec..568a932 100644
--- a/search/src/index/documents.rs
+++ b/search/src/index/documents.rs
@@ -15,7 +15,7 @@ impl Documents {
     pub fn load_documents(input_path: &str) -> Documents {
         let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENTS_EXTENSION));
-        let mut prev: String = "".to_string();
+        let mut prev = String::new();
 
         let docs = (0..reader.read_vbyte())
             .map(|_| {
                 let p_len = reader.read_gamma();
@@ -40,7 +40,7 @@ impl Documents {
         let mut prev = "";
 
         writer.write_vbyte(documents.len() as u32);
-        documents.iter().for_each(|l| {
+        for l in documents.iter() {
             let p_len = utils::get_matching_prefix_len(prev, &l.path);
             writer.write_gamma(p_len as u32);
             let remaining: String = l.path.chars().skip(p_len).collect();
@@ -48,7 +48,7 @@
 
             writer.write_str(&remaining);
             writer.write_vbyte(l.lenght);
-        });
+        }
 
         writer.flush();
     }
@@ -92,9 +92,9 @@ mod tests {
 
         assert_eq!(loaded_documents.get_num_documents(), documents.len() as u32);
 
-        for i in 0..documents.len() {
-            assert_eq!(loaded_documents.get_doc_path(i as u32), documents[i].path);
-            assert_eq!(loaded_documents.get_doc_len(i as u32), documents[i].lenght);
+        for (i, d) in documents.iter().enumerate() {
+            assert_eq!(loaded_documents.get_doc_path(i as u32), d.path);
+            assert_eq!(loaded_documents.get_doc_len(i as u32), d.lenght);
         }
     }
 
@@ -117,9 +117,9 @@ mod tests {
 
         assert_eq!(doc_collection.get_num_documents(), documents.len() as u32);
 
-        for i in 0..documents.len() {
-            assert_eq!(doc_collection.get_doc_path(i as u32), documents[i].path);
-            assert_eq!(doc_collection.get_doc_len(i as u32), documents[i].lenght);
+        for (i, d) in documents.iter().enumerate() {
+            assert_eq!(doc_collection.get_doc_path(i as u32), d.path);
+            assert_eq!(doc_collection.get_doc_len(i as u32), d.lenght);
         }
     }
 }
diff --git a/search/src/index/mod.rs b/search/src/index/mod.rs
index ac2c3cc..41a0db0 100644
--- a/search/src/index/mod.rs
+++ b/search/src/index/mod.rs
@@ -23,7 +23,7 @@ pub struct Index {
     preprocessor: Preprocessor,
 }
 
-pub struct InMemoryIndex {
+pub struct InMemory {
     term_index_map: BTreeMap<String, usize>,
     postings: Vec<PostingList>,
     documents: Vec<Document>,
diff --git a/search/src/index/postings.rs b/search/src/index/postings.rs
index e56b2e7..5535018 100644
--- a/search/src/index/postings.rs
+++ b/search/src/index/postings.rs
@@ -1,4 +1,4 @@
-use super::{InMemoryIndex, OFFSETS_EXTENSION, POSTINGS_EXTENSION};
+use super::{InMemory, OFFSETS_EXTENSION, POSTINGS_EXTENSION};
 use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};
 
 #[derive(Default)]
@@ -38,7 +38,7 @@
         Postings { reader, offsets }
     }
 
-    pub fn write_postings(index: &InMemoryIndex, output_path: &str) {
+    pub fn write_postings(index: &InMemory, output_path: &str) {
         let postings_path = output_path.to_string() + POSTINGS_EXTENSION;
         let mut postings_writer = BitsWriter::new(&postings_path);
 
@@ -50,7 +50,7 @@
 
         offsets_writer.write_vbyte(index.term_index_map.len() as u32);
 
-        for (_, idx) in index.term_index_map.iter() {
+        for idx in index.term_index_map.values() {
             offsets_writer.write_gamma(offset as u32 - prev_offset);
             prev_offset = offset as u32;
 
@@ -59,13 +59,13 @@
             offset += postings_writer.write_vbyte(postings.documents.len() as u32);
 
             let mut prev_doc_id = 0;
-            for entry in postings.documents.iter() {
+            for entry in &postings.documents {
                 offset += postings_writer.write_gamma(entry.document_id - prev_doc_id);
                 offset += postings_writer.write_gamma(entry.document_frequency);
 
                 let mut prev_pos = 0;
                 offset += postings_writer.write_vbyte(entry.positions.len() as u32);
-                for pos in entry.positions.iter() {
+                for pos in &entry.positions {
                     offset += postings_writer.write_gamma(*pos - prev_pos);
                     prev_pos = *pos;
                 }
diff --git a/search/src/index/preprocessor.rs b/search/src/index/preprocessor.rs
index 8d050ac..7db6a33 100644
--- a/search/src/index/preprocessor.rs
+++ b/search/src/index/preprocessor.rs
@@ -18,7 +18,7 @@ impl Preprocessor {
         self.regex
             .replace_all(text, " ")
             .split_whitespace()
-            .map(|t| t.to_lowercase())
+            .map(str::to_lowercase)
             .map(|t| self.stemmer.stem(&t).to_string())
             .collect()
     }
diff --git a/search/src/index/vocabulary.rs b/search/src/index/vocabulary.rs
index 135bfba..5d749a8 100644
--- a/search/src/index/vocabulary.rs
+++ b/search/src/index/vocabulary.rs
@@ -1,4 +1,4 @@
-use super::{utils, InMemoryIndex, VOCABULARY_ALPHA_EXTENSION};
+use super::{utils, InMemory, VOCABULARY_ALPHA_EXTENSION};
 use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};
 use fxhash::FxHashMap;
 
@@ -11,7 +11,7 @@ pub struct Vocabulary {
 }
 
 impl Vocabulary {
-    pub fn write_vocabulary(index: &InMemoryIndex, output_path: &str) {
+    pub fn write_vocabulary(index: &InMemory, output_path: &str) {
         let path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
         let mut writer = BitsWriter::new(&path);
 
@@ -22,14 +22,14 @@
         // write all terms with prefix compression
         let mut prev = "";
 
-        vocab.keys().for_each(|s| {
+        for s in vocab.keys() {
             let p_len = utils::get_matching_prefix_len(prev, s);
             writer.write_gamma(p_len as u32);
 
             let remaining: String = s.chars().skip(p_len).collect();
             prev = s;
             writer.write_str(&remaining);
-        });
+        }
 
         // write all collection frequencies
         index.postings.iter().for_each(|p| {
@@ -46,7 +46,7 @@
         let num_terms: u32 = reader.read_vbyte();
 
         // read prefix compressed terms
-        let mut prev = "".to_string();
+        let mut prev = String::new();
 
         let mut index_to_term = Vec::new();
 
@@ -95,7 +95,7 @@
     }
 
     pub fn get_term_index(&self, term: &str) -> Option<usize> {
-        self.term_to_index.get(term).map(|i| *i)
+        self.term_to_index.get(term).copied()
    }
 
     #[allow(dead_code)]
@@ -109,12 +109,12 @@
     fn get_closest_index(&self, term: &str) -> Option<usize> {
         let candidates = (0..term.len() - 2)
             .map(|i| term[i..i + 3].to_string())
-            .flat_map(|t| self.trigram_index.get(&t))
-            .flat_map(|v| v.into_iter());
+            .filter_map(|t| self.trigram_index.get(&t))
+            .flat_map(|v| v.iter());
 
         candidates
             .min_by_key(|i| Self::distance(term, &self.index_to_term[**i]))
-            .map(|i| *i)
+            .copied()
     }
 
     #[allow(unused_variables)]
@@ -139,19 +139,20 @@ mod tests {
         map.insert("hello".to_string(), 0);
         map.insert("world".to_string(), 0);
 
-        let mut postings = Vec::new();
-        postings.push(PostingList {
-            collection_frequency: 1,
-            documents: Vec::new(),
-        });
-        postings.push(PostingList {
-            collection_frequency: 2,
-            documents: Vec::new(),
-        });
-
-        let index = InMemoryIndex {
+        let postings = vec![
+            PostingList {
+                collection_frequency: 1,
+                documents: Vec::new(),
+            },
+            PostingList {
+                collection_frequency: 2,
+                documents: Vec::new(),
+            },
+        ];
+
+        let index = InMemory {
             term_index_map: map,
-            postings: postings,
+            postings,
             documents: Vec::new(),
         };
 
diff --git a/search/src/main.rs b/search/src/main.rs
index 9935aee..e9c2d32 100644
--- a/search/src/main.rs
+++ b/search/src/main.rs
@@ -1,6 +1,6 @@
 use indicatif::HumanDuration;
 use search::index::Index;
-use search::query::{QueryProcessor, QueryResult};
+use search::query::{Processor, Result};
 use std::cmp::min;
 use std::env;
 use std::io::{self, Write};
@@ -10,7 +10,7 @@ use std::time::{Duration, Instant};
 const NUM_TOP_RESULTS: usize = 10;
 const NUM_RESULTS: usize = 1_000_000;
 
-fn print_results(result: QueryResult) {
+fn print_results(result: &Result) {
     println!("Search tokens: {:?}", result.tokens);
 
     if result.documents.is_empty() {
@@ -35,7 +35,7 @@
 }
 
 fn read_line(prompt: &str) -> String {
-    print!("{}", prompt);
+    print!("{prompt}");
     io::stdout().flush().unwrap();
 
     let mut input = String::new();
@@ -66,16 +66,16 @@
     let action = &args[2];
     let build_index = action == "build";
 
-    let index_path = format!("{}/index/idx", base_path);
-    let docs_path = format!("{}/docs", base_path);
+    let index_path = format!("{base_path}/index/idx");
+    let docs_path = format!("{base_path}/docs");
 
     if build_index {
-        println!("Start build on directory [{}]\n", docs_path);
+        println!("Start build on directory [{docs_path}]\n");
 
         let num_threads = args.get(3).map_or(0, |s| s.parse().unwrap_or(0));
 
         if num_threads != 0 {
-            println!("Setting thread number to {}", num_threads);
+            println!("Setting thread number to {num_threads}");
 
             rayon::ThreadPoolBuilder::new()
                 .num_threads(num_threads)
@@ -96,11 +96,10 @@
         exit(0);
     }
 
-    let mut q = QueryProcessor::build_query_processor(&index_path);
+    let mut q = Processor::build_query_processor(&index_path);
 
     println!(
-        "Loaded search engine for directory: [{}]\n\nWrite a query and press enter.\n",
-        base_path
+        "Loaded search engine for directory: [{base_path}]\n\nWrite a query and press enter.\n"
     );
 
     loop {
@@ -108,6 +107,6 @@
 
         let result = q.query(&query, NUM_RESULTS);
 
-        print_results(result);
+        print_results(&result);
     }
 }
diff --git a/search/src/query/document_selector.rs b/search/src/query/document_selector.rs
index c89c481..2b55f0e 100644
--- a/search/src/query/document_selector.rs
+++ b/search/src/query/document_selector.rs
@@ -3,7 +3,7 @@ use std::{cmp::Ordering, collections::BinaryHeap};
 #[derive(Debug)]
 pub struct Entry {
     pub id: u32,
-    pub score: f32,
+    pub score: f64,
 }
 
 impl PartialEq for Entry {
@@ -39,7 +39,7 @@
         }
     }
 
-    pub fn push(&mut self, id: u32, score: f32) {
+    pub fn push(&mut self, id: u32, score: f64) {
         self.heap.push(Entry { id, score });
 
         if self.heap.len() > self.capacity {
@@ -48,7 +48,7 @@
 
     pub fn get_sorted_entries(&mut self) -> Vec<Entry> {
-        let mut res: Vec<Entry> = (0..self.capacity).flat_map(|_| self.heap.pop()).collect();
+        let mut res: Vec<Entry> = (0..self.capacity).filter_map(|_| self.heap.pop()).collect();
         res.reverse();
         res
     }
 }
diff --git a/search/src/query/mod.rs b/search/src/query/mod.rs
index 968fa6b..2c09589 100644
--- a/search/src/query/mod.rs
+++ b/search/src/query/mod.rs
@@ -6,14 +6,14 @@ use self::document_selector::DocumentSelector;
 
 mod document_selector;
 
-const WINDOW_MULTIPLIER: f32 = 10.0;
+const WINDOW_MULTIPLIER: f64 = 10.0;
 
-pub struct QueryProcessor {
+pub struct Processor {
     index: Index,
     num_documents: u32,
 }
 
-pub struct QueryResult {
+pub struct Result {
     pub tokens: Vec<String>,
     pub documents: Vec<DocumentResult>,
     pub time_ms: u128,
@@ -22,33 +22,33 @@
 pub struct DocumentResult {
     pub id: u32,
     pub path: String,
-    pub score: f32,
+    pub score: f64,
 }
 
 #[derive(Default)]
 struct DocumentScore {
-    tf_idf: f32,
+    tf_idf: f64,
     term_positions: HashMap<u32, Vec<u32>>,
 }
 
-impl QueryProcessor {
-    pub fn build_query_processor(index_input_path: &str) -> QueryProcessor {
+impl Processor {
+    pub fn build_query_processor(index_input_path: &str) -> Processor {
         let index = Index::load_index(index_input_path);
         let num_documents = index.get_num_documents();
 
-        QueryProcessor {
+        Processor {
             index,
             num_documents,
         }
     }
 
-    pub fn query(&mut self, query: &str, num_results: usize) -> QueryResult {
+    pub fn query(&mut self, query: &str, num_results: usize) -> Result {
         let start_time = Instant::now();
 
         let tokens = self.index.get_query_tokens(query);
 
         let documents = self
-            .get_sorted_document_entries(tokens.clone(), num_results)
+            .get_sorted_document_entries(&tokens.clone(), num_results)
             .iter()
             .map(|e| DocumentResult {
                 id: e.id,
@@ -59,7 +59,7 @@ impl QueryProcessor {
 
         let time_ms = start_time.elapsed().as_millis();
 
-        QueryResult {
+        Result {
             tokens,
             documents,
             time_ms,
@@ -68,29 +68,24 @@
 
     fn get_sorted_document_entries(
         &mut self,
-        tokens: Vec<String>,
+        tokens: &[String],
         num_results: usize,
     ) -> Vec<Entry> {
         let mut scores: HashMap<u32, DocumentScore> = HashMap::new();
 
         for (id, token) in tokens.iter().enumerate() {
             if let Some(postings) = self.index.get_term_postings(token) {
-                let idf = (self.num_documents as f32 / postings.collection_frequency as f32).log2();
+                let idf = (self.num_documents as f64 / postings.collection_frequency as f64).log2();
 
                 // for each term-doc pair, increment the documetn tf-idf score
                 // and record token positions for window computation
                 for doc_posting in &postings.documents {
-                    let td_idf_score = doc_posting.document_frequency as f32 * idf;
+                    let td_idf_score = doc_posting.document_frequency as f64 * idf;
 
-                    let doc_score = scores
-                        .entry(doc_posting.document_id)
-                        .or_insert(DocumentScore::default());
+                    let doc_score = scores.entry(doc_posting.document_id).or_default();
                     doc_score.tf_idf += td_idf_score;
 
-                    let positions = doc_score
-                        .term_positions
-                        .entry(id as u32)
-                        .or_insert(Vec::new());
+                    let positions = doc_score.term_positions.entry(id as u32).or_default();
 
                     doc_posting
                         .positions
@@ -102,17 +97,17 @@
         let mut selector = DocumentSelector::new(num_results);
         let num_tokens = tokens.len();
 
-        scores.iter_mut().for_each(|(id, score)| {
+        for (id, score) in scores.iter_mut() {
             // tf-idf score must be divided by the document len
-            score.tf_idf /= self.index.get_document_len(*id) as f32;
-            selector.push(*id, QueryProcessor::compute_score(score, num_tokens));
-        });
+            score.tf_idf /= self.index.get_document_len(*id) as f64;
+            selector.push(*id, Processor::compute_score(score, num_tokens));
+        }
 
         selector.get_sorted_entries()
     }
 
     // score takes into consideration the window size and td-idf scoring
-    fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f32 {
+    fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f64 {
         let mut window = u32::MAX;
 
         let mut arr: Vec<(u32, u32)> = document_score
@@ -121,11 +116,11 @@
             .flat_map(|(id, positions)| positions.iter().map(|p| (*p, *id)))
             .collect();
 
-        arr.sort();
+        arr.sort_unstable();
 
         let mut j = 0;
         let mut seen: HashMap<u32, u32> = HashMap::new();
-        for (pos, id) in arr.iter().cloned() {
+        for (pos, id) in arr.iter().copied() {
             seen.entry(id).and_modify(|c| *c += 1).or_insert(1);
 
             while seen.len() == num_tokens && j < arr.len() {
@@ -141,6 +136,6 @@
             }
         }
 
-        WINDOW_MULTIPLIER * (num_tokens as f32 / window as f32) + document_score.tf_idf
+        WINDOW_MULTIPLIER * (num_tokens as f64 / window as f64) + document_score.tf_idf
     }
 }
diff --git a/server/src/main.rs b/server/src/main.rs
index 78c3a91..117e659 100644
--- a/server/src/main.rs
+++ b/server/src/main.rs
@@ -8,7 +8,7 @@ use axum::{
     Router,
 };
 use log::info;
-use search::query::QueryProcessor;
+use search::query::Processor;
 use serde::{Deserialize, Serialize};
 use std::{
     env,
@@ -18,7 +18,7 @@
 struct AppState {
     index_path: String,
-    query_processor: Mutex<QueryProcessor>,
+    query_processor: Mutex<Processor>,
 }
 
 #[tokio::main]
 async fn main() {
@@ -35,11 +35,11 @@
     }
 
     let base_path = &args[1];
-    let index_path = format!("{}/index/idx", base_path);
+    let index_path = format!("{base_path}/index/idx");
 
     let state = Arc::new(AppState {
         index_path: base_path.clone(),
-        query_processor: Mutex::new(QueryProcessor::build_query_processor(&index_path)),
+        query_processor: Mutex::new(Processor::build_query_processor(&index_path)),
     });
 
     let app = Router::new()
@@ -66,7 +66,7 @@ where
 
             Err(err) => (
                 StatusCode::INTERNAL_SERVER_ERROR,
-                format!("Failed to render template. Error: {}", err),
+                format!("Failed to render template. Error: {err}"),
             )
                 .into_response(),
         }
@@ -102,7 +102,7 @@ struct QueryResponse {
 #[derive(Deserialize, Serialize)]
 struct Document {
     id: u32,
-    score: f32,
+    score: f64,
     path: String,
     content: String,
 }
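
Usage sketch (supplementary, not part of the patch): after the renames above, driving the query API end to end reads as below. It only relies on items visible in this diff — Processor::build_query_processor, Processor::query, and the public Result/DocumentResult fields — while the index path, query string, and result count are placeholders.

    use search::query::Processor;

    fn main() {
        // Placeholder path; point this at an index built beforehand
        // (see `build_index` in search/src/index/builder.rs).
        let mut processor = Processor::build_query_processor("data/index/idx");

        // `query` returns the renamed `search::query::Result`; callers now
        // pass it around by reference, as in `print_results(&result)` above.
        let result = processor.query("hello world", 10);

        println!("tokens: {:?} ({} ms)", result.tokens, result.time_ms);
        for doc in &result.documents {
            // `score` is an f64 after this patch.
            println!("{:>8.3}  {}", doc.score, doc.path);
        }
    }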