From d54da80a99276926819e066b573054c41d850b60 Mon Sep 17 00:00:00 2001 From: Koki Kato Date: Thu, 23 Jan 2025 09:29:39 +0900 Subject: [PATCH] Drop `SearchIndexBuilder` --- examples/example_builder.rs | 52 ---------------- src/lib.rs | 2 +- src/search.rs | 121 +----------------------------------- 3 files changed, 4 insertions(+), 171 deletions(-) delete mode 100644 examples/example_builder.rs diff --git a/examples/example_builder.rs b/examples/example_builder.rs deleted file mode 100644 index 98a665c..0000000 --- a/examples/example_builder.rs +++ /dev/null @@ -1,52 +0,0 @@ -use fm_index::converter::RangeConverter; -use fm_index::SearchIndexBuilder; - -fn main() { - // Prepare a text string to search for patterns. - let text = concat!( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.", - "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.", - "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.", - "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - ).as_bytes().to_vec(); - - // Converter converts each character into packed representation. - // `' '` ~ `'~'` represents a range of ASCII printable characters. - let converter = RangeConverter::new(b' ', b'~'); - - // To perform locate queries, we need to use some storage. How much storage - // is used depends on the `level` arguments passed. `0` retains the full - // information, but we don't need the whole array since we can interpolate - // missing elements in a suffix array from others. A sampler will _sieve_ a - // suffix array for this purpose. - // You can also use `FMIndex::count_only()` if you don't perform location - // queries (disabled in type-level). - let index = SearchIndexBuilder::new(converter).build(text); - - // Search for a pattern string. - let pattern = "dolor"; - let search = index.search(pattern); - - // Count the number of occurrences. - let n = search.count(); - assert_eq!(n, 4); - - // List the position of all occurrences. - let positions = search.locate(); - assert_eq!(positions, vec![246, 12, 300, 103]); - - // Extract preceding characters from a search position. - let i = 0; - let mut prefix = search.iter_backward(i).take(16).collect::>(); - prefix.reverse(); - assert_eq!(prefix, b"Duis aute irure ".to_owned()); - - // Extract succeeding characters from a search position. - let i = 3; - let postfix = search.iter_forward(i).take(20).collect::>(); - assert_eq!(postfix, b"dolore magna aliqua.".to_owned()); - - // Search can be chained backward. - let search_chained = search.search("et "); - assert_eq!(search_chained.count(), 1); -} diff --git a/src/lib.rs b/src/lib.rs index 4433c06..e9c2968 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -152,4 +152,4 @@ pub use crate::rlfmi::RLFMIndex; pub use character::Character; pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator}; -pub use search::{Search, SearchIndex, SearchIndexBuilder}; +pub use search::{Search, SearchIndex}; diff --git a/src/search.rs b/src/search.rs index a69c796..f875231 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,125 +1,10 @@ #[cfg(doc)] use crate::converter; -use crate::converter::{Converter, IndexWithConverter}; +use crate::converter::IndexWithConverter; use crate::iter::FMIndexBackend; -use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray}; -use crate::{seal, Character, FMIndex, RLFMIndex}; - -/// A builder that builds [`SearchIndex`]. -pub struct SearchIndexBuilder -where - T: Character, - C: Converter, -{ - converter: C, - // We avoid extracting parts into another `type` definition. - // Also, we use dyn trait in order not to add another type variable for this closure type. - #[allow(clippy::type_complexity)] - get_sample: Box S>, - _i: std::marker::PhantomData, - _t: std::marker::PhantomData, -} - -impl SearchIndexBuilder<(), T, C, ()> -where - T: Character, - C: Converter, -{ - /// Create a new [`SearchIndexBuilder`]. - /// - /// - `converter` is a [`Converter`] is used to convert the characters to a - /// smaller alphabet. Use [`converter::IdConverter`] if you don't need to - /// restrict the alphabet. Use [`converter::RangeConverter`] if you can - /// contrain characters to a particular range. See [`converter`] for more - /// details. - pub fn new( - converter: C, - ) -> SearchIndexBuilder, T, C, SuffixOrderSampledArray> - { - SearchIndexBuilder { - converter, - get_sample: Box::new(|sa| suffix_array::sample(sa, 0)), - _i: std::marker::PhantomData, - _t: std::marker::PhantomData, - } - } -} - -impl SearchIndexBuilder -where - T: Character, - C: Converter, -{ - /// Make sure the index only supports the count operation. - /// - /// The suffix array for the locate operation will be dropped from the index. - pub fn count_only(self) -> SearchIndexBuilder, T, C, ()> { - SearchIndexBuilder { - converter: self.converter, - get_sample: Box::new(|_| ()), - _i: std::marker::PhantomData, - _t: self._t, - } - } - - /// Make sure the index will use RLFM-Index, which encodes the backing Wavelet Matrix using run-length encoding. - /// - /// The index will be more space-efficient than the FM-Index, but is slower. - pub fn run_length_encoding(self) -> SearchIndexBuilder, T, C, S> { - SearchIndexBuilder { - converter: self.converter, - get_sample: self.get_sample, - _i: std::marker::PhantomData, - _t: self._t, - } - } -} - -impl SearchIndexBuilder -where - I: FMIndexBackend, - T: Character, - C: Converter, -{ - /// Adjust the sampling level of the suffix array to use for position lookup. - /// - /// A sampling level of 0 means the most memory is used (a full suffix-array is - /// retained), while looking up positions is faster. A sampling level of - /// 1 means half the memory is used, but looking up positions is slower. - /// Each increase in level halves the memory usage but slows down - /// position lookup. - pub fn level(mut self, level: usize) -> SearchIndexBuilder { - self.get_sample = Box::new(move |sa| suffix_array::sample(sa, level)); - self - } -} - -impl SearchIndexBuilder, T, C, S> -where - T: Character, - C: Converter, -{ - /// Build a new [SearchIndex] backed by [FMIndex]. - pub fn build(self, text: Vec) -> SearchIndex> { - SearchIndex { - index: FMIndex::create(text, self.converter, self.get_sample), - } - } -} - -impl SearchIndexBuilder, T, C, S> -where - T: Character, - C: Converter, -{ - /// Build a new [SearchIndex] backed by [RLFMIndex]. - pub fn build(self, text: Vec) -> SearchIndex> { - SearchIndex { - index: RLFMIndex::create(text, self.converter, self.get_sample), - } - } -} +use crate::seal; +use crate::suffix_array::HasPosition; /// A full-text index backed by FM-Index or its variant. pub struct SearchIndex {