From 1d078d7f31b0d4fa7ad9e0fc0d20774bc11d9080 Mon Sep 17 00:00:00 2001 From: Martijn Faassen Date: Wed, 29 Jan 2025 15:31:43 +0100 Subject: [PATCH 1/2] Refine traits and documentation surrounding them. --- src/fm_index.rs | 36 ++++++++++++++-- src/iter.rs | 49 ++++++++++++++++------ src/lib.rs | 2 +- src/rlfmi.rs | 26 +++++++++++- src/search.rs | 2 +- tests/test_api.rs | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 197 insertions(+), 21 deletions(-) create mode 100644 tests/test_api.rs diff --git a/src/fm_index.rs b/src/fm_index.rs index 7af4452..1e65965 100644 --- a/src/fm_index.rs +++ b/src/fm_index.rs @@ -4,7 +4,7 @@ use crate::converter; use crate::converter::{Converter, IndexWithConverter}; use crate::iter::FMIndexBackend; use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray}; -use crate::{sais, seal}; +use crate::{sais, seal, HeapSize}; use crate::{util, Search}; use serde::{Deserialize, Serialize}; @@ -95,7 +95,11 @@ where } } -impl FMIndex { +impl FMIndex +where + T: Character, + C: Converter, +{ /// The size on the heap of the FM-Index. /// /// No suffix array information is stored in this index. @@ -106,7 +110,11 @@ impl FMIndex { } } -impl FMIndex { +impl FMIndex +where + T: Character, + C: Converter, +{ /// The size on the heap of the FM-Index. /// /// Sampled suffix array data is stored in this index. @@ -118,6 +126,26 @@ impl FMIndex { } } +impl HeapSize for FMIndex +where + T: Character, + C: Converter, +{ + fn size(&self) -> usize { + FMIndex::::size(self) + } +} + +impl HeapSize for FMIndex +where + T: Character, + C: Converter, +{ + fn size(&self) -> usize { + FMIndex::::size(self) + } +} + impl seal::Sealed for FMIndex {} impl FMIndexBackend for FMIndex @@ -127,7 +155,7 @@ where { type T = T; - fn len(&self) -> u64 { + fn len(&self) -> u64 { self.bw.len() as u64 } diff --git a/src/iter.rs b/src/iter.rs index 479b18d..1cb9df0 100644 --- a/src/iter.rs +++ b/src/iter.rs @@ -3,13 +3,16 @@ use crate::converter::{Converter, IndexWithConverter}; use crate::seal; use crate::search::Search; -/// A FM-Index that can search texts backwards and forwards. +/// Trait for an FM-Index implementation. +/// +/// You can use this to implement against a FM-Index generically. +/// +/// You cannot implement this trait yourself. pub trait FMIndexBackend: Sized + seal::Sealed { /// A [`Character`] type. type T: Character; - #[doc(hidden)] - fn len(&self) -> u64; + // We hide all the methods involved in implementation. #[doc(hidden)] fn get_l(&self, i: u64) -> Self::T; @@ -24,25 +27,45 @@ pub trait FMIndexBackend: Sized + seal::Sealed { #[doc(hidden)] fn fl_map2(&self, c: Self::T, i: u64) -> u64; - #[doc(hidden)] - fn search(&self, pattern: K) -> Search - where - K: AsRef<[Self::T]>, - { - Search::new(self).search(pattern) - } - #[doc(hidden)] fn iter_forward(&self, i: u64) -> ForwardIterator { - debug_assert!(i < self.len::()); + debug_assert!(i < self.len()); ForwardIterator { index: self, i } } #[doc(hidden)] fn iter_backward(&self, i: u64) -> BackwardIterator { - debug_assert!(i < self.len::()); + debug_assert!(i < self.len()); BackwardIterator { index: self, i } } + + // The following methods are public. + + /// Search for a pattern in the text. + /// + /// Return a [`Search`] object with information about the search + /// result. + fn search(&self, pattern: K) -> Search + where + K: AsRef<[Self::T]>, + { + Search::new(self).search(pattern) + } + + /// The size of the text in the index + /// + /// Note that this includes an ending \0 (terminator) character + /// so will be one more than the length of the text. + fn len(&self) -> u64; +} + +/// Access the heap size of the structure. +/// +/// This can be useful if you want to fine-tune the memory usage of your +/// application. +pub trait HeapSize { + /// The size on the heap of this structure, in bytes. + fn size(&self) -> usize; } /// An iterator that goes backwards through the text, producing [`Character`]. diff --git a/src/lib.rs b/src/lib.rs index bbaf899..62d146b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -152,5 +152,5 @@ pub use crate::rlfmi::RLFMIndex; pub use builder::SearchIndexBuilder; pub use character::Character; -pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator}; +pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HeapSize}; pub use search::{Search, SearchIndex}; diff --git a/src/rlfmi.rs b/src/rlfmi.rs index a3203f2..9ec199b 100644 --- a/src/rlfmi.rs +++ b/src/rlfmi.rs @@ -4,7 +4,7 @@ use crate::converter; use crate::converter::{Converter, IndexWithConverter}; use crate::iter::FMIndexBackend; use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray}; -use crate::{sais, Search}; +use crate::{sais, HeapSize, Search}; use crate::{seal, util}; use serde::{Deserialize, Serialize}; @@ -142,6 +142,7 @@ where impl RLFMIndex where T: Character, + C: Converter, { /// Heap size of the index. /// @@ -158,6 +159,7 @@ where impl RLFMIndex where T: Character, + C: Converter, { /// The size on the heap of the FM-Index. /// @@ -172,6 +174,26 @@ where } } +impl HeapSize for RLFMIndex +where + T: Character, + C: Converter, +{ + fn size(&self) -> usize { + RLFMIndex::::size(self) + } +} + +impl HeapSize for RLFMIndex +where + T: Character, + C: Converter, +{ + fn size(&self) -> usize { + RLFMIndex::::size(self) + } +} + impl seal::Sealed for RLFMIndex {} impl FMIndexBackend for RLFMIndex @@ -181,7 +203,7 @@ where { type T = T; - fn len(&self) -> u64 { + fn len(&self) -> u64 { self.len } diff --git a/src/search.rs b/src/search.rs index 9035027..b48bbba 100644 --- a/src/search.rs +++ b/src/search.rs @@ -42,7 +42,7 @@ where Search { index, s: 0, - e: index.len::(), + e: index.len(), pattern: vec![], } } diff --git a/tests/test_api.rs b/tests/test_api.rs new file mode 100644 index 0000000..59cf6ba --- /dev/null +++ b/tests/test_api.rs @@ -0,0 +1,103 @@ +// tests that exercise the public API, especially the traits + +use fm_index::{FMIndexBackend, HeapSize}; + +fn len(index: &T) -> u64 { + index.len() +} + +fn size(t: &T) -> usize { + t.size() +} + +#[test] +fn test_fm_index_backend_trait_fm_index_suffix_array() { + let builder = fm_index::SearchIndexBuilder::new(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert_eq!(len(&index), 5); +} + +#[test] +fn test_heap_size_trait_fm_index_suffix_array() { + let builder = fm_index::SearchIndexBuilder::new(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert!(size(&index) > 0); +} + +#[test] +fn test_fm_index_backend_trait_fm_index_count_only() { + let builder = fm_index::SearchIndexBuilder::new().count_only(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert_eq!(len(&index), 5); +} + +#[test] +fn test_heap_size_trait_fm_index_count_only() { + let builder = fm_index::SearchIndexBuilder::new().count_only(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert!(size(&index) > 0); +} + +#[test] +fn test_fm_index_backend_trait_rlfm_index_suffix_array() { + let builder = fm_index::SearchIndexBuilder::new().run_length_encoding(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert_eq!(len(&index), 5); +} + +#[test] +fn test_heap_size_trait_rlfm_index_suffix_array() { + let builder = fm_index::SearchIndexBuilder::new().run_length_encoding(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert!(size(&index) > 0); +} + +#[test] +fn test_fm_index_backend_trait_rlfm_index_count_only() { + let builder = fm_index::SearchIndexBuilder::new() + .count_only() + .run_length_encoding(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert_eq!(len(&index), 5); +} + +#[test] +fn test_heap_size_trait_rlfm_index_count_only() { + let builder = fm_index::SearchIndexBuilder::new() + .count_only() + .run_length_encoding(); + let text = "text"; + + let index = builder.build(text.as_bytes().to_vec()); + + // any result will do for this test + assert!(size(&index) > 0); +} From 72a86b2b633c49f7260b863b1786ffb367015a67 Mon Sep 17 00:00:00 2001 From: Martijn Faassen Date: Wed, 29 Jan 2025 15:32:26 +0100 Subject: [PATCH 2/2] Remove the Search struct in favor of the trait. --- src/lib.rs | 2 +- src/search.rs | 15 --------------- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 62d146b..dc36428 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -153,4 +153,4 @@ pub use crate::rlfmi::RLFMIndex; pub use builder::SearchIndexBuilder; pub use character::Character; pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HeapSize}; -pub use search::{Search, SearchIndex}; +pub use search::Search; diff --git a/src/search.rs b/src/search.rs index b48bbba..6725586 100644 --- a/src/search.rs +++ b/src/search.rs @@ -8,21 +8,6 @@ use crate::iter::FMIndexBackend; use crate::seal; use crate::suffix_array::HasPosition; -/// A full-text index backed by FM-Index or its variant. -pub struct SearchIndex { - index: I, -} - -impl SearchIndex { - /// Search for a pattern in the text. - /// - /// Return a [`Search`] object with information about the search - /// result. - pub fn search>(&self, pattern: K) -> Search { - self.index.search(pattern) - } -} - /// An object containing the result of a search. /// /// This is expanded with a `locate` method if the index is