diff --git a/benches/locate.rs b/benches/locate.rs index 16b8c48..2352f66 100644 --- a/benches/locate.rs +++ b/benches/locate.rs @@ -1,5 +1,4 @@ -use fm_index::suffix_array::HasPosition; -use fm_index::{FMIndexBackend, SearchIndexBuilder}; +use fm_index::{FMIndexBackend, HasPosition, SearchIndexBuilder}; use criterion::{criterion_group, criterion_main}; use criterion::{BatchSize, BenchmarkId, Criterion, Throughput}; diff --git a/src/builder.rs b/src/builder.rs index b5e9341..faf4f0a 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -3,7 +3,7 @@ use crate::converter; use crate::{ converter::{Converter, IdConverter}, - suffix_array::SuffixOrderSampledArray, + suffix_array::sample::SuffixOrderSampledArray, Character, FMIndex, RLFMIndex, }; diff --git a/src/fm_index.rs b/src/fm_index.rs index 1e65965..3a4f1fb 100644 --- a/src/fm_index.rs +++ b/src/fm_index.rs @@ -2,8 +2,8 @@ use crate::character::{prepare_text, Character}; #[cfg(doc)] use crate::converter; use crate::converter::{Converter, IndexWithConverter}; -use crate::iter::FMIndexBackend; -use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray}; +use crate::iter::{FMIndexBackend, HasPosition}; +use crate::suffix_array::sample::{self, SuffixOrderSampledArray}; use crate::{sais, seal, HeapSize}; use crate::{util, Search}; @@ -39,7 +39,7 @@ where C: Converter, { pub(crate) fn new(text: Vec, converter: C, level: usize) -> Self { - Self::create(text, converter, |sa| suffix_array::sample(sa, level)) + Self::create(text, converter, |sa| sample::sample(sa, level)) } } diff --git a/src/iter.rs b/src/iter.rs index 1cb9df0..e6f6aa3 100644 --- a/src/iter.rs +++ b/src/iter.rs @@ -68,6 +68,12 @@ pub trait HeapSize { fn size(&self) -> usize; } +/// A trait for an index that supports locate queries. +pub trait HasPosition { + #[doc(hidden)] + fn get_sa(&self, i: u64) -> u64; +} + /// An iterator that goes backwards through the text, producing [`Character`]. pub struct BackwardIterator<'a, I> where diff --git a/src/lib.rs b/src/lib.rs index dc36428..4067369 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -135,7 +135,6 @@ #![warn(missing_docs)] pub mod converter; -pub mod suffix_array; mod builder; mod character; @@ -145,6 +144,7 @@ mod rlfmi; mod sais; mod seal; mod search; +mod suffix_array; mod util; pub use crate::fm_index::FMIndex; @@ -152,5 +152,5 @@ pub use crate::rlfmi::RLFMIndex; pub use builder::SearchIndexBuilder; pub use character::Character; -pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HeapSize}; +pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HasPosition, HeapSize}; pub use search::Search; diff --git a/src/rlfmi.rs b/src/rlfmi.rs index 9ec199b..0f0d2ff 100644 --- a/src/rlfmi.rs +++ b/src/rlfmi.rs @@ -2,8 +2,8 @@ use crate::character::{prepare_text, Character}; #[cfg(doc)] use crate::converter; use crate::converter::{Converter, IndexWithConverter}; -use crate::iter::FMIndexBackend; -use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray}; +use crate::iter::{FMIndexBackend, HasPosition}; +use crate::suffix_array::sample::{self, SuffixOrderSampledArray}; use crate::{sais, HeapSize, Search}; use crate::{seal, util}; @@ -41,7 +41,7 @@ where C: Converter, { pub(crate) fn new(text: Vec, converter: C, level: usize) -> Self { - Self::create(text, converter, |sa| suffix_array::sample(sa, level)) + Self::create(text, converter, |sa| sample::sample(sa, level)) } } diff --git a/src/search.rs b/src/search.rs index 6725586..a713f68 100644 --- a/src/search.rs +++ b/src/search.rs @@ -4,9 +4,8 @@ use crate::character::Character; use crate::converter; use crate::converter::IndexWithConverter; -use crate::iter::FMIndexBackend; +use crate::iter::{FMIndexBackend, HasPosition}; use crate::seal; -use crate::suffix_array::HasPosition; /// An object containing the result of a search. /// diff --git a/src/suffix_array.rs b/src/suffix_array.rs index 21a3056..3068146 100644 --- a/src/suffix_array.rs +++ b/src/suffix_array.rs @@ -1,112 +1,5 @@ //! Suffix arrays, used to construct the index. //! //! Can also be used in sampled fashion to perform locate queries. -use crate::{seal, util}; -use std::fmt; -use serde::{Deserialize, Serialize}; -use vers_vecs::BitVec; - -/// A trait for an index that supports locate queries. -/// -/// This is only supported when [`SuffixOrderSampledArray`] is passed in. -pub trait HasPosition { - #[doc(hidden)] - fn get_sa(&self, i: u64) -> u64; -} - -/// A sampled suffix array, stored within the index. -#[derive(Serialize, Deserialize)] -pub struct SuffixOrderSampledArray { - level: usize, - word_size: usize, - sa: BitVec, - len: usize, -} - -impl SuffixOrderSampledArray { - pub(crate) fn get(&self, i: u64) -> Option { - debug_assert!(i < self.len as u64); - if i & ((1 << self.level) - 1) == 0 { - Some( - self.sa.get_bits_unchecked( - (i as usize >> self.level) * self.word_size, - self.word_size, - ), - ) - } else { - None - } - } - - pub(crate) fn size(&self) -> usize { - std::mem::size_of::() + self.sa.heap_size() - } -} - -impl fmt::Debug for SuffixOrderSampledArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - for i in 0..self.len { - match self.get(i as u64) { - Some(sa) => write!(f, "{}", sa)?, - None => write!(f, "?")?, - } - } - Ok(()) - } -} - -pub(crate) fn sample(sa: &[u64], level: usize) -> SuffixOrderSampledArray { - let n = sa.len(); - let word_size = (util::log2(n as u64) + 1) as usize; - debug_assert!(n > 0); - debug_assert!( - n > (1 << level), - "sampling level L must satisfy 2^L < text_len (L = {}, text_len = {})", - level, - n, - ); - let sa_samples_len = ((n - 1) >> level) + 1; - let mut sa_samples = BitVec::with_capacity(sa_samples_len); - // fid::BitArray::with_word_size(word_size, sa_samples_len); - for i in 0..sa_samples_len { - sa_samples.append_bits(sa[i << level], word_size); - } - SuffixOrderSampledArray { - level, - word_size, - sa: sa_samples, - len: sa.len(), - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_regular() { - let cases = [ - (1, 10), - (1, 25), - (2, 8), - (2, 9), - (2, 10), - (2, 25), - (3, 24), - (3, 25), - ]; - for &(level, n) in cases.iter() { - let sa = (0..n).collect::>(); - let ssa = sample(&sa, level); - for i in 0..n { - let v = ssa.get(i); - if i & ((1 << level) - 1) == 0 { - assert_eq!(v, Some(i), "ssa[{}] should be Some({})", i, i); - } else { - assert_eq!(v, None, "ssa[{}] should be None", i); - } - } - } - } -} +pub mod sample; diff --git a/src/suffix_array/sample.rs b/src/suffix_array/sample.rs new file mode 100644 index 0000000..e3df26b --- /dev/null +++ b/src/suffix_array/sample.rs @@ -0,0 +1,102 @@ +//! Sampled suffix arrays to perform locate queries. +use crate::util; +use std::fmt; + +use serde::{Deserialize, Serialize}; +use vers_vecs::BitVec; + +/// A sampled suffix array, stored within the index. +#[derive(Serialize, Deserialize)] +pub struct SuffixOrderSampledArray { + level: usize, + word_size: usize, + sa: BitVec, + len: usize, +} + +impl SuffixOrderSampledArray { + pub(crate) fn get(&self, i: u64) -> Option { + debug_assert!(i < self.len as u64); + if i & ((1 << self.level) - 1) == 0 { + Some( + self.sa.get_bits_unchecked( + (i as usize >> self.level) * self.word_size, + self.word_size, + ), + ) + } else { + None + } + } + + pub(crate) fn size(&self) -> usize { + std::mem::size_of::() + self.sa.heap_size() + } +} + +impl fmt::Debug for SuffixOrderSampledArray { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for i in 0..self.len { + match self.get(i as u64) { + Some(sa) => write!(f, "{}", sa)?, + None => write!(f, "?")?, + } + } + Ok(()) + } +} + +pub(crate) fn sample(sa: &[u64], level: usize) -> SuffixOrderSampledArray { + let n = sa.len(); + let word_size = (util::log2(n as u64) + 1) as usize; + debug_assert!(n > 0); + debug_assert!( + n > (1 << level), + "sampling level L must satisfy 2^L < text_len (L = {}, text_len = {})", + level, + n, + ); + let sa_samples_len = ((n - 1) >> level) + 1; + let mut sa_samples = BitVec::with_capacity(sa_samples_len); + // fid::BitArray::with_word_size(word_size, sa_samples_len); + for i in 0..sa_samples_len { + sa_samples.append_bits(sa[i << level], word_size); + } + SuffixOrderSampledArray { + level, + word_size, + sa: sa_samples, + len: sa.len(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_regular() { + let cases = [ + (1, 10), + (1, 25), + (2, 8), + (2, 9), + (2, 10), + (2, 25), + (3, 24), + (3, 25), + ]; + for &(level, n) in cases.iter() { + let sa = (0..n).collect::>(); + let ssa = sample(&sa, level); + for i in 0..n { + let v = ssa.get(i); + if i & ((1 << level) - 1) == 0 { + assert_eq!(v, Some(i), "ssa[{}] should be Some({})", i, i); + } else { + assert_eq!(v, None, "ssa[{}] should be None", i); + } + } + } + } +}