Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restructure suffix array module #50

Merged
merged 4 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions benches/locate.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use fm_index::suffix_array::HasPosition;
use fm_index::{FMIndexBackend, SearchIndexBuilder};
use fm_index::{FMIndexBackend, HasPosition, SearchIndexBuilder};

use criterion::{criterion_group, criterion_main};
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput};
Expand Down
2 changes: 1 addition & 1 deletion src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::converter;

use crate::{
converter::{Converter, IdConverter},
suffix_array::SuffixOrderSampledArray,
suffix_array::sample::SuffixOrderSampledArray,
Character, FMIndex, RLFMIndex,
};

Expand Down
6 changes: 3 additions & 3 deletions src/fm_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use crate::character::{prepare_text, Character};
#[cfg(doc)]
use crate::converter;
use crate::converter::{Converter, IndexWithConverter};
use crate::iter::FMIndexBackend;
use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray};
use crate::iter::{FMIndexBackend, HasPosition};
use crate::suffix_array::sample::{self, SuffixOrderSampledArray};
use crate::{sais, seal, HeapSize};
use crate::{util, Search};

Expand Down Expand Up @@ -39,7 +39,7 @@ where
C: Converter<T>,
{
pub(crate) fn new(text: Vec<T>, converter: C, level: usize) -> Self {
Self::create(text, converter, |sa| suffix_array::sample(sa, level))
Self::create(text, converter, |sa| sample::sample(sa, level))
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ pub trait HeapSize {
fn size(&self) -> usize;
}

/// A trait for an index that supports locate queries.
pub trait HasPosition {
// Kept out of the rendered docs via `#[doc(hidden)]`.
// NOTE(review): the `L: seal::IsLocal` type parameter looks like a sealing
// device so that only this crate can name `L` and call the method — confirm
// against the `seal` module.
// NOTE(review): presumably returns the suffix array value (text position) for
// index `i` — confirm against the implementors.
#[doc(hidden)]
fn get_sa<L: seal::IsLocal>(&self, i: u64) -> u64;
}

/// An iterator that goes backwards through the text, producing [`Character`].
pub struct BackwardIterator<'a, I>
where
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@
#![warn(missing_docs)]

pub mod converter;
pub mod suffix_array;

mod builder;
mod character;
Expand All @@ -145,12 +144,13 @@ mod rlfmi;
mod sais;
mod seal;
mod search;
mod suffix_array;
mod util;

pub use crate::fm_index::FMIndex;
pub use crate::rlfmi::RLFMIndex;

pub use builder::SearchIndexBuilder;
pub use character::Character;
pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HeapSize};
pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HasPosition, HeapSize};
pub use search::Search;
6 changes: 3 additions & 3 deletions src/rlfmi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use crate::character::{prepare_text, Character};
#[cfg(doc)]
use crate::converter;
use crate::converter::{Converter, IndexWithConverter};
use crate::iter::FMIndexBackend;
use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray};
use crate::iter::{FMIndexBackend, HasPosition};
use crate::suffix_array::sample::{self, SuffixOrderSampledArray};
use crate::{sais, HeapSize, Search};
use crate::{seal, util};

Expand Down Expand Up @@ -41,7 +41,7 @@ where
C: Converter<T>,
{
pub(crate) fn new(text: Vec<T>, converter: C, level: usize) -> Self {
Self::create(text, converter, |sa| suffix_array::sample(sa, level))
Self::create(text, converter, |sa| sample::sample(sa, level))
}
}

Expand Down
3 changes: 1 addition & 2 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@ use crate::character::Character;
use crate::converter;

use crate::converter::IndexWithConverter;
use crate::iter::FMIndexBackend;
use crate::iter::{FMIndexBackend, HasPosition};
use crate::seal;
use crate::suffix_array::HasPosition;

/// An object containing the result of a search.
///
Expand Down
109 changes: 1 addition & 108 deletions src/suffix_array.rs
Original file line number Diff line number Diff line change
@@ -1,112 +1,5 @@
//! Suffix arrays, used to construct the index.
//!
//! Can also be used in sampled fashion to perform locate queries.
use crate::{seal, util};
use std::fmt;

use serde::{Deserialize, Serialize};
use vers_vecs::BitVec;

/// A trait for an index that supports locate queries.
///
/// This is only supported when [`SuffixOrderSampledArray`] is passed in.
pub trait HasPosition {
// Kept out of the rendered docs via `#[doc(hidden)]`.
// NOTE(review): the `L: seal::IsLocal` type parameter looks like a sealing
// device so that only this crate can name `L` and call the method — confirm
// against the `seal` module.
// NOTE(review): presumably returns the suffix array value (text position) for
// index `i` — confirm against the implementors.
#[doc(hidden)]
fn get_sa<L: seal::IsLocal>(&self, i: u64) -> u64;
}

/// A sampled suffix array, stored within the index.
///
/// Only every `2^level`-th entry of the full suffix array is kept; the kept
/// entries are packed back to back into a bit vector, `word_size` bits each.
#[derive(Serialize, Deserialize)]
pub struct SuffixOrderSampledArray {
/// Sampling level: index `i` has a stored value only when `i` is a multiple of `2^level`.
level: usize,
/// Width in bits of each packed sample (`sample` sets it to `util::log2(n) + 1`).
word_size: usize,
/// The packed samples, in suffix array order.
sa: BitVec,
/// Length of the original, unsampled suffix array.
len: usize,
}

impl SuffixOrderSampledArray {
    /// Looks up the suffix array value at index `i`, if that index was sampled.
    ///
    /// Returns `Some(value)` when `i` is a multiple of `2^level`, and `None`
    /// otherwise. `i` must be smaller than the length of the original suffix
    /// array; this is only checked in debug builds.
    pub(crate) fn get(&self, i: u64) -> Option<u64> {
        debug_assert!(i < self.len as u64);

        // The low `level` bits of a sampled index are all zero.
        let low_bits: u64 = (1 << self.level) - 1;
        if i & low_bits != 0 {
            return None;
        }

        // Samples are packed contiguously, `word_size` bits per sample.
        let bit_offset = (i as usize >> self.level) * self.word_size;
        Some(self.sa.get_bits_unchecked(bit_offset, self.word_size))
    }

    /// Size of this structure in bytes: the struct itself plus the heap size
    /// reported by the underlying bit vector.
    pub(crate) fn size(&self) -> usize {
        let inline_bytes = std::mem::size_of::<Self>();
        let heap_bytes = self.sa.heap_size();
        inline_bytes + heap_bytes
    }
}

impl fmt::Debug for SuffixOrderSampledArray {
    /// Writes one token per position of the original suffix array, without
    /// separators: the stored value for sampled positions, `?` for the rest.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        (0..self.len).try_for_each(|pos| match self.get(pos as u64) {
            Some(sa) => write!(f, "{}", sa),
            None => write!(f, "?"),
        })
    }
}

/// Builds a [`SuffixOrderSampledArray`] that keeps every `2^level`-th entry of `sa`.
///
/// Each kept entry is stored using `util::log2(sa.len()) + 1` bits, and the
/// length of `sa` is recorded so lookups can be range-checked in debug builds.
///
/// # Panics
///
/// In debug builds, panics if `sa` is empty or if `2^level >= sa.len()`.
pub(crate) fn sample(sa: &[u64], level: usize) -> SuffixOrderSampledArray {
    let n = sa.len();
    // Check the preconditions before deriving anything from `n`, so that an
    // empty input is reported by the assertion itself.
    debug_assert!(n > 0);
    debug_assert!(
        n > (1 << level),
        "sampling level L must satisfy 2^L < text_len (L = {}, text_len = {})",
        level,
        n,
    );
    let word_size = (util::log2(n as u64) + 1) as usize;
    let sa_samples_len = ((n - 1) >> level) + 1;
    // The bit vector's capacity is counted in bits, and every sample occupies
    // `word_size` of them.
    let mut sa_samples = BitVec::with_capacity(sa_samples_len * word_size);
    // Visits indices 0, 2^level, 2 * 2^level, ... — exactly `sa_samples_len` entries.
    for &entry in sa.iter().step_by(1 << level) {
        sa_samples.append_bits(entry, word_size);
    }
    SuffixOrderSampledArray {
        level,
        word_size,
        sa: sa_samples,
        len: n,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// For several (level, length) pairs, samples the identity suffix array and
    /// checks that exactly the multiples of `2^level` are retrievable.
    #[test]
    fn test_regular() {
        let cases = [
            (1, 10),
            (1, 25),
            (2, 8),
            (2, 9),
            (2, 10),
            (2, 25),
            (3, 24),
            (3, 25),
        ];
        for &(level, n) in &cases {
            let sa: Vec<u64> = (0..n).collect();
            let ssa = sample(&sa, level);
            let low_bits: u64 = (1 << level) - 1;
            for i in 0..n {
                let v = ssa.get(i);
                if i & low_bits != 0 {
                    assert_eq!(v, None, "ssa[{}] should be None", i);
                    continue;
                }
                assert_eq!(v, Some(i), "ssa[{}] should be Some({})", i, i);
            }
        }
    }
}
pub mod sample;
102 changes: 102 additions & 0 deletions src/suffix_array/sample.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//! Sampled suffix arrays to perform locate queries.
use crate::util;
use std::fmt;

use serde::{Deserialize, Serialize};
use vers_vecs::BitVec;

/// A sampled suffix array, stored within the index.
///
/// Only every `2^level`-th entry of the full suffix array is kept; the kept
/// entries are packed back to back into a bit vector, `word_size` bits each.
#[derive(Serialize, Deserialize)]
pub struct SuffixOrderSampledArray {
/// Sampling level: index `i` has a stored value only when `i` is a multiple of `2^level`.
level: usize,
/// Width in bits of each packed sample (`sample` sets it to `util::log2(n) + 1`).
word_size: usize,
/// The packed samples, in suffix array order.
sa: BitVec,
/// Length of the original, unsampled suffix array.
len: usize,
}

impl SuffixOrderSampledArray {
    /// Looks up the suffix array value at index `i`, if that index was sampled.
    ///
    /// Returns `Some(value)` when `i` is a multiple of `2^level`, and `None`
    /// otherwise. `i` must be smaller than the length of the original suffix
    /// array; this is only checked in debug builds.
    pub(crate) fn get(&self, i: u64) -> Option<u64> {
        debug_assert!(i < self.len as u64);

        // The low `level` bits of a sampled index are all zero.
        let low_bits: u64 = (1 << self.level) - 1;
        if i & low_bits != 0 {
            return None;
        }

        // Samples are packed contiguously, `word_size` bits per sample.
        let bit_offset = (i as usize >> self.level) * self.word_size;
        Some(self.sa.get_bits_unchecked(bit_offset, self.word_size))
    }

    /// Size of this structure in bytes: the struct itself plus the heap size
    /// reported by the underlying bit vector.
    pub(crate) fn size(&self) -> usize {
        let inline_bytes = std::mem::size_of::<Self>();
        let heap_bytes = self.sa.heap_size();
        inline_bytes + heap_bytes
    }
}

impl fmt::Debug for SuffixOrderSampledArray {
    /// Writes one token per position of the original suffix array, without
    /// separators: the stored value for sampled positions, `?` for the rest.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        (0..self.len).try_for_each(|pos| match self.get(pos as u64) {
            Some(sa) => write!(f, "{}", sa),
            None => write!(f, "?"),
        })
    }
}

/// Builds a [`SuffixOrderSampledArray`] that keeps every `2^level`-th entry of `sa`.
///
/// Each kept entry is stored using `util::log2(sa.len()) + 1` bits, and the
/// length of `sa` is recorded so lookups can be range-checked in debug builds.
///
/// # Panics
///
/// In debug builds, panics if `sa` is empty or if `2^level >= sa.len()`.
pub(crate) fn sample(sa: &[u64], level: usize) -> SuffixOrderSampledArray {
    let n = sa.len();
    // Check the preconditions before deriving anything from `n`, so that an
    // empty input is reported by the assertion itself.
    debug_assert!(n > 0);
    debug_assert!(
        n > (1 << level),
        "sampling level L must satisfy 2^L < text_len (L = {}, text_len = {})",
        level,
        n,
    );
    let word_size = (util::log2(n as u64) + 1) as usize;
    let sa_samples_len = ((n - 1) >> level) + 1;
    // The bit vector's capacity is counted in bits, and every sample occupies
    // `word_size` of them.
    let mut sa_samples = BitVec::with_capacity(sa_samples_len * word_size);
    // Visits indices 0, 2^level, 2 * 2^level, ... — exactly `sa_samples_len` entries.
    for &entry in sa.iter().step_by(1 << level) {
        sa_samples.append_bits(entry, word_size);
    }
    SuffixOrderSampledArray {
        level,
        word_size,
        sa: sa_samples,
        len: n,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// For several (level, length) pairs, samples the identity suffix array and
    /// checks that exactly the multiples of `2^level` are retrievable.
    #[test]
    fn test_regular() {
        let cases = [
            (1, 10),
            (1, 25),
            (2, 8),
            (2, 9),
            (2, 10),
            (2, 25),
            (3, 24),
            (3, 25),
        ];
        for &(level, n) in &cases {
            let sa: Vec<u64> = (0..n).collect();
            let ssa = sample(&sa, level);
            let low_bits: u64 = (1 << level) - 1;
            for i in 0..n {
                let v = ssa.get(i);
                if i & low_bits != 0 {
                    assert_eq!(v, None, "ssa[{}] should be None", i);
                    continue;
                }
                assert_eq!(v, Some(i), "ssa[{}] should be Some({})", i, i);
            }
        }
    }
}