Skip to content

Commit

Permalink
Merge pull request #50 from ajalab/restructure-suffix-array-module
Browse files Browse the repository at this point in the history
Restructure suffix array module
  • Loading branch information
ajalab authored Jan 31, 2025
2 parents c493db1 + b9a3100 commit 735005a
Show file tree
Hide file tree
Showing 9 changed files with 120 additions and 121 deletions.
3 changes: 1 addition & 2 deletions benches/locate.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use fm_index::suffix_array::HasPosition;
use fm_index::{FMIndexBackend, SearchIndexBuilder};
use fm_index::{FMIndexBackend, HasPosition, SearchIndexBuilder};

use criterion::{criterion_group, criterion_main};
use criterion::{BatchSize, BenchmarkId, Criterion, Throughput};
Expand Down
2 changes: 1 addition & 1 deletion src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::converter;

use crate::{
converter::{Converter, IdConverter},
suffix_array::SuffixOrderSampledArray,
suffix_array::sample::SuffixOrderSampledArray,
Character, FMIndex, RLFMIndex,
};

Expand Down
6 changes: 3 additions & 3 deletions src/fm_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use crate::character::{prepare_text, Character};
#[cfg(doc)]
use crate::converter;
use crate::converter::{Converter, IndexWithConverter};
use crate::iter::FMIndexBackend;
use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray};
use crate::iter::{FMIndexBackend, HasPosition};
use crate::suffix_array::sample::{self, SuffixOrderSampledArray};
use crate::{sais, seal, HeapSize};
use crate::{util, Search};

Expand Down Expand Up @@ -39,7 +39,7 @@ where
C: Converter<T>,
{
pub(crate) fn new(text: Vec<T>, converter: C, level: usize) -> Self {
Self::create(text, converter, |sa| suffix_array::sample(sa, level))
Self::create(text, converter, |sa| sample::sample(sa, level))
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ pub trait HeapSize {
fn size(&self) -> usize;
}

/// A trait for an index that supports locate queries.
///
/// Implementors provide a single lookup, [`HasPosition::get_sa`], which maps
/// a `u64` index to a `u64` value.
pub trait HasPosition {
// Kept out of the rendered documentation via `#[doc(hidden)]`.
// The `L: seal::IsLocal` bound names a trait from the crate's `seal` module;
// presumably this is a sealed-method pattern so that only crate-internal
// code can call it -- TODO confirm against `seal`.
// NOTE(review): looks like this returns the suffix-array value at index `i`
// -- confirm with the implementors.
#[doc(hidden)]
fn get_sa<L: seal::IsLocal>(&self, i: u64) -> u64;
}

/// An iterator that goes backwards through the text, producing [`Character`].
pub struct BackwardIterator<'a, I>
where
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@
#![warn(missing_docs)]

pub mod converter;
pub mod suffix_array;

mod builder;
mod character;
Expand All @@ -145,12 +144,13 @@ mod rlfmi;
mod sais;
mod seal;
mod search;
mod suffix_array;
mod util;

pub use crate::fm_index::FMIndex;
pub use crate::rlfmi::RLFMIndex;

pub use builder::SearchIndexBuilder;
pub use character::Character;
pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HeapSize};
pub use iter::{BackwardIterator, FMIndexBackend, ForwardIterator, HasPosition, HeapSize};
pub use search::Search;
6 changes: 3 additions & 3 deletions src/rlfmi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use crate::character::{prepare_text, Character};
#[cfg(doc)]
use crate::converter;
use crate::converter::{Converter, IndexWithConverter};
use crate::iter::FMIndexBackend;
use crate::suffix_array::{self, HasPosition, SuffixOrderSampledArray};
use crate::iter::{FMIndexBackend, HasPosition};
use crate::suffix_array::sample::{self, SuffixOrderSampledArray};
use crate::{sais, HeapSize, Search};
use crate::{seal, util};

Expand Down Expand Up @@ -41,7 +41,7 @@ where
C: Converter<T>,
{
pub(crate) fn new(text: Vec<T>, converter: C, level: usize) -> Self {
Self::create(text, converter, |sa| suffix_array::sample(sa, level))
Self::create(text, converter, |sa| sample::sample(sa, level))
}
}

Expand Down
3 changes: 1 addition & 2 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@ use crate::character::Character;
use crate::converter;

use crate::converter::IndexWithConverter;
use crate::iter::FMIndexBackend;
use crate::iter::{FMIndexBackend, HasPosition};
use crate::seal;
use crate::suffix_array::HasPosition;

/// An object containing the result of a search.
///
Expand Down
109 changes: 1 addition & 108 deletions src/suffix_array.rs
Original file line number Diff line number Diff line change
@@ -1,112 +1,5 @@
//! Suffix arrays, used to construct the index.
//!
//! Can also be used in sampled fashion to perform locate queries.
use crate::{seal, util};
use std::fmt;
use serde::{Deserialize, Serialize};
use vers_vecs::BitVec;

/// A trait for an index that supports locate queries.
///
/// This is only supported when [`SuffixOrderSampledArray`] is passed in.
///
/// Implementors provide a single lookup, [`HasPosition::get_sa`], which maps
/// a `u64` index to a `u64` value.
pub trait HasPosition {
// Kept out of the rendered documentation via `#[doc(hidden)]`.
// The `L: seal::IsLocal` bound names a trait from the crate's `seal` module;
// presumably this is a sealed-method pattern so that only crate-internal
// code can call it -- TODO confirm against `seal`.
// NOTE(review): looks like this returns the suffix-array value at index `i`
// -- confirm with the implementors.
#[doc(hidden)]
fn get_sa<L: seal::IsLocal>(&self, i: u64) -> u64;
}

/// A sampled suffix array, stored within the index.
///
/// Only the suffix array entries at positions that are multiples of
/// `2^level` are kept; each kept entry is packed into `sa` using
/// `word_size` bits.
#[derive(Serialize, Deserialize)]
pub struct SuffixOrderSampledArray {
/// Sampling level: a position is stored only if its low `level` bits are zero.
level: usize,
/// Number of bits occupied by each stored sample inside `sa`.
word_size: usize,
/// Packed samples, `word_size` bits apiece, in increasing position order.
sa: BitVec,
/// Length of the full (unsampled) suffix array this was built from.
len: usize,
}

impl SuffixOrderSampledArray {
    /// Looks up the sample for position `i` of the original suffix array.
    ///
    /// Returns `Some(value)` when `i` is a multiple of `2^level` (i.e. a
    /// sampled position) and `None` for every other position. In debug
    /// builds, `i` is asserted to be smaller than the original length.
    pub(crate) fn get(&self, i: u64) -> Option<u64> {
        debug_assert!(i < self.len as u64);
        // Guard clause: positions with any of the low `level` bits set were
        // never stored.
        if i & ((1 << self.level) - 1) != 0 {
            return None;
        }
        // Sample number `i / 2^level` starts `word_size` bits after the
        // previous one in the packed vector.
        let slot = i as usize >> self.level;
        let bit_offset = slot * self.word_size;
        Some(self.sa.get_bits_unchecked(bit_offset, self.word_size))
    }

    /// Reports the size of this structure in bytes: the inline size of the
    /// struct plus the heap size reported by the packed sample vector.
    pub(crate) fn size(&self) -> usize {
        let inline_bytes = std::mem::size_of::<Self>();
        let heap_bytes = self.sa.heap_size();
        inline_bytes + heap_bytes
    }
}

impl fmt::Debug for SuffixOrderSampledArray {
    /// Writes one token per original position, with no separators: the
    /// stored value for sampled positions and `?` for unsampled ones.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // `try_for_each` stops at the first formatting error, just like `?`.
        (0..self.len).try_for_each(|position| match self.get(position as u64) {
            Some(value) => write!(f, "{}", value),
            None => write!(f, "?"),
        })
    }
}

/// Builds a [`SuffixOrderSampledArray`] from a full suffix array.
///
/// Every `2^level`-th entry of `sa` (starting with index 0) is kept and
/// packed using `log2(sa.len()) + 1` bits per entry.
///
/// In debug builds this asserts that `sa` is non-empty and that
/// `2^level < sa.len()`.
pub(crate) fn sample(sa: &[u64], level: usize) -> SuffixOrderSampledArray {
    let n = sa.len();
    // Check the preconditions before deriving anything from `n`.
    debug_assert!(n > 0);
    debug_assert!(
        n > (1 << level),
        "sampling level L must satisfy 2^L < text_len (L = {}, text_len = {})",
        level,
        n,
    );
    let word_size = (util::log2(n as u64) + 1) as usize;
    let sa_samples_len = ((n - 1) >> level) + 1;
    // `BitVec` capacity is measured in bits, and every sample occupies
    // `word_size` bits, so reserve room for all of them up front.
    let mut sa_samples = BitVec::with_capacity(sa_samples_len * word_size);
    for i in 0..sa_samples_len {
        sa_samples.append_bits(sa[i << level], word_size);
    }
    SuffixOrderSampledArray {
        level,
        word_size,
        sa: sa_samples,
        len: n,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Samples the identity suffix array `0..n` at several levels and checks
    /// that exactly the multiples of `2^level` are retrievable.
    #[test]
    fn test_regular() {
        // (sampling level, suffix array length)
        let cases = [
            (1, 10),
            (1, 25),
            (2, 8),
            (2, 9),
            (2, 10),
            (2, 25),
            (3, 24),
            (3, 25),
        ];
        for (level, n) in cases.iter().copied() {
            let identity: Vec<u64> = (0..n).collect();
            let sampled = sample(&identity, level);
            let low_bits = (1 << level) - 1;
            for i in 0..n {
                let got = sampled.get(i);
                match i & low_bits {
                    0 => assert_eq!(got, Some(i), "ssa[{}] should be Some({})", i, i),
                    _ => assert_eq!(got, None, "ssa[{}] should be None", i),
                }
            }
        }
    }
}
pub mod sample;
102 changes: 102 additions & 0 deletions src/suffix_array/sample.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//! Sampled suffix arrays to perform locate queries.
use crate::util;
use std::fmt;

use serde::{Deserialize, Serialize};
use vers_vecs::BitVec;

/// A sampled suffix array, stored within the index.
///
/// Only the suffix array entries at positions that are multiples of
/// `2^level` are kept; each kept entry is packed into `sa` using
/// `word_size` bits.
#[derive(Serialize, Deserialize)]
pub struct SuffixOrderSampledArray {
/// Sampling level: a position is stored only if its low `level` bits are zero.
level: usize,
/// Number of bits occupied by each stored sample inside `sa`.
word_size: usize,
/// Packed samples, `word_size` bits apiece, in increasing position order.
sa: BitVec,
/// Length of the full (unsampled) suffix array this was built from.
len: usize,
}

impl SuffixOrderSampledArray {
    /// Looks up the sample for position `i` of the original suffix array.
    ///
    /// Returns `Some(value)` when `i` is a multiple of `2^level` (i.e. a
    /// sampled position) and `None` for every other position. In debug
    /// builds, `i` is asserted to be smaller than the original length.
    pub(crate) fn get(&self, i: u64) -> Option<u64> {
        debug_assert!(i < self.len as u64);
        // Guard clause: positions with any of the low `level` bits set were
        // never stored.
        if i & ((1 << self.level) - 1) != 0 {
            return None;
        }
        // Sample number `i / 2^level` starts `word_size` bits after the
        // previous one in the packed vector.
        let slot = i as usize >> self.level;
        let bit_offset = slot * self.word_size;
        Some(self.sa.get_bits_unchecked(bit_offset, self.word_size))
    }

    /// Reports the size of this structure in bytes: the inline size of the
    /// struct plus the heap size reported by the packed sample vector.
    pub(crate) fn size(&self) -> usize {
        let inline_bytes = std::mem::size_of::<Self>();
        let heap_bytes = self.sa.heap_size();
        inline_bytes + heap_bytes
    }
}

impl fmt::Debug for SuffixOrderSampledArray {
    /// Writes one token per original position, with no separators: the
    /// stored value for sampled positions and `?` for unsampled ones.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // `try_for_each` stops at the first formatting error, just like `?`.
        (0..self.len).try_for_each(|position| match self.get(position as u64) {
            Some(value) => write!(f, "{}", value),
            None => write!(f, "?"),
        })
    }
}

/// Builds a [`SuffixOrderSampledArray`] from a full suffix array.
///
/// Every `2^level`-th entry of `sa` (starting with index 0) is kept and
/// packed using `log2(sa.len()) + 1` bits per entry.
///
/// In debug builds this asserts that `sa` is non-empty and that
/// `2^level < sa.len()`.
pub(crate) fn sample(sa: &[u64], level: usize) -> SuffixOrderSampledArray {
    let n = sa.len();
    // Check the preconditions before deriving anything from `n`.
    debug_assert!(n > 0);
    debug_assert!(
        n > (1 << level),
        "sampling level L must satisfy 2^L < text_len (L = {}, text_len = {})",
        level,
        n,
    );
    let word_size = (util::log2(n as u64) + 1) as usize;
    let sa_samples_len = ((n - 1) >> level) + 1;
    // `BitVec` capacity is measured in bits, and every sample occupies
    // `word_size` bits, so reserve room for all of them up front.
    let mut sa_samples = BitVec::with_capacity(sa_samples_len * word_size);
    for i in 0..sa_samples_len {
        sa_samples.append_bits(sa[i << level], word_size);
    }
    SuffixOrderSampledArray {
        level,
        word_size,
        sa: sa_samples,
        len: n,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Samples the identity suffix array `0..n` at several levels and checks
    /// that exactly the multiples of `2^level` are retrievable.
    #[test]
    fn test_regular() {
        // (sampling level, suffix array length)
        let cases = [
            (1, 10),
            (1, 25),
            (2, 8),
            (2, 9),
            (2, 10),
            (2, 25),
            (3, 24),
            (3, 25),
        ];
        for (level, n) in cases.iter().copied() {
            let identity: Vec<u64> = (0..n).collect();
            let sampled = sample(&identity, level);
            let low_bits = (1 << level) - 1;
            for i in 0..n {
                let got = sampled.get(i);
                match i & low_bits {
                    0 => assert_eq!(got, Some(i), "ssa[{}] should be Some({})", i, i),
                    _ => assert_eq!(got, None, "ssa[{}] should be None", i),
                }
            }
        }
    }
}

0 comments on commit 735005a

Please sign in to comment.