Skip to content

Commit

Permalink
fix reading the correct prefix length
Browse files Browse the repository at this point in the history
  • Loading branch information
koeppl committed Feb 5, 2024
1 parent 0659381 commit 9622a31
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 18 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,19 @@ cargo build
cargo run --bin count_sigma -- --file ./data/tudocomp/einstein.en.txt
```


Compute the 5th Fibonacci word:
```
cargo run --bin word -- -n fibonacci -k 5
```


Datasets can be found at http://dolomit.cs.tu-dortmund.de/tudocomp/

The output format of the analytic tools is compatible with [sqlplot](https://github.com/koeppl/sqlplot).

## CAVEATS

- The BWT computation requires that the zero byte does not occur in your input. To enforce that, you can use the `escape` program to escape all zero bytes.
For instance, `escape -f 0 -t 255 -e 254 | count_r` replaces each byte 0 with byte 255, using 254 as the escape byte, and pipes the escaped output to `count_r`, which counts the number of BWT runs.


2 changes: 1 addition & 1 deletion src/bin/count_z.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ fn main() {
let result_format = format!(
"RESULT file={} length={} ",
core::get_filename(&args.infilename),
args.prefixlength
sa.len()
);

now = Instant::now();
Expand Down
10 changes: 5 additions & 5 deletions src/bin/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ struct Args {
#[arg(short, long)]
to_symbols: Vec<u8>,

/// list byte codes that are safe and equal of length to from
#[arg(short, long)]
/// if true, invert the conversion
#[arg(long)]
is_reversion: bool,
}

Expand Down Expand Up @@ -73,9 +73,9 @@ fn main() {
let mut writer = io::stream_or_stdout(core::stringopt_stropt(&args.outfilename));

//@ sanity checks
assert_eq!(args.from_symbols.len(), args.to_symbols.len());
assert!(!args.from_symbols.contains(&args.escape_symbol));
assert!(!args.to_symbols.contains(&args.escape_symbol));
assert_eq!(args.from_symbols.len(), args.to_symbols.len(), "The set of --from and --to symbols must be of equal size!");
assert!(!args.from_symbols.contains(&args.escape_symbol), "The set of --from symbols must not contain the escape symbol!");
assert!(!args.to_symbols.contains(&args.escape_symbol), "The set of --to symbols must not contain the escape symbol!");
assert!(!args
.from_symbols
.iter()
Expand Down
8 changes: 4 additions & 4 deletions src/bin/is_stringattractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,11 @@ fn is_attractor(text: &[u8], attractor: &[u64]) -> bool {
};
let suffix_edges = lcp_intervals(text, &sa, &lcp);

use succinct::BitVector;
// use succinct::BitVector;
use succinct::*;
// use succinct::Select1Support;
use succinct::bit_vec::BitVecMut;
use succinct::BinSearchSelect;
// use succinct::bit_vec::BitVecMut;
// use succinct::BinSearchSelect;

let attractor_positions = {
let mut v = BitVector::with_fill(n as u64, false);
Expand All @@ -194,7 +194,7 @@ fn is_attractor(text: &[u8], attractor: &[u64]) -> bool {
}
v
};
use succinct::Rank9;
// use succinct::Rank9;
//TODO: remove clone() calls!
let rank = Rank9::new(attractor_positions.clone());
let select = BinSearchSelect::new(rank.clone()); //@ starts with index 0
Expand Down
4 changes: 3 additions & 1 deletion src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ use more_asserts::assert_lt;
use more_asserts::debug_assert_lt;

pub fn bwt_from_text_by_sa(text: &Vec<u8>) -> Vec<u8> {
assert_gt!(!text.len(), 0);
let n = text.len();
let mut sa = vec![0; n];
assert!(!text[..text.len() - 1].iter().any(|&x| x == 0));
assert!(!text[..text.len() - 1].iter().any(|&x| x == 0), "the input text contains bytes equal to zero!");
cdivsufsort::sort_in_place(text, sa.as_mut_slice());
let mut bwt = vec![text[0]; n];
// let mut rsa = vec![0; n];
Expand Down Expand Up @@ -236,6 +237,7 @@ pub fn bit_size(i: usize) -> u8 {

pub fn compute_phi<T: AsPrimitive<usize> + num::cast::FromPrimitive>(sa: &[T]) -> Vec<T> {
let mut phi = vec![T::from_usize(0).unwrap(); sa.len()];
assert_gt!(sa.len(), 0);
for i in 1..sa.len() {
phi[sa[i].as_()] = sa[i - 1];
}
Expand Down
15 changes: 9 additions & 6 deletions src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ pub fn file2byte_vector(filename: &str, prefix_length: usize) -> Vec<u8> {
metadata.len()
};
assert!(buffer_length <= std::usize::MAX as u64);
let mut buffer = Vec::new();
buffer.reserve_exact(buffer_length as usize);

match f.read_to_end(&mut buffer) {
Ok(length) => assert_eq!(length, buffer.len()),
Err(x) => panic!("in file2byte_vector: {}", x),
};
let mut buffer = vec![0; buffer_length as usize];
f.read_exact(&mut buffer).unwrap();

// let mut buffer = Vec::new();
// buffer.reserve_exact(buffer_length as usize);
// match f.read_to_end(&mut buffer) {
// Ok(length) => assert_eq!(length, buffer.len()),
// Err(x) => panic!("in file2byte_vector: {}", x),
// };
buffer
}

Expand Down

0 comments on commit 9622a31

Please sign in to comment.