Skip to content

Commit

Permalink
updating the documentation to have some more info on splash page
Browse files Browse the repository at this point in the history
  • Loading branch information
holtjma committed Jul 7, 2020
1 parent 6038853 commit efd9697
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fmlrc"
version = "0.1.0"
version = "0.1.1"
authors = ["holtjma <[email protected]>"]
edition = "2018"
license = "MIT OR Apache-2.0"
Expand Down
17 changes: 12 additions & 5 deletions src/bv_bwt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const ZERO_COUNT_VEC: [u64; 256] = [0; 256];
// sweet spot seems to be 8, providing a definitely speed boost while using ~25MB,
// so negligible overhead compared to FMLRC as a whole
// 9 is ~153MB, and 10 is ~922MB; 9 does offer a boost, but I'd be hesistant to use >=10 due to mem reqs
pub const DEFAULT_CACHE_K: usize = 8;
const DEFAULT_CACHE_K: usize = 8;

/// Structure that contains a bit vector-based BWT+FM-index implementation
pub struct BitVectorBWT {
Expand All @@ -30,13 +30,14 @@ pub struct BitVectorBWT {
kmer_cache: Vec<BWTRange>
}

/// Basic struct for containing a range in a BWT
/// Basic struct for containing a range in a BWT.
/// Only contains fields `l` and `h`, representing a range [l, h).
#[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
pub struct BWTRange {
/// the lower bound, inclusive
l: u64,
pub l: u64,
/// the upper bound, exclusive
h: u64
pub h: u64
}

impl Default for BitVectorBWT {
Expand Down Expand Up @@ -121,7 +122,13 @@ impl BitVectorBWT {
/// Initializes the BWT from the numpy file format for compressed BWTs
/// # Arguments
/// * `filename` - the name of the file to load into memory
// TODO: figure out how to write a test for this
/// # Examples
/// ```ignore
/// use fmlrc::bv_bwt::BitVectorBWT;
/// let mut bwt = BitVectorBWT::new();
/// let filename: String = "/path/to/my/file/comp_msbwt.npy".to_string();
/// bwt.load_numpy_file(&filename);
/// ```
pub fn load_numpy_file(&mut self, filename: &str) -> std::io::Result<()> {
//read the numpy header: http://docs.scipy.org/doc/numpy-1.10.1/neps/npy-format.html
//get the initial file size
Expand Down
6 changes: 6 additions & 0 deletions src/bwt_converter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,17 @@ use std::fs::{File, OpenOptions};
use std::io::prelude::*;
use std::io::{BufWriter, Read};

/// The number of characters in our alphabet
pub const VC_LEN: usize = 6; //$ A C G N T
/// The number of bits for storing the character in a byte
pub const LETTER_BITS: usize = 3; //defined
/// The number of bit for storing quantity in a byte
pub const NUMBER_BITS: usize = 5; //8-letterBits
/// Multiplier for multi-byte runs
pub const NUM_POWER: usize = 32; //2**numberBits
/// Contains the character mask
pub const MASK: u8 = 0x07; //255 >> numberBits
/// Contains the right-shifted number mask
pub const COUNT_MASK: u8 = 0x1F;

//TODO: convert_to_vec currently pulls the whole compressed BWT into memory, which I'm largely okay with
Expand Down
39 changes: 39 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,48 @@

/*!
# FM-Index Long Read Corrector v2
This library provides access to the functionality used by FMLRC2 to perform read correction using a Burrows Wheeler Transform (BWT).
Currently, the BWT is assumed to have been generated externally (typically with a tool like ropebwt2) and stored in the same numpy format as FMLRC v1.
FMLRC load a binary representation of the BWT into memory for performing very fast queries at the cost of memory usage.
This particular implementation is accelerated over FMLRC v1 by using a cache to pre-compute common queries to the BWT.
## Example
```rust
use fmlrc::bv_bwt::BitVectorBWT;
use fmlrc::bwt_converter::convert_to_vec;
use fmlrc::ropebwt2_util::create_bwt_from_strings;
use fmlrc::string_util::convert_stoi;
use std::io::Cursor;
//example with in-memory BWT
let data: Vec<&str> = vec!["ACGT", "CCGG"];
let seq = create_bwt_from_strings(&data).unwrap();
let cursor_seq = Cursor::new(seq);
let vec_form = convert_to_vec(cursor_seq);
let mut bwt = BitVectorBWT::new();
bwt.load_vector(vec_form);
//bwt.load_numpy_file(filename); <- if in a numpy file
//do a count
let kmer: Vec<u8> = convert_stoi(&"ACGT");
let kmer_count = bwt.count_kmer(&kmer); //ACGT
assert_eq!(kmer_count, 1);
```
*/

/// Contains the bit vector implementation of the BWT
pub mod bv_bwt;
/// Contains the function for reformating a BWT string into the expected run-length format or numpy file
pub mod bwt_converter;
/// Contains bit vector with basic rank support; other crates exist with this, but they tended to be slow for some reason
pub mod indexed_bit_vec;
/// Contains a wrapper around the rust-bio FASTA writer, but forces an ordering on the reads
pub mod ordered_fasta_writer;
/// Contains the logic for performing the read correction
pub mod read_correction;
/// Contains wrapper functions for `ropebwt2`, most will fail if `ropebwt2` is not on the PATH
pub mod ropebwt2_util;
/// Contains special statistics functions, mainly an ignored median score
pub mod stats_util;
/// Contains inline functions for converting between strings and integer formats
pub mod string_util;
20 changes: 18 additions & 2 deletions src/read_correction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ const VALID_CHARS_LEN: usize = VALID_CHARS.len();
/// stores options for running the correction algorithms
pub struct CorrectionParameters {
//use_fm_index: bool, //old parameter that toggled between bit vector and classic mode
/// The k-mer sizes to use for correction, performs one pass per value in the array
pub kmer_sizes: Vec<usize>,
/// The absolute minimum k-mer count to be considered present
pub min_count: u64,
/// The maximum length of sequence to attempt to correct
pub max_branch_attempt_length: usize,
/// A multiplier on `k` to limit the number of branches explored
pub branch_limit_factor: f64,
/// A buffering factor to allow bridge corrections to be longer than the original sequence
pub branch_buffer_factor: f64,
//TODO: add a tail_truncate_factor that buts a bounding box around min length and max length
/// A factor limiting partial corrections when bridges cannot be found
pub midpoint_ed_factor: f64,
/// A buffering factor to allow assembly corrections to be longer than the original sequence
pub tail_buffer_factor: f64,
/// A multiplier factor on the median for dynamic minimum k-mer count to be considered present
pub frac: f64,
//fm_bit_power: u8, //only matters for classic mode which isn't implemented currently
/// Will calculate more stats if verbose is set to `true`
pub verbose: bool
}

Expand All @@ -37,19 +46,28 @@ pub struct Correction {
/// a struct for storing generic read
#[derive(Clone,Debug)]
pub struct LongReadFA {
/// The index associated with the read
pub read_index: u64,
/// The read label/identifier
pub label: String,
/// The actual genomic sequence
pub seq: String
}

/// a struct for storing the modified string
#[derive(Clone,Debug)]
pub struct CorrectionResults {
/// The index associated with the read
pub read_index: u64,
/// The read label/identifier
pub label: String,
/// The original, uncorrected sequence
pub original_seq: String,
/// The modified, corrected sequence
pub corrected_seq: String,
/// If verbose is set, this will store the average k-mer count before correction
pub avg_before: f64,
/// If verbose is set, this will store the average k-mer count after correction
pub avg_after: f64
}

Expand Down Expand Up @@ -462,8 +480,6 @@ fn pick_best_levenshtein(original: &[u8], candidates: Vec<Vec<u8>>, bwt: &BitVec
/// * `min_count` - the minimum count required for a path to be consider solid
/// * `branch_limit` - the maximum number of branches to explore before giving up on all paths
/// * `max_branch_len` - the maximum branch length allowed before giving up on a path
/// # Examples
/// TODO: See unit tests for current examples.
pub fn bridge_kmers(
bwt: &BitVectorBWT, seed_kmer: &[u8], target_kmer: &[u8], min_count: u64, branch_limit: usize,
max_branch_len: usize
Expand Down

0 comments on commit efd9697

Please sign in to comment.