diff --git a/Cargo.toml b/Cargo.toml index 13cf560..aded766 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fmlrc" -version = "0.1.0" +version = "0.1.1" authors = ["holtjma "] edition = "2018" license = "MIT OR Apache-2.0" diff --git a/src/bv_bwt.rs b/src/bv_bwt.rs index c43d9cf..5f2b655 100644 --- a/src/bv_bwt.rs +++ b/src/bv_bwt.rs @@ -14,7 +14,7 @@ const ZERO_COUNT_VEC: [u64; 256] = [0; 256]; // sweet spot seems to be 8, providing a definitely speed boost while using ~25MB, // so negligible overhead compared to FMLRC as a whole // 9 is ~153MB, and 10 is ~922MB; 9 does offer a boost, but I'd be hesistant to use >=10 due to mem reqs -pub const DEFAULT_CACHE_K: usize = 8; +const DEFAULT_CACHE_K: usize = 8; /// Structure that contains a bit vector-based BWT+FM-index implementation pub struct BitVectorBWT { @@ -30,13 +30,14 @@ pub struct BitVectorBWT { kmer_cache: Vec } -/// Basic struct for containing a range in a BWT +/// Basic struct for containing a range in a BWT. +/// Only contains fields `l` and `h`, representing a range [l, h). #[derive(Clone,Copy,Default,Debug,Eq,PartialEq)] pub struct BWTRange { /// the lower bound, inclusive - l: u64, + pub l: u64, /// the upper bound, exclusive - h: u64 + pub h: u64 } impl Default for BitVectorBWT { @@ -121,7 +122,13 @@ impl BitVectorBWT { /// Initializes the BWT from the numpy file format for compressed BWTs /// # Arguments /// * `filename` - the name of the file to load into memory - // TODO: figure out how to write a test for this + /// # Examples + /// ```ignore + /// use fmlrc::bv_bwt::BitVectorBWT; + /// let mut bwt = BitVectorBWT::new(); + /// let filename: String = "/path/to/my/file/comp_msbwt.npy".to_string(); + /// bwt.load_numpy_file(&filename); + /// ``` pub fn load_numpy_file(&mut self, filename: &str) -> std::io::Result<()> { //read the numpy header: http://docs.scipy.org/doc/numpy-1.10.1/neps/npy-format.html //get the initial file size diff --git a/src/bwt_converter.rs b/src/bwt_converter.rs index 8b72b3c..fc94c02 100644 --- a/src/bwt_converter.rs +++ b/src/bwt_converter.rs @@ -6,11 +6,17 @@ use std::fs::{File, OpenOptions}; use std::io::prelude::*; use std::io::{BufWriter, Read}; +/// The number of characters in our alphabet pub const VC_LEN: usize = 6; //$ A C G N T +/// The number of bits for storing the character in a byte pub const LETTER_BITS: usize = 3; //defined +/// The number of bit for storing quantity in a byte pub const NUMBER_BITS: usize = 5; //8-letterBits +/// Multiplier for multi-byte runs pub const NUM_POWER: usize = 32; //2**numberBits +/// Contains the character mask pub const MASK: u8 = 0x07; //255 >> numberBits +/// Contains the right-shifted number mask pub const COUNT_MASK: u8 = 0x1F; //TODO: convert_to_vec currently pulls the whole compressed BWT into memory, which I'm largely okay with diff --git a/src/lib.rs b/src/lib.rs index 71c7629..61b9050 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,48 @@ +/*! +# FM-Index Long Read Corrector v2 +This library provides access to the functionality used by FMLRC2 to perform read correction using a Burrows Wheeler Transform (BWT). +Currently, the BWT is assumed to have been generated externally (typically with a tool like ropebwt2) and stored in the same numpy format as FMLRC v1. +FMLRC load a binary representation of the BWT into memory for performing very fast queries at the cost of memory usage. +This particular implementation is accelerated over FMLRC v1 by using a cache to pre-compute common queries to the BWT. + +## Example +```rust +use fmlrc::bv_bwt::BitVectorBWT; +use fmlrc::bwt_converter::convert_to_vec; +use fmlrc::ropebwt2_util::create_bwt_from_strings; +use fmlrc::string_util::convert_stoi; +use std::io::Cursor; + +//example with in-memory BWT +let data: Vec<&str> = vec!["ACGT", "CCGG"]; +let seq = create_bwt_from_strings(&data).unwrap(); +let cursor_seq = Cursor::new(seq); +let vec_form = convert_to_vec(cursor_seq); +let mut bwt = BitVectorBWT::new(); +bwt.load_vector(vec_form); +//bwt.load_numpy_file(filename); <- if in a numpy file + +//do a count +let kmer: Vec = convert_stoi(&"ACGT"); +let kmer_count = bwt.count_kmer(&kmer); //ACGT +assert_eq!(kmer_count, 1); +``` +*/ + +/// Contains the bit vector implementation of the BWT pub mod bv_bwt; +/// Contains the function for reformating a BWT string into the expected run-length format or numpy file pub mod bwt_converter; +/// Contains bit vector with basic rank support; other crates exist with this, but they tended to be slow for some reason pub mod indexed_bit_vec; +/// Contains a wrapper around the rust-bio FASTA writer, but forces an ordering on the reads pub mod ordered_fasta_writer; +/// Contains the logic for performing the read correction pub mod read_correction; +/// Contains wrapper functions for `ropebwt2`, most will fail if `ropebwt2` is not on the PATH pub mod ropebwt2_util; +/// Contains special statistics functions, mainly an ignored median score pub mod stats_util; +/// Contains inline functions for converting between strings and integer formats pub mod string_util; diff --git a/src/read_correction.rs b/src/read_correction.rs index 2861e39..27f466b 100644 --- a/src/read_correction.rs +++ b/src/read_correction.rs @@ -14,16 +14,25 @@ const VALID_CHARS_LEN: usize = VALID_CHARS.len(); /// stores options for running the correction algorithms pub struct CorrectionParameters { //use_fm_index: bool, //old parameter that toggled between bit vector and classic mode + /// The k-mer sizes to use for correction, performs one pass per value in the array pub kmer_sizes: Vec, + /// The absolute minimum k-mer count to be considered present pub min_count: u64, + /// The maximum length of sequence to attempt to correct pub max_branch_attempt_length: usize, + /// A multiplier on `k` to limit the number of branches explored pub branch_limit_factor: f64, + /// A buffering factor to allow bridge corrections to be longer than the original sequence pub branch_buffer_factor: f64, //TODO: add a tail_truncate_factor that buts a bounding box around min length and max length + /// A factor limiting partial corrections when bridges cannot be found pub midpoint_ed_factor: f64, + /// A buffering factor to allow assembly corrections to be longer than the original sequence pub tail_buffer_factor: f64, + /// A multiplier factor on the median for dynamic minimum k-mer count to be considered present pub frac: f64, //fm_bit_power: u8, //only matters for classic mode which isn't implemented currently + /// Will calculate more stats if verbose is set to `true` pub verbose: bool } @@ -37,19 +46,28 @@ pub struct Correction { /// a struct for storing generic read #[derive(Clone,Debug)] pub struct LongReadFA { + /// The index associated with the read pub read_index: u64, + /// The read label/identifier pub label: String, + /// The actual genomic sequence pub seq: String } /// a struct for storing the modified string #[derive(Clone,Debug)] pub struct CorrectionResults { + /// The index associated with the read pub read_index: u64, + /// The read label/identifier pub label: String, + /// The original, uncorrected sequence pub original_seq: String, + /// The modified, corrected sequence pub corrected_seq: String, + /// If verbose is set, this will store the average k-mer count before correction pub avg_before: f64, + /// If verbose is set, this will store the average k-mer count after correction pub avg_after: f64 } @@ -462,8 +480,6 @@ fn pick_best_levenshtein(original: &[u8], candidates: Vec>, bwt: &BitVec /// * `min_count` - the minimum count required for a path to be consider solid /// * `branch_limit` - the maximum number of branches to explore before giving up on all paths /// * `max_branch_len` - the maximum branch length allowed before giving up on a path -/// # Examples -/// TODO: See unit tests for current examples. pub fn bridge_kmers( bwt: &BitVectorBWT, seed_kmer: &[u8], target_kmer: &[u8], min_count: u64, branch_limit: usize, max_branch_len: usize