updating the documentation to have some more info on splash page

HudsonAlpha · Jul 7, 2020 · efd9697 · efd9697
1 parent 6038853
commit efd9697
Show file tree

Hide file tree

Showing 5 changed files with 76 additions and 8 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fmlrc"
-version = "0.1.0"
+version = "0.1.1"
 authors = ["holtjma <[email protected]>"]
 edition = "2018"
 license = "MIT OR Apache-2.0"

diff --git a/src/bv_bwt.rs b/src/bv_bwt.rs
@@ -14,7 +14,7 @@ const ZERO_COUNT_VEC: [u64; 256] = [0; 256];
 // sweet spot seems to be 8, providing a definitely speed boost while using ~25MB,
 // so negligible overhead compared to FMLRC as a whole
 // 9 is ~153MB, and 10 is ~922MB; 9 does offer a boost, but I'd be hesistant to use >=10 due to mem reqs
-pub const DEFAULT_CACHE_K: usize = 8; 
+const DEFAULT_CACHE_K: usize = 8; 
 
 /// Structure that contains a bit vector-based BWT+FM-index implementation
 pub struct BitVectorBWT {
@@ -30,13 +30,14 @@ pub struct BitVectorBWT {
     kmer_cache: Vec<BWTRange>
 }
 
-/// Basic struct for containing a range in a BWT
+/// Basic struct for containing a range in a BWT.
+/// Only contains fields `l` and `h`, representing a range [l, h).
 #[derive(Clone,Copy,Default,Debug,Eq,PartialEq)]
 pub struct BWTRange {
     /// the lower bound, inclusive
-    l: u64,
+    pub l: u64,
     /// the upper bound, exclusive
-    h: u64
+    pub h: u64
 }
 
 impl Default for BitVectorBWT {
@@ -121,7 +122,13 @@ impl BitVectorBWT {
     /// Initializes the BWT from the numpy file format for compressed BWTs
     /// # Arguments
     /// * `filename` - the name of the file to load into memory
-    // TODO: figure out how to write a test for this
+    /// # Examples
+    /// ```ignore
+    /// use fmlrc::bv_bwt::BitVectorBWT;
+    /// let mut bwt = BitVectorBWT::new();
+    /// let filename: String = "/path/to/my/file/comp_msbwt.npy".to_string();
+    /// bwt.load_numpy_file(&filename);
+    /// ```
     pub fn load_numpy_file(&mut self, filename: &str) -> std::io::Result<()> {
         //read the numpy header: http://docs.scipy.org/doc/numpy-1.10.1/neps/npy-format.html
         //get the initial file size

diff --git a/src/bwt_converter.rs b/src/bwt_converter.rs
@@ -6,11 +6,17 @@ use std::fs::{File, OpenOptions};
 use std::io::prelude::*;
 use std::io::{BufWriter, Read};
 
+/// The number of characters in our alphabet
 pub const VC_LEN: usize = 6;      //$ A C G N T
+/// The number of bits for storing the character in a byte
 pub const LETTER_BITS: usize = 3; //defined
+/// The number of bit for storing quantity in a byte
 pub const NUMBER_BITS: usize = 5; //8-letterBits
+/// Multiplier for multi-byte runs
 pub const NUM_POWER: usize = 32;  //2**numberBits
+/// Contains the character mask
 pub const MASK: u8 = 0x07;        //255 >> numberBits
+/// Contains the right-shifted number mask
 pub const COUNT_MASK: u8 = 0x1F;
 
 //TODO: convert_to_vec currently pulls the whole compressed BWT into memory, which I'm largely okay with

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,9 +1,48 @@
 
+/*!
+# FM-Index Long Read Corrector v2
+This library provides access to the functionality used by FMLRC2 to perform read correction using a Burrows Wheeler Transform (BWT).
+Currently, the BWT is assumed to have been generated externally (typically with a tool like ropebwt2) and stored in the same numpy format as FMLRC v1.
+FMLRC load a binary representation of the BWT into memory for performing very fast queries at the cost of memory usage.
+This particular implementation is accelerated over FMLRC v1 by using a cache to pre-compute common queries to the BWT.
+
+## Example
+```rust
+use fmlrc::bv_bwt::BitVectorBWT;
+use fmlrc::bwt_converter::convert_to_vec;
+use fmlrc::ropebwt2_util::create_bwt_from_strings;
+use fmlrc::string_util::convert_stoi;
+use std::io::Cursor;
+
+//example with in-memory BWT
+let data: Vec<&str> = vec!["ACGT", "CCGG"];
+let seq = create_bwt_from_strings(&data).unwrap();
+let cursor_seq = Cursor::new(seq);
+let vec_form = convert_to_vec(cursor_seq);
+let mut bwt = BitVectorBWT::new();
+bwt.load_vector(vec_form);
+//bwt.load_numpy_file(filename); <- if in a numpy file
+
+//do a count
+let kmer: Vec<u8> = convert_stoi(&"ACGT");
+let kmer_count = bwt.count_kmer(&kmer); //ACGT
+assert_eq!(kmer_count, 1);
+```
+*/
+
+/// Contains the bit vector implementation of the BWT
 pub mod bv_bwt;
+/// Contains the function for reformating a BWT string into the expected run-length format or numpy file
 pub mod bwt_converter;
+/// Contains bit vector with basic rank support; other crates exist with this, but they tended to be slow for some reason
 pub mod indexed_bit_vec;
+/// Contains a wrapper around the rust-bio FASTA writer, but forces an ordering on the reads
 pub mod ordered_fasta_writer;
+/// Contains the logic for performing the read correction
 pub mod read_correction;
+/// Contains wrapper functions for `ropebwt2`, most will fail if `ropebwt2` is not on the PATH
 pub mod ropebwt2_util;
+/// Contains special statistics functions, mainly an ignored median score
 pub mod stats_util;
+/// Contains inline functions for converting between strings and integer formats
 pub mod string_util;
diff --git a/src/read_correction.rs b/src/read_correction.rs
@@ -14,16 +14,25 @@ const VALID_CHARS_LEN: usize = VALID_CHARS.len();
 /// stores options for running the correction algorithms
 pub struct CorrectionParameters {
     //use_fm_index: bool, //old parameter that toggled between bit vector and classic mode
+    /// The k-mer sizes to use for correction, performs one pass per value in the array
     pub kmer_sizes: Vec<usize>,
+    /// The absolute minimum k-mer count to be considered present
     pub min_count: u64,
+    /// The maximum length of sequence to attempt to correct
     pub max_branch_attempt_length: usize,
+    /// A multiplier on `k` to limit the number of branches explored
     pub branch_limit_factor: f64,
+    /// A buffering factor to allow bridge corrections to be longer than the original sequence
     pub branch_buffer_factor: f64,
     //TODO: add a tail_truncate_factor that buts a bounding box around min length and max length
+    /// A factor limiting partial corrections when bridges cannot be found
     pub midpoint_ed_factor: f64,
+    /// A buffering factor to allow assembly corrections to be longer than the original sequence
     pub tail_buffer_factor: f64,
+    /// A multiplier factor on the median for dynamic minimum k-mer count to be considered present
     pub frac: f64,
     //fm_bit_power: u8, //only matters for classic mode which isn't implemented currently
+    /// Will calculate more stats if verbose is set to `true`
     pub verbose: bool
 }
 
@@ -37,19 +46,28 @@ pub struct Correction {
 /// a struct for storing generic read
 #[derive(Clone,Debug)]
 pub struct LongReadFA {
+    /// The index associated with the read
     pub read_index: u64,
+    /// The read label/identifier
     pub label: String,
+    /// The actual genomic sequence
     pub seq: String
 }
 
 /// a struct for storing the modified string
 #[derive(Clone,Debug)]
 pub struct CorrectionResults {
+    /// The index associated with the read
     pub read_index: u64,
+    /// The read label/identifier
     pub label: String,
+    /// The original, uncorrected sequence
     pub original_seq: String,
+    /// The modified, corrected sequence
     pub corrected_seq: String,
+    /// If verbose is set, this will store the average k-mer count before correction
     pub avg_before: f64,
+    /// If verbose is set, this will store the average k-mer count after correction
     pub avg_after: f64
 }
 
@@ -462,8 +480,6 @@ fn pick_best_levenshtein(original: &[u8], candidates: Vec<Vec<u8>>, bwt: &BitVec
 /// * `min_count` - the minimum count required for a path to be consider solid
 /// * `branch_limit` - the maximum number of branches to explore before giving up on all paths
 /// * `max_branch_len` - the maximum branch length allowed before giving up on a path
-/// # Examples
-/// TODO: See unit tests for current examples.
 pub fn bridge_kmers(
     bwt: &BitVectorBWT, seed_kmer: &[u8], target_kmer: &[u8], min_count: u64, branch_limit: usize, 
     max_branch_len: usize