Merge pull request #26 from HudsonAlpha/wfa_edit_dist

Wfa edit dist
HudsonAlpha · Apr 8, 2022 · 80dea1f · 80dea1f
2 parents 69ac321 + 9a5bd97
commit 80dea1f
Show file tree

Hide file tree

Showing 5 changed files with 208 additions and 13 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fmlrc"
-version = "0.1.6"
+version = "0.1.7"
 authors = ["holtjma <[email protected]>"]
 edition = "2018"
 license = "MIT OR Apache-2.0"

diff --git a/README.md b/README.md
@@ -71,8 +71,7 @@ Currently, only uncompressed FASTA is supported for output reads.
 2. Unlimited `k`/`K` parameters - FMLRC v1 allowed 1 or 2 sizes for `k` only; FMLRC v2 can have the option set as many times as desired at increased CPU time (for example, a 3-pass correction with `k=[21, 59, 79]`) 
 3. Call caching - FMLRC v2 pre-computes all _k_-mers of a given size. This reduces the run-time significantly by cutting reducing calls to the FM-index.
 4. Input handling - thanks to [needletail](https://crates.io/crates/needletail), the uncorrected reads can be in FASTA/FASTQ and may or may not be gzip compressed.
-5. SIMD accelerated alignment - thanks to [triple_accel](https://crates.io/crates/triple_accel), the correction alignment step can be accelerated with SIMD instructions when available
-6. Unit testing - FMLRC v2 has unit testing through the standard Rust testing framework (i.e. `cargo test`)
+5. Unit testing - FMLRC v2 has unit testing through the standard Rust testing framework (i.e. `cargo test`)
 
 ## Benchmarks
 Thus far, all benchmarks have focused on a relatively small _E. coli_ dataset for verifying correctness.
@@ -88,12 +87,12 @@ The actual corrections are _nearly_ identical (there are slight differences not
 However, FMLRC v2 runs in less than half the time from both real time and CPU time perspectives. 
 While not explicitly measured, FMLRC v2 does use ~1GB of extra memory due to the 10-mer cache (`-C 10`).
 
-| Metric | FMLRC v1.0.0 | FMLRC2 v0.1.5 (`-C 10`) | FMLRC2 v0.1.6 (`-C 10`) |
+| Metric | FMLRC v1.0.0 | FMLRC2 v0.1.6 (`-C 10`) | FMLRC2 v0.1.7 (`-C 10`) |
 | - | - | - | - |
 | Recall | 0.9830 | 0.9830 | 0.9830 |
 | Precision | 0.9821 | 0.9821 | 0.9821 |
-| Real time | 3m38.067s | 2m47.120s | **1m24.219s** |
-| CPU time | 27m23.652s | 18m54.873s | **8m49.680s** |
+| Real time | 3m38.067s | 1m24.219s | **1m14.720s** |
+| CPU time | 27m23.652s | 8m49.680s | **8m15.823s** |
 
 ## Reference
 FMLRC v2 does not currently have a pre-print or paper. If you use FMLRC v2, please cite the FMLRC v1 paper:

diff --git a/benches/bv_bwt_benchmark.rs b/benches/bv_bwt_benchmark.rs
@@ -2,6 +2,7 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use std::io::Cursor;
 
+use fmlrc::align::*;
 use fmlrc::bv_bwt::BitVectorBWT;
 use fmlrc::bwt_converter::convert_to_vec;
 use fmlrc::read_correction::bridge_kmers;
@@ -123,5 +124,32 @@ pub fn bench_stats_util(c: &mut Criterion) {
     }));
 }
 
-criterion_group!(benches, bench_string_util, bench_count_kmer, bench_fixed_counts, bench_stats_util);
+pub fn bench_align(c: &mut Criterion) {
+    let v1: Vec<u8> = vec![0, 1, 2, 4, 5, 0, 1, 2, 4, 5, 0, 1, 2, 4, 5, 0, 1, 2, 4, 5];
+    let v2: Vec<u8> = vec![0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5];
+    let v3: Vec<u8> = vec![1, 2, 3, 5, 1, 2, 3, 5, 1, 2, 3, 5, 1, 2, 3, 5];
+    let v4: Vec<u8> = vec![0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5];
+
+    c.bench_function("align_edit_distance", |b| b.iter(|| {
+        black_box(edit_distance(&v1, &v1));
+        black_box(edit_distance(&v1, &v2));
+        black_box(edit_distance(&v1, &v3));
+    }));
+
+    c.bench_function("align_edit_distance_minimize", |b| b.iter(|| {
+        black_box(edit_distance_minimize(&v1, &v4));
+    }));
+
+    c.bench_function("align_wfa_ed", |b| b.iter(|| {
+        black_box(wfa_ed(&v1, &v1));
+        black_box(wfa_ed(&v1, &v2));
+        black_box(wfa_ed(&v1, &v3));
+    }));
+
+    c.bench_function("align_wfa_minimize", |b| b.iter(|| {
+        black_box(wfa_minimize(&v1, &v4));
+    }));
+}
+
+criterion_group!(benches, bench_string_util, bench_count_kmer, bench_fixed_counts, bench_stats_util, bench_align);
 criterion_main!(benches);
diff --git a/src/align.rs b/src/align.rs
@@ -1,8 +1,8 @@
 
-use std::cmp::min;
+use std::cmp::{max,min};
 use std::mem::swap;
 
-/// Returns the edit distance between two u8 Vecs.
+/// Returns the edit distance between two u8 Vecs by doing the full grid calculation.
 /// # Arguments
 /// * `v1` - the first Vec
 /// * `v2` - the second Vec
@@ -54,7 +54,7 @@ pub struct MatchScore {
     pub match_length: usize
 }
 
-/// Returns the edit distance and index (y) between v1 and a slice of v2[0..y] such that the edit distance is minimized.
+/// Returns the edit distance and index (y) between v1 and a slice of v2[0..y] such that the edit distance is minimized by doing the full grid calculation.
 /// This used when you have a correction that is longer than the original sequence and you want to truncate the correction to the closest prefix string.
 /// # Arguments
 /// * `v1` - the first Vec, this one is always fully used in the edit distance calculation
@@ -113,6 +113,143 @@ pub fn edit_distance_minimize(v1: &[u8], v2: &[u8]) -> MatchScore {
     }
 }
 
+/// Returns the edit distance between two u8 Vecs by using a version of WFA.
+/// # Arguments
+/// * `v1` - the first Vec
+/// * `v2` - the second Vec
+/// # Examples
+/// ```rust
+/// use fmlrc::align::wfa_ed;
+/// let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
+/// let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
+/// let v3: Vec<u8> = vec![1, 2, 3, 5];
+/// assert_eq!(wfa_ed(&v1, &v1), 0);
+/// assert_eq!(wfa_ed(&v1, &v2), 1);
+/// assert_eq!(wfa_ed(&v1, &v3), 2);
+/// ```
+pub fn wfa_ed(v1: &[u8], v2: &[u8]) -> usize {
+    //we need the lengths to know where we are in the vecs
+    let l1 = v1.len();
+    let l2 = v2.len();
+
+    //stores the next indices that should be compared
+    let mut curr_wf: Vec<(usize, usize)> = vec![(0, 0)];
+    let mut next_wf: Vec<(usize, usize)> = vec![(0, 0); 3];
+    let mut edits = 0;
+
+    //main idea is to iterate until we're at the end of BOTH vecs, this is guaranteed because i and j monotonically increase
+    loop {
+        //during each iteration, we go over all wavefronts; at iteration e, there are 2*e+1 current wavefronts that will generate 2*(e+1)+1 wavefronts
+        //"e" in this context corresponds to the edit distance "edits"
+        for (wf_index, &wf) in curr_wf.iter().enumerate() {
+            let mut i = wf.0;
+            let mut j = wf.1;
+
+            //as long as the symbols match, keep moving along the diagonal
+            while i < l1 && j < l2 && v1[i] == v2[j] {
+                i += 1;
+                j += 1;
+            }
+
+            if i == l1 && j == l2 {
+                //we found the end, return the number of edits required to get here
+                return edits;
+            }
+            else if i == l1 {
+                //push the wavefront, but i cannot increase
+                next_wf[wf_index] = max(next_wf[wf_index], (i, j));
+                next_wf[wf_index+1] = max(next_wf[wf_index+1], (i, j+1));
+                next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1));
+            } else if j == l2 {
+                //push the wavefront, but j cannot increase
+                next_wf[wf_index] = max(next_wf[wf_index], (i+1, j));
+                next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j));
+                next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j));
+            } else {
+                //v1 and v2 do not match at i, j; add mismatch, insert, and del to the next wavefront
+                next_wf[wf_index] = max(next_wf[wf_index], (i+1, j)); //v2 has a deletion relative to v1
+                next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j+1)); //v2 has a mismatch relative to v1
+                next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1)); //v2 has an insertion relative to v1
+            }
+        }
+
+        //we finished this wave, increment the edit count and generate the buffer for the next wavefront
+        edits += 1;
+        curr_wf = next_wf;
+        next_wf = vec![(0, 0); 3+2*edits];
+    }
+}
+
+/// Returns the edit distance and index (y) between v1 and a slice of v2[0..y] such that the edit distance is minimized by using a version of WFA.
+/// This used when you have a correction that is longer than the original sequence and you want to truncate the correction to the closest prefix string.
+/// # Arguments
+/// * `v1` - the first Vec, this one is always fully used in the edit distance calculation
+/// * `v2` - the second Vec, this one will have a prefix used in the final calculation
+/// # Examples
+/// ```rust
+/// use fmlrc::align::{edit_distance_minimize,MatchScore};
+/// let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
+/// let v2: Vec<u8> = vec![0, 1, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5];
+/// assert_eq!(edit_distance_minimize(&v1, &v1), MatchScore {
+///     score: 0,
+///     match_length: 5
+/// });
+/// //The best match is v2[0..5] and the edit distance from v1 is 1
+/// assert_eq!(edit_distance_minimize(&v1, &v2), MatchScore {
+///     score: 1,
+///     match_length: 5
+/// });
+/// ```
+pub fn wfa_minimize(v1: &[u8], v2: &[u8]) -> MatchScore {
+    //we need the lengths to know where we are in the vecs
+    let l1 = v1.len();
+    let l2 = v2.len();
+
+    //stores the next indices that should be compared
+    let mut curr_wf: Vec<(usize, usize)> = vec![(0, 0)];
+    let mut next_wf: Vec<(usize, usize)> = vec![(0, 0); 3];
+    let mut edits = 0;
+
+    //main idea is to iterate until we're at the end of v1, this is guaranteed because i abd j are monotonically increasing
+    loop {
+        //during each iteration, we go over all wavefronts; at iteration e, there are 2*e+1 current wavefronts that will generate 2*(e+1)+1 wavefronts
+        //"e" in this context corresponds to the edit distance "edits"
+        for (wf_index, &wf) in curr_wf.iter().enumerate().rev() {
+            let mut i = wf.0;
+            let mut j = wf.1;
+
+            //as long as the symbols match, keep moving along the diagonal
+            while i < l1 && j < l2 && v1[i] == v2[j] {
+                i += 1;
+                j += 1;
+            }
+
+            if i == l1 {
+                //we reached the end of v1, and since we are going in reverse order, this _must_ be the maximum 'j' value possible with this edit dist
+                return MatchScore {
+                    score: edits,
+                    match_length: j
+                }
+            } else if j == l2 {
+                //we reached the end of v2, so j can no longer increase
+                next_wf[wf_index] = max(next_wf[wf_index], (i+1, j));
+                next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j));
+                next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j));
+            } else {
+                //v1 and v2 do not match at i, j; add mismatch, insert, and del to the next wavefront
+                next_wf[wf_index] = max(next_wf[wf_index], (i+1, j)); //v2 has a deletion relative to v1
+                next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j+1)); //v2 has a mismatch relative to v1
+                next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1)); //v2 has an insertion relative to v1
+            }
+        }
+
+        //we finished this wave, increment the edit count and generate the buffer for the next wavefront
+        edits += 1;
+        swap(&mut curr_wf, &mut next_wf);
+        next_wf = vec![(0, 0); 3+2*edits];
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -147,4 +284,35 @@ mod tests {
             match_length: 5
         });
     }
+
+    #[test]
+    fn test_wfa_ed() {
+        let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
+        let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
+        let v3: Vec<u8> = vec![1, 2, 3, 5];
+
+        assert_eq!(wfa_ed(&v1, &v1), 0);
+        assert_eq!(wfa_ed(&v1, &v2), 1);
+        assert_eq!(wfa_ed(&v1, &v3), 2);
+
+        assert_eq!(wfa_ed(&v2, &v2), 0);
+        assert_eq!(wfa_ed(&v2, &v3), 3);
+
+        assert_eq!(wfa_ed(&v3, &v3), 0);
+    }
+
+    #[test]
+    fn test_wfa_minimize() {
+        let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
+        let v2: Vec<u8> = vec![0, 1, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5];
+
+        assert_eq!(wfa_minimize(&v1, &v1), MatchScore {
+            score: 0,
+            match_length: 5
+        });
+        assert_eq!(wfa_minimize(&v1, &v2), MatchScore {
+            score: 1,
+            match_length: 5
+        });
+    }
 }
diff --git a/src/read_correction.rs b/src/read_correction.rs
@@ -4,7 +4,7 @@ extern crate log;
 use std::cmp::min;
 use std::sync::Arc;
 
-use crate::align::{edit_distance,edit_distance_minimize,MatchScore};
+use crate::align::{wfa_ed,wfa_minimize,MatchScore};
 use crate::bv_bwt::BitVectorBWT;
 use crate::stats_util;
 use crate::string_util;
@@ -337,7 +337,7 @@ fn pick_best_levenshtein_search(original: &[u8], candidates: Vec<Vec<u8>>, bwt:
 
     //first calculate the best match for each candidate, and mark the minimum best score we find as we go
     for candidate in candidates.iter() {
-        let best_match: MatchScore = edit_distance_minimize(original, candidate);
+        let best_match: MatchScore = wfa_minimize(original, candidate);
         min_score = min(best_match.score, min_score);
         ed_scores.push(best_match);
     }
@@ -402,7 +402,7 @@ fn pick_best_levenshtein(original: &[u8], candidates: Vec<Vec<u8>>, bwt: &BitVec
     else {
         //we have multiple values, so check for edit distance
         let ed_scores: Vec<usize> = candidates.iter()
-            .map(|candidate| edit_distance(original, candidate))
+            .map(|candidate| wfa_ed(original, candidate))
             .collect::<Vec<usize>>();
         let min_score: usize = *ed_scores.iter().min().unwrap();