Skip to content

Commit

Permalink
Merge pull request #26 from HudsonAlpha/wfa_edit_dist
Browse files Browse the repository at this point in the history
Wfa edit dist
  • Loading branch information
holtjma authored Apr 8, 2022
2 parents 69ac321 + 9a5bd97 commit 80dea1f
Show file tree
Hide file tree
Showing 5 changed files with 208 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fmlrc"
version = "0.1.6"
version = "0.1.7"
authors = ["holtjma <[email protected]>"]
edition = "2018"
license = "MIT OR Apache-2.0"
Expand Down
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,7 @@ Currently, only uncompressed FASTA is supported for output reads.
2. Unlimited `k`/`K` parameters - FMLRC v1 allowed 1 or 2 sizes for `k` only; FMLRC v2 can have the option set as many times as desired at increased CPU time (for example, a 3-pass correction with `k=[21, 59, 79]`)
3. Call caching - FMLRC v2 pre-computes all _k_-mers of a given size. This reduces the run-time significantly by cutting reducing calls to the FM-index.
4. Input handling - thanks to [needletail](https://crates.io/crates/needletail), the uncorrected reads can be in FASTA/FASTQ and may or may not be gzip compressed.
5. SIMD accelerated alignment - thanks to [triple_accel](https://crates.io/crates/triple_accel), the correction alignment step can be accelerated with SIMD instructions when available
6. Unit testing - FMLRC v2 has unit testing through the standard Rust testing framework (i.e. `cargo test`)
5. Unit testing - FMLRC v2 has unit testing through the standard Rust testing framework (i.e. `cargo test`)

## Benchmarks
Thus far, all benchmarks have focused on a relatively small _E. coli_ dataset for verifying correctness.
Expand All @@ -88,12 +87,12 @@ The actual corrections are _nearly_ identical (there are slight differences not
However, FMLRC v2 runs in less than half the time from both real time and CPU time perspectives.
While not explicitly measured, FMLRC v2 does use ~1GB of extra memory due to the 10-mer cache (`-C 10`).

| Metric | FMLRC v1.0.0 | FMLRC2 v0.1.5 (`-C 10`) | FMLRC2 v0.1.6 (`-C 10`) |
| Metric | FMLRC v1.0.0 | FMLRC2 v0.1.6 (`-C 10`) | FMLRC2 v0.1.7 (`-C 10`) |
| - | - | - | - |
| Recall | 0.9830 | 0.9830 | 0.9830 |
| Precision | 0.9821 | 0.9821 | 0.9821 |
| Real time | 3m38.067s | 2m47.120s | **1m24.219s** |
| CPU time | 27m23.652s | 18m54.873s | **8m49.680s** |
| Real time | 3m38.067s | 1m24.219s | **1m14.720s** |
| CPU time | 27m23.652s | 8m49.680s | **8m15.823s** |

## Reference
FMLRC v2 does not currently have a pre-print or paper. If you use FMLRC v2, please cite the FMLRC v1 paper:
Expand Down
30 changes: 29 additions & 1 deletion benches/bv_bwt_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use std::io::Cursor;

use fmlrc::align::*;
use fmlrc::bv_bwt::BitVectorBWT;
use fmlrc::bwt_converter::convert_to_vec;
use fmlrc::read_correction::bridge_kmers;
Expand Down Expand Up @@ -123,5 +124,32 @@ pub fn bench_stats_util(c: &mut Criterion) {
}));
}

criterion_group!(benches, bench_string_util, bench_count_kmer, bench_fixed_counts, bench_stats_util);
pub fn bench_align(c: &mut Criterion) {
let v1: Vec<u8> = vec![0, 1, 2, 4, 5, 0, 1, 2, 4, 5, 0, 1, 2, 4, 5, 0, 1, 2, 4, 5];
let v2: Vec<u8> = vec![0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5];
let v3: Vec<u8> = vec![1, 2, 3, 5, 1, 2, 3, 5, 1, 2, 3, 5, 1, 2, 3, 5];
let v4: Vec<u8> = vec![0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 0, 1, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5];

c.bench_function("align_edit_distance", |b| b.iter(|| {
black_box(edit_distance(&v1, &v1));
black_box(edit_distance(&v1, &v2));
black_box(edit_distance(&v1, &v3));
}));

c.bench_function("align_edit_distance_minimize", |b| b.iter(|| {
black_box(edit_distance_minimize(&v1, &v4));
}));

c.bench_function("align_wfa_ed", |b| b.iter(|| {
black_box(wfa_ed(&v1, &v1));
black_box(wfa_ed(&v1, &v2));
black_box(wfa_ed(&v1, &v3));
}));

c.bench_function("align_wfa_minimize", |b| b.iter(|| {
black_box(wfa_minimize(&v1, &v4));
}));
}

criterion_group!(benches, bench_string_util, bench_count_kmer, bench_fixed_counts, bench_stats_util, bench_align);
criterion_main!(benches);
174 changes: 171 additions & 3 deletions src/align.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

use std::cmp::min;
use std::cmp::{max,min};
use std::mem::swap;

/// Returns the edit distance between two u8 Vecs.
/// Returns the edit distance between two u8 Vecs by doing the full grid calculation.
/// # Arguments
/// * `v1` - the first Vec
/// * `v2` - the second Vec
Expand Down Expand Up @@ -54,7 +54,7 @@ pub struct MatchScore {
pub match_length: usize
}

/// Returns the edit distance and index (y) between v1 and a slice of v2[0..y] such that the edit distance is minimized.
/// Returns the edit distance and index (y) between v1 and a slice of v2[0..y] such that the edit distance is minimized by doing the full grid calculation.
/// This used when you have a correction that is longer than the original sequence and you want to truncate the correction to the closest prefix string.
/// # Arguments
/// * `v1` - the first Vec, this one is always fully used in the edit distance calculation
Expand Down Expand Up @@ -113,6 +113,143 @@ pub fn edit_distance_minimize(v1: &[u8], v2: &[u8]) -> MatchScore {
}
}

/// Returns the edit distance between two u8 Vecs by using a version of WFA.
/// # Arguments
/// * `v1` - the first Vec
/// * `v2` - the second Vec
/// # Examples
/// ```rust
/// use fmlrc::align::wfa_ed;
/// let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
/// let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
/// let v3: Vec<u8> = vec![1, 2, 3, 5];
/// assert_eq!(wfa_ed(&v1, &v1), 0);
/// assert_eq!(wfa_ed(&v1, &v2), 1);
/// assert_eq!(wfa_ed(&v1, &v3), 2);
/// ```
pub fn wfa_ed(v1: &[u8], v2: &[u8]) -> usize {
//we need the lengths to know where we are in the vecs
let l1 = v1.len();
let l2 = v2.len();

//stores the next indices that should be compared
let mut curr_wf: Vec<(usize, usize)> = vec![(0, 0)];
let mut next_wf: Vec<(usize, usize)> = vec![(0, 0); 3];
let mut edits = 0;

//main idea is to iterate until we're at the end of BOTH vecs, this is guaranteed because i and j monotonically increase
loop {
//during each iteration, we go over all wavefronts; at iteration e, there are 2*e+1 current wavefronts that will generate 2*(e+1)+1 wavefronts
//"e" in this context corresponds to the edit distance "edits"
for (wf_index, &wf) in curr_wf.iter().enumerate() {
let mut i = wf.0;
let mut j = wf.1;

//as long as the symbols match, keep moving along the diagonal
while i < l1 && j < l2 && v1[i] == v2[j] {
i += 1;
j += 1;
}

if i == l1 && j == l2 {
//we found the end, return the number of edits required to get here
return edits;
}
else if i == l1 {
//push the wavefront, but i cannot increase
next_wf[wf_index] = max(next_wf[wf_index], (i, j));
next_wf[wf_index+1] = max(next_wf[wf_index+1], (i, j+1));
next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1));
} else if j == l2 {
//push the wavefront, but j cannot increase
next_wf[wf_index] = max(next_wf[wf_index], (i+1, j));
next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j));
next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j));
} else {
//v1 and v2 do not match at i, j; add mismatch, insert, and del to the next wavefront
next_wf[wf_index] = max(next_wf[wf_index], (i+1, j)); //v2 has a deletion relative to v1
next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j+1)); //v2 has a mismatch relative to v1
next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1)); //v2 has an insertion relative to v1
}
}

//we finished this wave, increment the edit count and generate the buffer for the next wavefront
edits += 1;
curr_wf = next_wf;
next_wf = vec![(0, 0); 3+2*edits];
}
}

/// Returns the edit distance and index (y) between v1 and a slice of v2[0..y] such that the edit distance is minimized by using a version of WFA.
/// This used when you have a correction that is longer than the original sequence and you want to truncate the correction to the closest prefix string.
/// # Arguments
/// * `v1` - the first Vec, this one is always fully used in the edit distance calculation
/// * `v2` - the second Vec, this one will have a prefix used in the final calculation
/// # Examples
/// ```rust
/// use fmlrc::align::{edit_distance_minimize,MatchScore};
/// let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
/// let v2: Vec<u8> = vec![0, 1, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5];
/// assert_eq!(edit_distance_minimize(&v1, &v1), MatchScore {
/// score: 0,
/// match_length: 5
/// });
/// //The best match is v2[0..5] and the edit distance from v1 is 1
/// assert_eq!(edit_distance_minimize(&v1, &v2), MatchScore {
/// score: 1,
/// match_length: 5
/// });
/// ```
pub fn wfa_minimize(v1: &[u8], v2: &[u8]) -> MatchScore {
//we need the lengths to know where we are in the vecs
let l1 = v1.len();
let l2 = v2.len();

//stores the next indices that should be compared
let mut curr_wf: Vec<(usize, usize)> = vec![(0, 0)];
let mut next_wf: Vec<(usize, usize)> = vec![(0, 0); 3];
let mut edits = 0;

//main idea is to iterate until we're at the end of v1, this is guaranteed because i abd j are monotonically increasing
loop {
//during each iteration, we go over all wavefronts; at iteration e, there are 2*e+1 current wavefronts that will generate 2*(e+1)+1 wavefronts
//"e" in this context corresponds to the edit distance "edits"
for (wf_index, &wf) in curr_wf.iter().enumerate().rev() {
let mut i = wf.0;
let mut j = wf.1;

//as long as the symbols match, keep moving along the diagonal
while i < l1 && j < l2 && v1[i] == v2[j] {
i += 1;
j += 1;
}

if i == l1 {
//we reached the end of v1, and since we are going in reverse order, this _must_ be the maximum 'j' value possible with this edit dist
return MatchScore {
score: edits,
match_length: j
}
} else if j == l2 {
//we reached the end of v2, so j can no longer increase
next_wf[wf_index] = max(next_wf[wf_index], (i+1, j));
next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j));
next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j));
} else {
//v1 and v2 do not match at i, j; add mismatch, insert, and del to the next wavefront
next_wf[wf_index] = max(next_wf[wf_index], (i+1, j)); //v2 has a deletion relative to v1
next_wf[wf_index+1] = max(next_wf[wf_index+1], (i+1, j+1)); //v2 has a mismatch relative to v1
next_wf[wf_index+2] = max(next_wf[wf_index+2], (i, j+1)); //v2 has an insertion relative to v1
}
}

//we finished this wave, increment the edit count and generate the buffer for the next wavefront
edits += 1;
swap(&mut curr_wf, &mut next_wf);
next_wf = vec![(0, 0); 3+2*edits];
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -147,4 +284,35 @@ mod tests {
match_length: 5
});
}

#[test]
fn test_wfa_ed() {
let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
let v2: Vec<u8> = vec![0, 1, 3, 4, 5];
let v3: Vec<u8> = vec![1, 2, 3, 5];

assert_eq!(wfa_ed(&v1, &v1), 0);
assert_eq!(wfa_ed(&v1, &v2), 1);
assert_eq!(wfa_ed(&v1, &v3), 2);

assert_eq!(wfa_ed(&v2, &v2), 0);
assert_eq!(wfa_ed(&v2, &v3), 3);

assert_eq!(wfa_ed(&v3, &v3), 0);
}

#[test]
fn test_wfa_minimize() {
let v1: Vec<u8> = vec![0, 1, 2, 4, 5];
let v2: Vec<u8> = vec![0, 1, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5];

assert_eq!(wfa_minimize(&v1, &v1), MatchScore {
score: 0,
match_length: 5
});
assert_eq!(wfa_minimize(&v1, &v2), MatchScore {
score: 1,
match_length: 5
});
}
}
6 changes: 3 additions & 3 deletions src/read_correction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ extern crate log;
use std::cmp::min;
use std::sync::Arc;

use crate::align::{edit_distance,edit_distance_minimize,MatchScore};
use crate::align::{wfa_ed,wfa_minimize,MatchScore};
use crate::bv_bwt::BitVectorBWT;
use crate::stats_util;
use crate::string_util;
Expand Down Expand Up @@ -337,7 +337,7 @@ fn pick_best_levenshtein_search(original: &[u8], candidates: Vec<Vec<u8>>, bwt:

//first calculate the best match for each candidate, and mark the minimum best score we find as we go
for candidate in candidates.iter() {
let best_match: MatchScore = edit_distance_minimize(original, candidate);
let best_match: MatchScore = wfa_minimize(original, candidate);
min_score = min(best_match.score, min_score);
ed_scores.push(best_match);
}
Expand Down Expand Up @@ -402,7 +402,7 @@ fn pick_best_levenshtein(original: &[u8], candidates: Vec<Vec<u8>>, bwt: &BitVec
else {
//we have multiple values, so check for edit distance
let ed_scores: Vec<usize> = candidates.iter()
.map(|candidate| edit_distance(original, candidate))
.map(|candidate| wfa_ed(original, candidate))
.collect::<Vec<usize>>();
let min_score: usize = *ed_scores.iter().min().unwrap();

Expand Down

0 comments on commit 80dea1f

Please sign in to comment.