Skip to content

Commit

Permalink
compute-optimized MinHash (for small scaled or large cardinalities) (#1045)
Browse files Browse the repository at this point in the history

* large minhash, implemented as a btree
* use large minhash for compute by default
* fix lints and msrv (older backtrace crate)
* many property tests, oracle-based to compare both implementations
* bump rust version
  • Loading branch information
luizirber authored Jun 26, 2020
1 parent a7c07eb commit bc2b168
Show file tree
Hide file tree
Showing 11 changed files with 1,485 additions and 47 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ name: Rust checks
on:
push:
branches: [master]
paths:
- 'src/core/**'
pull_request:
paths:
- 'src/core/**'
Expand Down Expand Up @@ -150,7 +148,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: clippy
args: -- -D warnings
args: --all -- -D warnings

wasm-pack:
name: Check if wasm-pack builds a valid package for the sourmash crate
Expand Down
3 changes: 2 additions & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.7.0"
version = "0.8.0"
authors = ["Luiz Irber <[email protected]>"]
description = "MinHash sketches for genomic data"
repository = "https://github.com/dib-lab/sourmash"
Expand All @@ -25,6 +25,7 @@ parallel = ["rayon"]
#cbindgen = "~0.14.2"

[dependencies]
backtrace = "=0.3.46" # later versions require rust 1.40
byteorder = "1.3.4"
cfg-if = "0.1.10"
failure = "0.1.8"
Expand Down
26 changes: 13 additions & 13 deletions src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use failure::Error;

use crate::index::MHBT;
use crate::signature::Signature;
use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHash};
use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHashBTree};
use crate::sketch::Sketch;

pub fn prepare(index_path: &str) -> Result<(), Error> {
Expand Down Expand Up @@ -101,15 +101,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
let mut ksigs = vec![];

if params.protein {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_protein)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand All @@ -118,15 +118,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
}

if params.dayhoff {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_dayhoff)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand All @@ -135,15 +135,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
}

if params.hp {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_hp)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand All @@ -152,15 +152,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
}

if params.dna {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_DNA)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand Down
5 changes: 1 addition & 4 deletions src/core/src/index/sbt/mhbt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ mod test {
use std::path::PathBuf;

use assert_matches::assert_matches;
use tempfile;

use super::Factory;

Expand Down Expand Up @@ -206,9 +205,7 @@ mod test {
None,
)
.unwrap();
let sig_data = sigs[0].clone();

let leaf = sig_data.into();
let leaf = sigs[0].clone();

let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap();
assert_eq!(results.len(), 1);
Expand Down
26 changes: 26 additions & 0 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,23 @@ impl SigsTrait for Sketch {
match *self {
Sketch::UKHS(ref ukhs) => ukhs.size(),
Sketch::MinHash(ref mh) => mh.size(),
Sketch::LargeMinHash(ref mh) => mh.size(),
}
}

fn to_vec(&self) -> Vec<u64> {
match *self {
Sketch::UKHS(ref ukhs) => ukhs.to_vec(),
Sketch::MinHash(ref mh) => mh.to_vec(),
Sketch::LargeMinHash(ref mh) => mh.to_vec(),
}
}

fn ksize(&self) -> usize {
match *self {
Sketch::UKHS(ref ukhs) => ukhs.ksize(),
Sketch::MinHash(ref mh) => mh.ksize(),
Sketch::LargeMinHash(ref mh) => mh.ksize(),
}
}

Expand All @@ -61,19 +64,25 @@ impl SigsTrait for Sketch {
Sketch::MinHash(ref ot) => mh.check_compatible(ot),
_ => Err(SourmashError::MismatchSignatureType.into()),
},
Sketch::LargeMinHash(ref mh) => match other {
Sketch::LargeMinHash(ref ot) => mh.check_compatible(ot),
_ => Err(SourmashError::MismatchSignatureType.into()),
},
}
}

fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> {
match *self {
Sketch::MinHash(ref mut mh) => mh.add_sequence(seq, force),
Sketch::LargeMinHash(ref mut mh) => mh.add_sequence(seq, force),
Sketch::UKHS(_) => unimplemented!(),
}
}

fn add_protein(&mut self, seq: &[u8]) -> Result<(), Error> {
match *self {
Sketch::MinHash(ref mut mh) => mh.add_protein(seq),
Sketch::LargeMinHash(ref mut mh) => mh.add_protein(seq),
Sketch::UKHS(_) => unimplemented!(),
}
}
Expand Down Expand Up @@ -183,6 +192,7 @@ impl Signature {
if self.signatures.len() == 1 {
match &self.signatures[0] {
Sketch::MinHash(mh) => mh.md5sum(),
Sketch::LargeMinHash(mh) => mh.md5sum(),
Sketch::UKHS(hs) => hs.md5sum(),
}
} else {
Expand Down Expand Up @@ -267,6 +277,22 @@ impl Signature {
None => return true, // TODO: match previous behavior
};
}
Sketch::LargeMinHash(mh) => {
if let Some(k) = ksize {
if k != mh.ksize() as usize {
return false;
}
};

match moltype {
Some(x) => {
if mh.hash_function() == x {
return true;
}
}
None => return true, // TODO: match previous behavior
};
}
Sketch::UKHS(hs) => {
if let Some(k) = ksize {
if k != hs.ksize() as usize {
Expand Down
Loading

0 comments on commit bc2b168

Please sign in to comment.