diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9a4b98adc7..ee72b70a44 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -3,8 +3,6 @@ name: Rust checks on: push: branches: [master] - paths: - - 'src/core/**' pull_request: paths: - 'src/core/**' @@ -150,7 +148,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: clippy - args: -- -D warnings + args: --all -- -D warnings wasm-pack: name: Check if wasm-pack builds a valid package for the sourmash crate diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 7b93309992..3050438159 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.7.0" +version = "0.8.0" authors = ["Luiz Irber "] description = "MinHash sketches for genomic data" repository = "https://github.com/dib-lab/sourmash" @@ -25,6 +25,7 @@ parallel = ["rayon"] #cbindgen = "~0.14.2" [dependencies] +backtrace = "=0.3.46" # later versions require rust 1.40 byteorder = "1.3.4" cfg-if = "0.1.10" failure = "0.1.8" diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs index 73430e6629..897920bfa7 100644 --- a/src/core/src/cmd.rs +++ b/src/core/src/cmd.rs @@ -2,7 +2,7 @@ use failure::Error; use crate::index::MHBT; use crate::signature::Signature; -use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHash}; +use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHashBTree}; use crate::sketch::Sketch; pub fn prepare(index_path: &str) -> Result<(), Error> { @@ -101,15 +101,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec { let mut ksigs = vec![]; if params.protein { - ksigs.push(Sketch::MinHash( - KmerMinHash::builder() + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) .hash_function(HashFunctions::murmur64_protein) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { - Some(vec![]) + Some(Default::default()) } else { None }) @@ -118,15 +118,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec { } if params.dayhoff { - ksigs.push(Sketch::MinHash( - KmerMinHash::builder() + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) .hash_function(HashFunctions::murmur64_dayhoff) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { - Some(vec![]) + Some(Default::default()) } else { None }) @@ -135,15 +135,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec { } if params.hp { - ksigs.push(Sketch::MinHash( - KmerMinHash::builder() + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) .hash_function(HashFunctions::murmur64_hp) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { - Some(vec![]) + Some(Default::default()) } else { None }) @@ -152,15 +152,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec { } if params.dna { - ksigs.push(Sketch::MinHash( - KmerMinHash::builder() + ksigs.push(Sketch::LargeMinHash( + KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) .hash_function(HashFunctions::murmur64_DNA) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { - Some(vec![]) + Some(Default::default()) } else { None }) diff --git a/src/core/src/index/sbt/mhbt.rs b/src/core/src/index/sbt/mhbt.rs index 738648d7d8..6628b4e799 100644 --- a/src/core/src/index/sbt/mhbt.rs +++ b/src/core/src/index/sbt/mhbt.rs @@ -152,7 +152,6 @@ mod test { use std::path::PathBuf; use assert_matches::assert_matches; - use tempfile; use super::Factory; @@ -206,9 +205,7 @@ mod test { None, ) .unwrap(); - let sig_data = sigs[0].clone(); - - let leaf = sig_data.into(); + let leaf = sigs[0].clone(); let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); assert_eq!(results.len(), 1); diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index eb62046cbd..88cbd2829e 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -34,6 +34,7 @@ impl SigsTrait for Sketch { match *self { Sketch::UKHS(ref ukhs) => ukhs.size(), Sketch::MinHash(ref mh) => mh.size(), + Sketch::LargeMinHash(ref mh) => mh.size(), } } @@ -41,6 +42,7 @@ impl SigsTrait for Sketch { match *self { Sketch::UKHS(ref ukhs) => ukhs.to_vec(), Sketch::MinHash(ref mh) => mh.to_vec(), + Sketch::LargeMinHash(ref mh) => mh.to_vec(), } } @@ -48,6 +50,7 @@ impl SigsTrait for Sketch { match *self { Sketch::UKHS(ref ukhs) => ukhs.ksize(), Sketch::MinHash(ref mh) => mh.ksize(), + Sketch::LargeMinHash(ref mh) => mh.ksize(), } } @@ -61,12 +64,17 @@ impl SigsTrait for Sketch { Sketch::MinHash(ref ot) => mh.check_compatible(ot), _ => Err(SourmashError::MismatchSignatureType.into()), }, + Sketch::LargeMinHash(ref mh) => match other { + Sketch::LargeMinHash(ref ot) => mh.check_compatible(ot), + _ => Err(SourmashError::MismatchSignatureType.into()), + }, } } fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> { match *self { Sketch::MinHash(ref mut mh) => mh.add_sequence(seq, force), + Sketch::LargeMinHash(ref mut mh) => mh.add_sequence(seq, force), Sketch::UKHS(_) => unimplemented!(), } } @@ -74,6 +82,7 @@ impl SigsTrait for Sketch { fn add_protein(&mut self, seq: &[u8]) -> Result<(), Error> { match *self { Sketch::MinHash(ref mut mh) => mh.add_protein(seq), + Sketch::LargeMinHash(ref mut mh) => mh.add_protein(seq), Sketch::UKHS(_) => unimplemented!(), } } @@ -183,6 +192,7 @@ impl Signature { if self.signatures.len() == 1 { match &self.signatures[0] { Sketch::MinHash(mh) => mh.md5sum(), + Sketch::LargeMinHash(mh) => mh.md5sum(), Sketch::UKHS(hs) => hs.md5sum(), } } else { @@ -267,6 +277,22 @@ impl Signature { None => return true, // TODO: match previous behavior }; } + Sketch::LargeMinHash(mh) => { + if let Some(k) = ksize { + if k != mh.ksize() as usize { + return false; + } + }; + + match moltype { + Some(x) => { + if mh.hash_function() == x { + return true; + } + } + None => return true, // TODO: match previous behavior + }; + } Sketch::UKHS(hs) => { if let Some(k) = ksize { if k != hs.ksize() as usize { diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 253730af7a..25fecd166e 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -1,5 +1,5 @@ use std::cmp::Ordering; -use std::collections::HashMap; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::convert::TryFrom; use std::f64::consts::PI; use std::iter::{Iterator, Peekable}; @@ -258,6 +258,10 @@ impl KmerMinHash { } pub fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + if self.hash_function == h { + return Ok(()); + } + if !self.is_empty() { return Err(SourmashError::NonEmptyMinHash { message: "hash_function".into(), @@ -482,14 +486,22 @@ impl KmerMinHash { if merged.len() < (self.num as usize) || (self.num as usize) == 0 { self.mins = merged; self.abunds = if merged_abunds.is_empty() { - None + if self.abunds.is_some() { + Some(vec![]) + } else { + None + } } else { Some(merged_abunds) }; } else { self.mins = merged.into_iter().take(self.num as usize).collect(); self.abunds = if merged_abunds.is_empty() { - None + if self.abunds.is_some() { + Some(vec![]) + } else { + None + } } else { Some(merged_abunds.into_iter().take(self.num as usize).collect()) } @@ -686,7 +698,7 @@ impl KmerMinHash { } // create a downsampled copy of self - fn downsample_max_hash(&self, max_hash: u64) -> Result { + pub fn downsample_max_hash(&self, max_hash: u64) -> Result { let mut new_mh = KmerMinHash::new( self.num, self.ksize, @@ -703,7 +715,7 @@ impl KmerMinHash { Ok(new_mh) } - fn to_vec_abunds(&self) -> Vec<(u64, u64)> { + pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { if let Some(abunds) = &self.abunds { self.mins .iter() @@ -1208,3 +1220,762 @@ const VALID: [bool; 256] = { lookup[b'T' as usize] = true; lookup }; + +//############# +// A MinHash implementation for low scaled or large cardinalities + +#[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)] +#[derive(Debug, Clone, PartialEq, TypedBuilder)] +pub struct KmerMinHashBTree { + num: u32, + ksize: u32, + + #[builder(default_code = "HashFunctions::murmur64_DNA")] + hash_function: HashFunctions, + + #[builder(default_code = "42u64")] + seed: u64, + + #[builder(default_code = "u64::max_value()")] + max_hash: u64, + + #[builder(default)] + mins: BTreeSet, + + #[builder(default)] + abunds: Option>, + + #[builder(default_code = "0u64")] + current_max: u64, +} + +impl Default for KmerMinHashBTree { + fn default() -> KmerMinHashBTree { + KmerMinHashBTree { + num: 1000, + ksize: 21, + hash_function: HashFunctions::murmur64_DNA, + seed: 42, + max_hash: 0, + mins: Default::default(), + abunds: None, + current_max: 0, + } + } +} + +impl Serialize for KmerMinHashBTree { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let n_fields = match &self.abunds { + Some(_) => 8, + _ => 7, + }; + + let mut partial = serializer.serialize_struct("KmerMinHashBTree", n_fields)?; + partial.serialize_field("num", &self.num)?; + partial.serialize_field("ksize", &self.ksize)?; + partial.serialize_field("seed", &self.seed)?; + partial.serialize_field("max_hash", &self.max_hash)?; + partial.serialize_field("mins", &self.mins)?; + partial.serialize_field("md5sum", &self.md5sum())?; + + if let Some(abunds) = &self.abunds { + let abs: Vec = abunds.values().cloned().collect(); + partial.serialize_field("abundances", &abs)?; + } + + partial.serialize_field("molecule", &self.hash_function.to_string())?; + + partial.end() + } +} + +impl<'de> Deserialize<'de> for KmerMinHashBTree { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + struct TempSig { + num: u32, + ksize: u32, + seed: u64, + max_hash: u64, + //md5sum: String, + mins: Vec, + abundances: Option>, + molecule: String, + } + + let tmpsig = TempSig::deserialize(deserializer)?; + + let num = if tmpsig.max_hash != 0 { 0 } else { tmpsig.num }; + let hash_function = match tmpsig.molecule.to_lowercase().as_ref() { + "protein" => HashFunctions::murmur64_protein, + "dayhoff" => HashFunctions::murmur64_dayhoff, + "hp" => HashFunctions::murmur64_hp, + "dna" => HashFunctions::murmur64_DNA, + _ => unimplemented!(), // TODO: throw error here + }; + + let current_max; + // This shouldn't be necessary, but at some point we + // created signatures with unordered mins =( + let (mins, abunds) = if let Some(abunds) = tmpsig.abundances { + let mut values: Vec<(_, _)> = tmpsig.mins.iter().zip(abunds.iter()).collect(); + values.sort(); + let mins: BTreeSet<_> = values.iter().map(|(v, _)| **v).collect(); + let abunds = values.into_iter().map(|(v, x)| (*v, *x)).collect(); + current_max = *mins.iter().rev().next().unwrap_or(&0); + (mins, Some(abunds)) + } else { + current_max = 0; + (tmpsig.mins.into_iter().collect(), None) + }; + + Ok(KmerMinHashBTree { + num, + ksize: tmpsig.ksize, + seed: tmpsig.seed, + max_hash: tmpsig.max_hash, + mins, + abunds, + hash_function, + current_max, + }) + } +} + +impl KmerMinHashBTree { + pub fn new( + num: u32, + ksize: u32, + hash_function: HashFunctions, + seed: u64, + max_hash: u64, + track_abundance: bool, + ) -> KmerMinHashBTree { + let mins = Default::default(); + + let abunds = if track_abundance { + Some(Default::default()) + } else { + None + }; + + KmerMinHashBTree { + num, + ksize, + hash_function, + seed, + max_hash, + mins, + abunds, + current_max: 0, + } + } + + pub fn num(&self) -> u32 { + self.num + } + + pub fn is_protein(&self) -> bool { + self.hash_function == HashFunctions::murmur64_protein + } + + fn is_dna(&self) -> bool { + self.hash_function == HashFunctions::murmur64_DNA + } + + pub fn seed(&self) -> u64 { + self.seed + } + + pub fn max_hash(&self) -> u64 { + self.max_hash + } + + pub fn clear(&mut self) { + self.mins.clear(); + if let Some(ref mut abunds) = self.abunds { + abunds.clear(); + } + self.current_max = 0; + } + + pub fn is_empty(&self) -> bool { + self.mins.is_empty() + } + + pub fn set_hash_function(&mut self, h: HashFunctions) -> Result<(), Error> { + if self.hash_function == h { + return Ok(()); + } + + if !self.is_empty() { + return Err(SourmashError::NonEmptyMinHash { + message: "hash_function".into(), + } + .into()); + } + + self.hash_function = h; + Ok(()) + } + + pub fn track_abundance(&self) -> bool { + self.abunds.is_some() + } + + pub fn enable_abundance(&mut self) -> Result<(), Error> { + if !self.mins.is_empty() { + return Err(SourmashError::NonEmptyMinHash { + message: "track_abundance=True".into(), + } + .into()); + } + + self.abunds = Some(Default::default()); + + Ok(()) + } + + pub fn disable_abundance(&mut self) { + self.abunds = None; + } + + pub fn md5sum(&self) -> String { + let mut md5_ctx = md5::Context::new(); + md5_ctx.consume(self.ksize().to_string()); + self.mins + .iter() + .for_each(|x| md5_ctx.consume(x.to_string())); + format!("{:x}", md5_ctx.compute()) + } + + pub fn add_hash(&mut self, hash: u64) { + self.add_hash_with_abundance(hash, 1); + } + + pub fn add_hash_with_abundance(&mut self, hash: u64, abundance: u64) { + if hash > self.max_hash && self.max_hash != 0 { + // This is a scaled minhash, and we don't need to add the new hash + return; + } + + if self.num == 0 && self.max_hash == 0 { + // why did you create this minhash? it will always be empty... + return; + } + + if abundance == 0 { + // well, don't add it. + return; + } + + // From this point on, hash is within scaled (or no scaled specified). + + // empty mins? add it. + if self.mins.is_empty() { + self.mins.insert(hash); + if let Some(ref mut abunds) = self.abunds { + abunds.insert(hash, abundance); + } + self.current_max = hash; + return; + } + + if hash <= self.max_hash || hash <= self.current_max || (self.mins.len() as u32) < self.num + { + // "good" hash - within range, smaller than current entry, or + // still have space available + if self.mins.insert(hash) && hash > self.current_max { + self.current_max = hash; + } + if let Some(ref mut abunds) = self.abunds { + *abunds.entry(hash).or_insert(0) += abundance; + } + + // is it too big now? + if self.num != 0 && self.mins.len() > (self.num as usize) { + let last = *self.mins.iter().rev().next().unwrap(); + self.mins.remove(&last); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(&last); + } + self.current_max = *self.mins.iter().rev().next().unwrap(); + } + } + } + + pub fn add_word(&mut self, word: &[u8]) { + let hash = _hash_murmur(word, self.seed); + self.add_hash(hash); + } + + pub fn remove_hash(&mut self, hash: u64) { + if self.mins.remove(&hash) { + if let Some(ref mut abunds) = self.abunds { + abunds.remove(&hash); + } + } + if hash == self.current_max { + self.current_max = *self.mins.iter().rev().next().unwrap_or(&0); + } + } + + pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> { + for min in hashes { + self.remove_hash(*min); + } + Ok(()) + } + + pub fn merge(&mut self, other: &KmerMinHashBTree) -> Result<(), Error> { + self.check_compatible(other)?; + let union = self.mins.union(&other.mins); + + let to_take = if self.num == 0 { + usize::max_value() + } else { + self.num as usize + }; + + self.mins = union.take(to_take).cloned().collect(); + + if let Some(abunds) = &self.abunds { + if let Some(oabunds) = &other.abunds { + let mut new_abunds = BTreeMap::new(); + + for hash in &self.mins { + *new_abunds.entry(*hash).or_insert(0) += + abunds.get(&hash).unwrap_or(&0) + oabunds.get(&hash).unwrap_or(&0); + } + self.abunds = Some(new_abunds) + } + } + + Ok(()) + } + + pub fn add_from(&mut self, other: &KmerMinHashBTree) -> Result<(), Error> { + for min in &other.mins { + self.add_hash(*min); + } + Ok(()) + } + + pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { + for min in hashes { + self.add_hash(*min); + } + Ok(()) + } + + pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { + for item in hashes { + self.add_hash_with_abundance(item.0, item.1); + } + Ok(()) + } + + pub fn count_common(&self, other: &KmerMinHashBTree, downsample: bool) -> Result { + if downsample && self.max_hash != other.max_hash { + let (first, second) = if self.max_hash < other.max_hash { + (self, other) + } else { + (other, self) + }; + let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + first.count_common(&downsampled_mh, false) + } else { + self.check_compatible(other)?; + let iter = Intersection::new(self.mins.iter(), other.mins.iter()); + + Ok(iter.count() as u64) + } + } + + pub fn intersection(&self, other: &KmerMinHashBTree) -> Result<(Vec, u64), Error> { + self.check_compatible(other)?; + + let mut combined_mh = KmerMinHashBTree::new( + self.num, + self.ksize, + self.hash_function, + self.seed, + self.max_hash, + self.abunds.is_some(), + ); + + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; + + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); + + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let i2: Vec = combined_mh.mins.iter().cloned().collect(); + let it2 = Intersection::new(i1.iter(), i2.iter()); + + let common: Vec = it2.cloned().collect(); + Ok((common, combined_mh.mins.len() as u64)) + } + + pub fn intersection_size(&self, other: &KmerMinHashBTree) -> Result<(u64, u64), Error> { + self.check_compatible(other)?; + + let mut combined_mh = KmerMinHashBTree::new( + self.num, + self.ksize, + self.hash_function, + self.seed, + self.max_hash, + self.abunds.is_some(), + ); + + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; + + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); + + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let i2: Vec = combined_mh.mins.iter().cloned().collect(); + let it2 = Intersection::new(i1.iter(), i2.iter()); + + Ok((it2.count() as u64, combined_mh.mins.len() as u64)) + } + + // calculate Jaccard similarity, ignoring abundance. + pub fn jaccard(&self, other: &KmerMinHashBTree) -> Result { + self.check_compatible(other)?; + if let Ok((common, size)) = self.intersection_size(other) { + Ok(common as f64 / u64::max(1, size) as f64) + } else { + Ok(0.0) + } + } + + // compare two minhashes, with abundance; + // calculate their angular similarity. + pub fn angular_similarity(&self, other: &KmerMinHashBTree) -> Result { + self.check_compatible(other)?; + + if self.abunds.is_none() || other.abunds.is_none() { + // TODO: throw error, we need abundance for this + unimplemented!() // @CTB fixme + } + + let abunds = self.abunds.as_ref().unwrap(); + let other_abunds = other.abunds.as_ref().unwrap(); + + let mut prod = 0; + let a_sq: u64 = abunds.values().map(|a| (a * a)).sum(); + let b_sq: u64 = other_abunds.values().map(|a| (a * a)).sum(); + + for (hash, value) in abunds.iter() { + if let Some(oa) = other_abunds.get(&hash) { + prod += value * oa + } + } + + let norm_a = (a_sq as f64).sqrt(); + let norm_b = (b_sq as f64).sqrt(); + + if norm_a == 0. || norm_b == 0. { + return Ok(0.0); + } + let prod = f64::min(prod as f64 / (norm_a * norm_b), 1.); + let distance = 2. * prod.acos() / PI; + Ok(1. - distance) + } + + pub fn similarity( + &self, + other: &KmerMinHashBTree, + ignore_abundance: bool, + downsample: bool, + ) -> Result { + if downsample && self.max_hash != other.max_hash { + let (first, second) = if self.max_hash < other.max_hash { + (self, other) + } else { + (other, self) + }; + let downsampled_mh = second.downsample_max_hash(first.max_hash)?; + first.similarity(&downsampled_mh, ignore_abundance, false) + } else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() { + self.jaccard(&other) + } else { + self.angular_similarity(&other) + } + } + + pub fn dayhoff(&self) -> bool { + self.hash_function == HashFunctions::murmur64_dayhoff + } + + pub fn hp(&self) -> bool { + self.hash_function == HashFunctions::murmur64_hp + } + + pub fn hash_function(&self) -> HashFunctions { + self.hash_function + } + + pub fn mins(&self) -> Vec { + self.mins.iter().cloned().collect() + } + + pub fn abunds(&self) -> Option> { + if let Some(abunds) = &self.abunds { + Some(abunds.values().cloned().collect()) + } else { + None + } + } + + // create a downsampled copy of self + pub fn downsample_max_hash(&self, max_hash: u64) -> Result { + let mut new_mh = KmerMinHashBTree::new( + self.num, + self.ksize, + self.hash_function, + self.seed, + max_hash, // old max_hash => max_hash arg + self.abunds.is_some(), + ); + if self.abunds.is_some() { + new_mh.add_many_with_abund(&self.to_vec_abunds())?; + } else { + new_mh.add_many(&self.mins())?; + } + Ok(new_mh) + } + + pub fn to_vec_abunds(&self) -> Vec<(u64, u64)> { + if let Some(abunds) = &self.abunds { + abunds.iter().map(|(a, b)| (*a, *b)).collect() + } else { + self.mins + .iter() + .cloned() + .zip(std::iter::repeat(1)) + .collect() + } + } +} + +impl SigsTrait for KmerMinHashBTree { + fn size(&self) -> usize { + self.mins.len() + } + + fn to_vec(&self) -> Vec { + self.mins() + } + + fn ksize(&self) -> usize { + self.ksize as usize + } + + fn check_compatible(&self, other: &KmerMinHashBTree) -> Result<(), Error> { + /* + if self.num != other.num { + return Err(SourmashError::MismatchNum { + n1: self.num, + n2: other.num, + } + .into()); + } + */ + if self.ksize != other.ksize { + return Err(SourmashError::MismatchKSizes.into()); + } + if self.hash_function != other.hash_function { + // TODO: fix this error + return Err(SourmashError::MismatchDNAProt.into()); + } + if self.max_hash != other.max_hash { + return Err(SourmashError::MismatchScaled.into()); + } + if self.seed != other.seed { + return Err(SourmashError::MismatchSeed.into()); + } + Ok(()) + } + + fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> { + let ksize = self.ksize as usize; + let len = seq.len(); + + if len < ksize { + return Ok(()); + }; + + // Here we convert the sequence to upper case and + // pre-calculate the reverse complement for the full sequence... + let sequence = seq.to_ascii_uppercase(); + let rc = revcomp(&sequence); + + if self.is_dna() { + let mut last_position_check = 0; + + let mut is_valid_kmer = |i| { + for j in std::cmp::max(i, last_position_check)..i + ksize { + if !VALID[sequence[j] as usize] { + return false; + } + last_position_check += 1; + } + true + }; + + for i in 0..=len - ksize { + // ... and then while moving the k-mer window forward for the sequence + // we move another window backwards for the RC. + // For a ksize = 3, and a sequence AGTCGT (len = 6): + // +-+---------+---------------+-------+ + // seq RC |i|i + ksize|len - ksize - i|len - i| + // AGTCGT ACGACT +-+---------+---------------+-------+ + // +-> +-> |0| 2 | 3 | 6 | + // +-> +-> |1| 3 | 2 | 5 | + // +-> +-> |2| 4 | 1 | 4 | + // +-> +-> |3| 5 | 0 | 3 | + // +-+---------+---------------+-------+ + // (leaving this table here because I had to draw to + // get the indices correctly) + + let kmer = &sequence[i..i + ksize]; + + if !is_valid_kmer(i) { + if !force { + // throw error if DNA is not valid + return Err(SourmashError::InvalidDNA { + message: String::from_utf8(kmer.to_vec()).unwrap(), + } + .into()); + } + + continue; // skip invalid k-mer + } + + let krc = &rc[len - ksize - i..len - i]; + self.add_word(std::cmp::min(kmer, krc)); + } + } else { + // protein + let aa_ksize = self.ksize / 3; + + for i in 0..3 { + let substr: Vec = sequence + .iter() + .cloned() + .skip(i) + .take(sequence.len() - i) + .collect(); + let aa = to_aa(&substr, self.dayhoff(), self.hp()).unwrap(); + + aa.windows(aa_ksize as usize).for_each(|n| self.add_word(n)); + + let rc_substr: Vec = rc.iter().cloned().skip(i).take(rc.len() - i).collect(); + let aa_rc = to_aa(&rc_substr, self.dayhoff(), self.hp()).unwrap(); + + aa_rc + .windows(aa_ksize as usize) + .for_each(|n| self.add_word(n)); + } + } + + Ok(()) + } + + fn add_protein(&mut self, seq: &[u8]) -> Result<(), Error> { + let ksize = (self.ksize / 3) as usize; + let len = seq.len(); + + if len < ksize { + return Ok(()); + } + + if let HashFunctions::murmur64_protein = self.hash_function { + for aa_kmer in seq.windows(ksize) { + self.add_word(&aa_kmer); + } + return Ok(()); + } + + let aa_seq: Vec<_> = match self.hash_function { + HashFunctions::murmur64_dayhoff => seq.iter().cloned().map(aa_to_dayhoff).collect(), + HashFunctions::murmur64_hp => seq.iter().cloned().map(aa_to_hp).collect(), + invalid => { + return Err(SourmashError::InvalidHashFunction { + function: format!("{}", invalid), + } + .into()) + } + }; + + for aa_kmer in aa_seq.windows(ksize) { + self.add_word(aa_kmer); + } + + Ok(()) + } +} + +impl From for KmerMinHash { + fn from(other: KmerMinHashBTree) -> KmerMinHash { + let mut new_mh = KmerMinHash::new( + other.num(), + other.ksize() as u32, + other.hash_function(), + other.seed(), + other.max_hash(), + other.track_abundance(), + ); + + let mins = other.mins.into_iter().collect(); + let abunds = if let Some(abunds) = other.abunds { + Some(abunds.values().cloned().collect()) + } else { + None + }; + + new_mh.mins = mins; + new_mh.abunds = abunds; + + new_mh + } +} + +impl From for KmerMinHashBTree { + fn from(other: KmerMinHash) -> KmerMinHashBTree { + let mut new_mh = KmerMinHashBTree::new( + other.num(), + other.ksize() as u32, + other.hash_function(), + other.seed(), + other.max_hash(), + other.track_abundance(), + ); + + let mins: BTreeSet = other.mins.into_iter().collect(); + let abunds = if let Some(abunds) = other.abunds { + Some(mins.iter().cloned().zip(abunds.into_iter()).collect()) + } else { + None + }; + + new_mh.mins = mins; + new_mh.abunds = abunds; + + new_mh + } +} diff --git a/src/core/src/sketch/mod.rs b/src/core/src/sketch/mod.rs index f02d0f7e1c..1c6ee710ff 100644 --- a/src/core/src/sketch/mod.rs +++ b/src/core/src/sketch/mod.rs @@ -5,12 +5,13 @@ pub mod ukhs; use serde_derive::{Deserialize, Serialize}; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; use crate::sketch::ukhs::FlatUKHS; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] pub enum Sketch { MinHash(KmerMinHash), + LargeMinHash(KmerMinHashBTree), UKHS(FlatUKHS), // FIXME } diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index 13f26007a6..c85df5b062 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -388,7 +388,7 @@ mod test { // >>> a.save("test.ng") // and dumping test.ng with xxd: // $ xxd -i test.ng - static RAW_DATA: &'static [u8] = &[ + static RAW_DATA: &[u8] = &[ 0x4f, 0x58, 0x4c, 0x49, 0x04, 0x02, 0x03, 0x00, 0x00, 0x00, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x01, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x01, 0x0d, 0x00, 0x00, 0x00, @@ -397,7 +397,7 @@ mod test { 0x00, 0x00, 0x00, 0x06, ]; - static COMPRESSED_RAW_DATA: &'static [u8] = &[ + static COMPRESSED_RAW_DATA: &[u8] = &[ 0x1f, 0x8b, 0x08, 0x08, 0x73, 0x88, 0x9f, 0x5e, 0x00, 0x03, 0x74, 0x65, 0x73, 0x74, 0x2e, 0x6e, 0x67, 0x00, 0xf3, 0x8f, 0xf0, 0xf1, 0x64, 0x61, 0x62, 0x66, 0x60, 0x60, 0x60, 0x03, 0x11, 0x20, 0x20, 0x0c, 0xa5, 0x19, 0x38, 0x19, 0x05, 0x61, 0x4c, 0x1e, 0x46, 0x5e, 0x28, @@ -565,7 +565,7 @@ mod test { leaf5.count(_hash(kmer.as_bytes())); } - let h = _hash("AAAAT".as_bytes()); + let h = _hash(b"AAAAT"); for leaf in &[leaf1, leaf2, leaf3, leaf5] { assert_eq!(leaf.get(h), 1); } diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 9d4beab247..344dcd130c 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -1,14 +1,28 @@ -use sourmash::signature::SigsTrait; -use sourmash::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHash}; +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +use sourmash::signature::{Signature, SigsTrait}; +use sourmash::sketch::minhash::{ + max_hash_for_scaled, HashFunctions, KmerMinHash, KmerMinHashBTree, +}; +use sourmash::sketch::Sketch; + +use proptest::collection::vec; +use proptest::num::u64; +use proptest::proptest; + +// TODO: use f64::EPSILON when we bump MSRV +const EPSILON: f64 = 0.01; #[test] fn throws_error() { let mut mh = KmerMinHash::new(1, 4, HashFunctions::murmur64_DNA, 42, 0, false); - match mh.add_sequence(b"ATGR", false) { - Ok(_) => assert!(false, "R is not a valid DNA character"), - Err(_) => assert!(true), - } + assert!( + mh.add_sequence(b"ATGR", false).is_err(), + "R is not a valid DNA character" + ); } #[test] @@ -59,8 +73,8 @@ fn similarity() -> Result<(), Box> { b.add_hash(1); b.add_hash(2); - assert!((a.similarity(&a, false, false)? - 1.0).abs() < 0.001); - assert!((a.similarity(&b, false, false)? - 0.5).abs() < 0.001); + assert!((a.similarity(&a, false, false)? - 1.0).abs() < EPSILON); + assert!((a.similarity(&b, false, false)? - 0.5).abs() < EPSILON); Ok(()) } @@ -77,7 +91,7 @@ fn similarity_2() -> Result<(), Box> { b.add_sequence(b"ATGGA", false)?; assert!( - (a.similarity(&b, false, false)? - 0.705).abs() < 0.001, + (a.similarity(&b, false, false)? - 0.705).abs() < EPSILON, "{}", a.similarity(&b, false, false)? ); @@ -100,11 +114,11 @@ fn similarity_3() -> Result<(), Box> { b.add_hash(3); b.add_hash(4); - assert!((a.similarity(&a, false, false)? - 1.0).abs() < 0.001); - assert!((a.similarity(&b, false, false)? - 0.23).abs() < 0.001); + assert!((a.similarity(&a, false, false)? - 1.0).abs() < EPSILON); + assert!((a.similarity(&b, false, false)? - 0.23).abs() < EPSILON); - assert!((a.similarity(&a, true, false)? - 1.0).abs() < 0.001); - assert!((a.similarity(&b, true, false)? - 0.2).abs() < 0.001); + assert!((a.similarity(&a, true, false)? - 1.0).abs() < EPSILON); + assert!((a.similarity(&b, true, false)? - 0.2).abs() < EPSILON); Ok(()) } @@ -137,3 +151,567 @@ fn hp() { fn max_for_scaled() { assert_eq!(max_hash_for_scaled(100), Some(184467440737095520)); } + +proptest! { +#[test] +fn oracle_mins(hashes in vec(u64::ANY, 1..10000)) { + let mut a = KmerMinHash::new(1000, 21, HashFunctions::murmur64_protein, 42, 0, true); + let mut b = KmerMinHashBTree::new(1000, 21, HashFunctions::murmur64_protein, 42, 0, true); + + let mut c: KmerMinHash = Default::default(); + c.set_hash_function(HashFunctions::murmur64_protein).unwrap(); + c.enable_abundance().unwrap(); + + let mut d: KmerMinHashBTree = Default::default(); + d.set_hash_function(HashFunctions::murmur64_protein).unwrap(); + d.enable_abundance().unwrap(); + + let mut to_remove = vec![]; + for hash in &hashes { + a.add_hash(*hash); + b.add_hash(*hash); + + if hash % 2 == 0 { + to_remove.push(*hash); + } + } + + c.add_from(&a).unwrap(); + c.remove_many(&to_remove).unwrap(); + + d.add_from(&b).unwrap(); + d.remove_many(&to_remove).unwrap(); + + assert_eq!(a.mins(), b.mins()); + assert_eq!(c.mins(), d.mins()); + + assert_eq!(a.count_common(&c, false).unwrap(), b.count_common(&d, false).unwrap()); + assert_eq!(a.count_common(&c, true).unwrap(), b.count_common(&d, true).unwrap()); + + assert_eq!(a.abunds(), b.abunds()); + assert_eq!(c.abunds(), d.abunds()); + + assert!((a.similarity(&c, false, false).unwrap() - b.similarity(&d, false, false).unwrap()).abs() < EPSILON); +} +} + +proptest! { +#[test] +fn oracle_mins_scaled(hashes in vec(u64::ANY, 1..10000)) { + let max_hash = max_hash_for_scaled(100).unwrap(); + let mut a = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + let mut b = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + + let mut c = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + let mut d = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + + let mut to_remove = vec![]; + for hash in &hashes { + a.add_hash(*hash); + b.add_hash(*hash); + + if hash % 2 == 0 { + to_remove.push(*hash); + } + } + + c.add_many(&hashes).unwrap(); + d.add_many(&hashes).unwrap(); + + c.remove_many(&to_remove).unwrap(); + d.remove_many(&to_remove).unwrap(); + + a.remove_hash(hashes[0]); + b.remove_hash(hashes[0]); + + assert_eq!(a.mins(), b.mins()); + assert_eq!(c.mins(), d.mins()); + + assert_eq!(a.md5sum(), b.md5sum()); + assert_eq!(c.md5sum(), d.md5sum()); + + assert_eq!(a.is_protein(), b.is_protein()); + assert_eq!(a.num(), b.num()); + assert_eq!(a.seed(), b.seed()); + assert_eq!(a.ksize(), b.ksize()); + assert_eq!(a.max_hash(), b.max_hash()); + assert_eq!(a.track_abundance(), b.track_abundance()); + assert_eq!(a.hash_function(), b.hash_function()); + + assert_eq!(a.abunds(), b.abunds()); + assert_eq!(c.abunds(), d.abunds()); + + assert!((a.similarity(&c, false, false).unwrap() - b.similarity(&d, false, false).unwrap()).abs() < EPSILON); + assert!((c.similarity(&a, false, false).unwrap() - d.similarity(&b, false, false).unwrap()).abs() < EPSILON); + assert!((a.similarity(&c, true, false).unwrap() - b.similarity(&d, true, false).unwrap()).abs() < EPSILON); + assert!((c.similarity(&a, true, false).unwrap() - d.similarity(&b, true, false).unwrap()).abs() < EPSILON); + + assert_eq!(a.count_common(&c, false).unwrap(), b.count_common(&d, false).unwrap()); + assert_eq!(c.count_common(&a, false).unwrap(), d.count_common(&b, false).unwrap()); + assert_eq!(a.count_common(&c, true).unwrap(), b.count_common(&d, true).unwrap()); + assert_eq!(c.count_common(&a, true).unwrap(), d.count_common(&b, true).unwrap()); + + let mut e = a.downsample_max_hash(100).unwrap(); + let mut f = b.downsample_max_hash(100).unwrap(); + + // Can't compare different scaled without explicit downsample + assert!(c.similarity(&e, false, false).is_err()); + assert!(d.similarity(&f, false, false).is_err()); + assert!(c.similarity(&e, true, false).is_err()); + assert!(d.similarity(&f, true, false).is_err()); + + assert!((c.similarity(&e, true, true).unwrap() - d.similarity(&f, true, true).unwrap()).abs() < EPSILON); + assert!((e.similarity(&c, true, true).unwrap() - f.similarity(&d, true, true).unwrap()).abs() < EPSILON); + assert!((c.similarity(&e, false, true).unwrap() - d.similarity(&f, false, true).unwrap()).abs() < EPSILON); + assert!((e.similarity(&c, false, true).unwrap() - f.similarity(&d, false, true).unwrap()).abs() < EPSILON); + + // Can't compare different scaled without explicit downsample + assert!(e.count_common(&c, false).is_err()); + assert!(f.count_common(&d, false).is_err()); + + assert_eq!(e.count_common(&c, true).unwrap(), f.count_common(&d, true).unwrap()); + assert_eq!(c.count_common(&e, true).unwrap(), d.count_common(&f, true).unwrap()); + + // disable abundances + e.disable_abundance(); + f.disable_abundance(); + + // Can't compare different scaled without explicit downsample + assert!(c.similarity(&e, false, false).is_err()); + assert!(d.similarity(&f, false, false).is_err()); + assert!(c.similarity(&e, true, false).is_err()); + assert!(d.similarity(&f, true, false).is_err()); + + assert!((c.similarity(&e, true, true).unwrap() - d.similarity(&f, true, true).unwrap()).abs() < EPSILON); + assert!((e.similarity(&c, true, true).unwrap() - f.similarity(&d, true, true).unwrap()).abs() < EPSILON); + assert!((c.similarity(&e, false, true).unwrap() - d.similarity(&f, false, true).unwrap()).abs() < EPSILON); + assert!((e.similarity(&c, false, true).unwrap() - f.similarity(&d, false, true).unwrap()).abs() < EPSILON); + + // Can't compare different scaled without explicit downsample + assert!(e.count_common(&c, false).is_err()); + assert!(f.count_common(&d, false).is_err()); + + assert_eq!(e.count_common(&c, true).unwrap(), f.count_common(&d, true).unwrap()); + assert_eq!(c.count_common(&e, true).unwrap(), d.count_common(&f, true).unwrap()); +} +} + +proptest! { +#[test] +fn prop_merge(seq1 in "[ACGT]{6,100}", seq2 in "[ACGT]{6,200}") { + let max_hash = max_hash_for_scaled(10).unwrap(); + let mut a = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + let mut b = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + + let mut c = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + let mut d = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + + a.add_sequence(seq1.as_bytes(), false).unwrap(); + b.add_sequence(seq1.as_bytes(), false).unwrap(); + + c.add_sequence(seq2.as_bytes(), false).unwrap(); + d.add_sequence(seq2.as_bytes(), false).unwrap(); + + a.merge(&c).unwrap(); + b.merge(&d).unwrap(); + + assert_eq!(a.mins(), b.mins()); + assert_eq!(c.mins(), d.mins()); + + assert_eq!(a.abunds(), b.abunds()); + assert_eq!(c.abunds(), d.abunds()); + + assert_eq!(a.intersection_size(&c).unwrap(), b.intersection_size(&d).unwrap()); + assert_eq!(c.intersection(&a).unwrap(), d.intersection(&b).unwrap()); + + assert!((a.similarity(&c, false, false).unwrap() - b.similarity(&d, false, false).unwrap()).abs() < EPSILON); + assert!((a.similarity(&c, true, false).unwrap() - b.similarity(&d, true, false).unwrap()).abs() < EPSILON); + + let mut e = a.downsample_max_hash(100).unwrap(); + let mut f = b.downsample_max_hash(100).unwrap(); + + assert!((e.similarity(&c, false, true).unwrap() - f.similarity(&d, false, true).unwrap()).abs() < EPSILON); + assert!((e.similarity(&c, true, true).unwrap() - f.similarity(&d, true, true).unwrap()).abs() < EPSILON); + + e.disable_abundance(); + f.disable_abundance(); + + assert!((e.similarity(&c, false, true).unwrap() - f.similarity(&d, false, true).unwrap()).abs() < EPSILON); + assert!((e.similarity(&c, true, true).unwrap() - f.similarity(&d, true, true).unwrap()).abs() < EPSILON); + + e.clear(); + f.clear(); + + assert!(e.is_empty()); + assert!(f.is_empty()); +} +} + +#[test] +fn load_save_minhash_sketches() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/genome-s10+s11.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + let sig = sigs.get(0).unwrap(); + let sketches = sig.sketches(); + let mut buffer = vec![]; + + if let Sketch::MinHash(mh) = &sketches[0] { + let bmh: KmerMinHashBTree = mh.clone().into(); + { + serde_json::to_writer(&mut buffer, &bmh).unwrap(); + } + + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); + + buffer.clear(); + let imh: KmerMinHash = bmh.clone().into(); + { + serde_json::to_writer(&mut buffer, &imh).unwrap(); + } + + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); + + assert_eq!(mh.to_vec(), new_mh.to_vec()); + assert_eq!(bmh.to_vec(), new_bmh.to_vec()); + assert_eq!(bmh.to_vec(), new_mh.to_vec()); + assert_eq!(mh.to_vec(), new_bmh.to_vec()); + + assert_eq!(mh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_bmh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(mh.to_vec_abunds(), new_bmh.to_vec_abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); + } +} + +#[test] +fn load_save_minhash_sketches_abund() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/reads-s10-s11.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + let sig = sigs.get(0).unwrap(); + let sketches = sig.sketches(); + let mut buffer = vec![]; + + if let Sketch::MinHash(mh) = &sketches[0] { + let bmh: KmerMinHashBTree = mh.clone().into(); + { + serde_json::to_writer(&mut buffer, &bmh).unwrap(); + } + + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); + + assert_eq!(mh.to_vec(), new_mh.to_vec()); + assert_eq!(bmh.to_vec(), new_bmh.to_vec()); + assert_eq!(bmh.to_vec(), new_mh.to_vec()); + assert_eq!(mh.to_vec(), new_bmh.to_vec()); + + assert_eq!(mh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_bmh.to_vec_abunds()); + assert_eq!(bmh.to_vec_abunds(), new_mh.to_vec_abunds()); + assert_eq!(mh.to_vec_abunds(), new_bmh.to_vec_abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); + + buffer.clear(); + let imh: KmerMinHash = bmh.clone().into(); + { + serde_json::to_writer(&mut buffer, &imh).unwrap(); + } + + let new_mh: KmerMinHash = serde_json::from_reader(&buffer[..]).unwrap(); + let new_bmh: KmerMinHashBTree = serde_json::from_reader(&buffer[..]).unwrap(); + + assert_eq!(mh.md5sum(), new_mh.md5sum()); + assert_eq!(bmh.md5sum(), new_bmh.md5sum()); + assert_eq!(bmh.md5sum(), new_mh.md5sum()); + assert_eq!(mh.md5sum(), new_bmh.md5sum()); + + assert_eq!(mh.mins(), new_mh.mins()); + assert_eq!(bmh.mins(), new_bmh.mins()); + assert_eq!(bmh.mins(), new_mh.mins()); + assert_eq!(mh.mins(), new_bmh.mins()); + + assert_eq!(mh.abunds(), new_mh.abunds()); + assert_eq!(bmh.abunds(), new_bmh.abunds()); + assert_eq!(bmh.abunds(), new_mh.abunds()); + assert_eq!(mh.abunds(), new_bmh.abunds()); + + assert!( + (mh.similarity(&new_mh, false, false).unwrap() + - bmh.similarity(&new_bmh, false, false).unwrap()) + .abs() + < EPSILON + ); + + assert!( + (mh.similarity(&new_mh, true, false).unwrap() + - bmh.similarity(&new_bmh, true, false).unwrap()) + .abs() + < EPSILON + ); + } +} + +#[test] +fn merge_empty_scaled() { + let max_hash = max_hash_for_scaled(10).unwrap(); + let mut a = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + let mut b = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + + let c = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + let d = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, true); + + a.merge(&c).unwrap(); + b.merge(&d).unwrap(); + + assert!(a.is_empty()); + assert!(b.is_empty()); + + a.add_hash_with_abundance(0, 0); + assert!(a.is_empty()); + b.add_hash_with_abundance(0, 0); + assert!(b.is_empty()); + + a.clear(); + assert!(a.is_empty()); + b.clear(); + assert!(b.is_empty()); +} + +#[test] +fn check_errors() { + let max_hash = max_hash_for_scaled(10).unwrap(); + let mut a = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, false); + let mut b = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 42, max_hash, false); + + // sequence too short: OK + assert!(a.add_sequence(b"AC", false).is_ok()); + assert!(b.add_sequence(b"AC", false).is_ok()); + + // invalid base, throw error + assert!(a.add_sequence(b"ACTGNN", false).is_err()); + assert!(b.add_sequence(b"ACTGNN", false).is_err()); + + a.add_hash(1); + b.add_hash(1); + + // Can't set abundance after something was inserted + assert!(a.enable_abundance().is_err()); + assert!(b.enable_abundance().is_err()); + + // Can't change hash function after insertion + assert!(a.set_hash_function(HashFunctions::murmur64_hp).is_err()); + assert!(b.set_hash_function(HashFunctions::murmur64_hp).is_err()); + + // setting to the same hash function is fine + assert!(a.set_hash_function(HashFunctions::murmur64_DNA).is_ok()); + assert!(b.set_hash_function(HashFunctions::murmur64_DNA).is_ok()); + + let c = KmerMinHash::new(0, 7, HashFunctions::murmur64_DNA, 42, max_hash, true); + let d = KmerMinHashBTree::new(0, 7, HashFunctions::murmur64_DNA, 42, max_hash, true); + + // different ksize + assert!(a.check_compatible(&c).is_err()); + assert!(b.check_compatible(&d).is_err()); + + let c = KmerMinHash::new(0, 6, HashFunctions::murmur64_protein, 42, max_hash, true); + let d = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_protein, 42, max_hash, true); + + // different hash_function + assert!(a.check_compatible(&c).is_err()); + assert!(b.check_compatible(&d).is_err()); + + let c = KmerMinHash::new(0, 6, HashFunctions::murmur64_DNA, 31, max_hash, true); + let d = KmerMinHashBTree::new(0, 6, HashFunctions::murmur64_DNA, 31, max_hash, true); + + // different seed + assert!(a.check_compatible(&c).is_err()); + assert!(b.check_compatible(&d).is_err()); +} + +//fn prop_merge(seq1 in "[ACGT]{6,100}", seq2 in "[ACGT]{6,200}") { + +proptest! { +#[test] +fn load_save_minhash_dayhoff(seq in "FLYS*CWLPGQRMTHINKVADER{0,1000}") { + let max_hash = max_hash_for_scaled(10).unwrap(); + let mut a = KmerMinHash::new(0, 3, HashFunctions::murmur64_dayhoff, 42, max_hash, true); + let mut b = KmerMinHashBTree::new(0, 3, HashFunctions::murmur64_dayhoff, 42, max_hash, true); + + a.add_protein(seq.as_bytes()).unwrap(); + b.add_protein(seq.as_bytes()).unwrap(); + + let mut buffer_a = vec![]; + let mut buffer_b = vec![]; + + { + serde_json::to_writer(&mut buffer_a, &a).unwrap(); + serde_json::to_writer(&mut buffer_b, &b).unwrap(); + } + + assert_eq!(buffer_a, buffer_b); + + let c: KmerMinHash = serde_json::from_reader(&buffer_b[..]).unwrap(); + let d: KmerMinHashBTree = serde_json::from_reader(&buffer_a[..]).unwrap(); + + assert!((a.similarity(&c, false, false).unwrap() - b.similarity(&d, false, false).unwrap()).abs() < EPSILON); + assert!((a.similarity(&c, true, false).unwrap() - b.similarity(&d, true, false).unwrap()).abs() < EPSILON); +} +} + +proptest! { +#[test] +fn load_save_minhash_hp(seq in "FLYS*CWLPGQRMTHINKVADER{0,1000}") { + let max_hash = max_hash_for_scaled(10).unwrap(); + let mut a = KmerMinHash::new(0, 3, HashFunctions::murmur64_hp, 42, max_hash, true); + let mut b = KmerMinHashBTree::new(0, 3, HashFunctions::murmur64_hp, 42, max_hash, true); + + a.add_protein(seq.as_bytes()).unwrap(); + b.add_protein(seq.as_bytes()).unwrap(); + + let mut buffer_a = vec![]; + let mut buffer_b = vec![]; + + { + serde_json::to_writer(&mut buffer_a, &a).unwrap(); + serde_json::to_writer(&mut buffer_b, &b).unwrap(); + } + + assert_eq!(buffer_a, buffer_b); + + let c: KmerMinHash = serde_json::from_reader(&buffer_b[..]).unwrap(); + let d: KmerMinHashBTree = serde_json::from_reader(&buffer_a[..]).unwrap(); + + assert!((a.similarity(&c, false, false).unwrap() - b.similarity(&d, false, false).unwrap()).abs() < EPSILON); + assert!((a.similarity(&c, true, false).unwrap() - b.similarity(&d, true, false).unwrap()).abs() < EPSILON); +} +} + +proptest! { +#[test] +fn load_save_minhash_dna(seq in "ACGTN{0,1000}") { + let max_hash = max_hash_for_scaled(10).unwrap(); + let mut a = KmerMinHash::new(0, 21, HashFunctions::murmur64_DNA, 42, max_hash, true); + let mut b = KmerMinHashBTree::new(0, 21, HashFunctions::murmur64_DNA, 42, max_hash, true); + + a.add_sequence(seq.as_bytes(), true).unwrap(); + b.add_sequence(seq.as_bytes(), true).unwrap(); + + // Can't add_protein to a DNA sig + a.add_protein(seq.as_bytes()).is_err(); + b.add_protein(seq.as_bytes()).is_err(); + + let mut buffer_a = vec![]; + let mut buffer_b = vec![]; + + { + serde_json::to_writer(&mut buffer_a, &a).unwrap(); + serde_json::to_writer(&mut buffer_b, &b).unwrap(); + } + + assert_eq!(buffer_a, buffer_b); + + let c: KmerMinHash = serde_json::from_reader(&buffer_b[..]).unwrap(); + let d: KmerMinHashBTree = serde_json::from_reader(&buffer_a[..]).unwrap(); + + assert!((a.similarity(&c, false, false).unwrap() - b.similarity(&d, false, false).unwrap()).abs() < EPSILON); + assert!((a.similarity(&c, true, false).unwrap() - b.similarity(&d, true, false).unwrap()).abs() < EPSILON); +} +} diff --git a/src/core/tests/signature.rs b/src/core/tests/signature.rs index 60dd07217b..7070de13c6 100644 --- a/src/core/tests/signature.rs +++ b/src/core/tests/signature.rs @@ -1,12 +1,13 @@ -use serde_json; - use std::fs::File; use std::io::BufReader; +use std::io::Read; use std::path::PathBuf; +use needletail::parse_sequence_reader; use sourmash::cmd::ComputeParameters; use sourmash::signature::Signature; use sourmash::signature::SigsTrait; +use sourmash::sketch::Sketch; #[test] fn load_signature() { @@ -26,6 +27,43 @@ fn load_signature() { assert_eq!(sig.hash_function(), "0.murmur64"); assert_eq!(sig.name(), "s10+s11"); assert_eq!(sig.size(), 4); + + let sketches = sig.sketches(); + match sketches[0] { + Sketch::MinHash(_) => (), + Sketch::LargeMinHash(_) => panic!(), + Sketch::UKHS(_) => panic!(), + } +} + +#[test] +fn load_signature_2() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/47.abunds.fa.sig"); + + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + + assert_eq!(sigs.len(), 1); + + let sig = sigs.get(0).unwrap(); + assert_eq!(sig.class(), "sourmash_signature"); + assert_eq!(sig.email(), ""); + assert_eq!(sig.filename(), "47.fa"); + assert_eq!(sig.hash_function(), "0.murmur64"); + assert_eq!( + sig.name(), + "NC_009665.1 Shewanella baltica OS185, complete genome" + ); + assert_eq!(sig.size(), 1); + + let sketches = sig.sketches(); + match sketches[0] { + Sketch::MinHash(_) => (), + Sketch::LargeMinHash(_) => panic!(), + Sketch::UKHS(_) => panic!(), + } } #[test] @@ -106,3 +144,33 @@ fn signature_add_protein() { assert_eq!(sketches[0].size(), 3); assert_eq!(sketches[1].size(), 2); } + +#[test] +fn signature_add_sequence_cp() { + let mut cp = ComputeParameters::default(); + cp.dayhoff = true; + cp.protein = true; + cp.hp = true; + cp.dna = true; + + let mut sig = Signature::from_params(&cp); + + let mut data: Vec = vec![]; + let mut f = File::open("../../tests/test-data/ecoli.genes.fna").unwrap(); + let _ = f.read_to_end(&mut data); + + parse_sequence_reader( + &data[..], + |_| {}, + |rec| { + sig.add_sequence(&rec.seq, false).unwrap(); + }, + ) + .unwrap(); + + let sketches = sig.sketches(); + assert_eq!(sketches.len(), 12); + for sk in sketches { + assert_eq!(sk.size(), 500); + } +} diff --git a/src/core/tests/test.rs b/src/core/tests/test.rs index f6a471df51..1df0e75049 100644 --- a/src/core/tests/test.rs +++ b/src/core/tests/test.rs @@ -1,5 +1,3 @@ -use sourmash; - #[test] fn test_murmur() { assert_eq!(sourmash::_hash_murmur(b"ACG", 42), 1731421407650554201)