Skip to content

Commit 1a6b78d

Browse files
committed
#3: switched to FixedBitSet, using bincode to perform binary encoding and decoding
1 parent 8367356 commit 1a6b78d

File tree

8 files changed

+108
-86
lines changed

8 files changed

+108
-86
lines changed

Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,7 @@ repository = "https://github.com/mkviatkovskii/ringo.git"
1111
version = "0.0.1"
1212

1313
[dependencies]
14-
bitcode = "0.6.0"
15-
bit-set = "0.6.0"
16-
bit-vec = "0.7.0"
14+
bincode = "2.0.0-rc.3"
15+
fixedbitset = "0.5.7"
1716
nom = "7.1.3"
1817
petgraph = "0.6.5"

src/ringo/math/similarity/tanimoto.rs

Lines changed: 22 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -1,72 +1,34 @@
1-
use bit_vec::BitVec;
1+
use fixedbitset::FixedBitSet;
22

3-
pub fn tanimoto_vec(a: &BitVec, b: &BitVec) -> f32 {
3+
pub fn tanimoto_bitset(a: &FixedBitSet, b: &FixedBitSet) -> f32 {
44
let mut and_ = a.clone();
5-
let mut or_ = a.clone();
6-
and_.and(b);
7-
or_.or(b);
8-
9-
let mut dividend: u32 = 0;
10-
for b in and_.blocks() {
11-
dividend += b.count_ones();
12-
}
13-
let mut divisor: u32 = 0;
14-
for b in or_.blocks() {
15-
divisor += b.count_ones();
16-
}
17-
18-
return dividend as f32 / divisor as f32;
19-
}
20-
21-
pub unsafe fn tanimoto_array(a: &[u64; 4], b: &[u64; 4]) -> f32 {
22-
let mut dividend: u32 = 0;
23-
let mut divisor: u32 = 0;
24-
25-
for i in 0..4 {
26-
dividend += ((a[i] & b[i]) as i64).count_ones();
27-
divisor += ((a[i] | b[i]) as i64).count_ones();
28-
}
29-
return dividend as f32 / divisor as f32;
5+
and_.intersect_with(b);
6+
return and_.count_ones(..) as f32 / (a.count_ones(..) + b.count_ones(..) - and_.count_ones(..)) as f32;
307
}
318

329
#[cfg(test)]
3310
mod tests {
34-
use bit_vec::BitVec;
35-
use crate::ringo::math::similarity::tanimoto::{tanimoto_array, tanimoto_vec};
11+
use fixedbitset::FixedBitSet;
12+
use crate::ringo::math::similarity::tanimoto::{tanimoto_bitset};
3613

3714
#[test]
38-
fn test_tanimoto_vec_033() {
39-
let a: BitVec = BitVec::from_bytes(&[0b00000101]);
40-
let b = BitVec::from_bytes(&[0b00000011]);
41-
42-
assert_eq!(tanimoto_vec(&a, &b), 0.33333334);
15+
fn test_tanimoto_bitset_033() {
16+
let mut a = FixedBitSet::with_capacity(8);
17+
a.insert(0);
18+
a.insert(2);
19+
let mut b = FixedBitSet::with_capacity(8);
20+
b.insert(0);
21+
b.insert(1);
22+
assert_eq!(tanimoto_bitset(&a, &b), 0.33333334);
4323
}
4424

4525
#[test]
46-
fn test_tanimoto_vec_05() {
47-
let a: BitVec = BitVec::from_bytes(&[0b0000001]);
48-
let b = BitVec::from_bytes(&[0b00000011]);
49-
50-
assert_eq!(tanimoto_vec(&a, &b), 0.5);
26+
fn test_tanimoto_bitset_05() {
27+
let mut a = FixedBitSet::with_capacity(8);
28+
a.insert(0);
29+
let mut b = FixedBitSet::with_capacity(8);
30+
b.insert(0);
31+
b.insert(1);
32+
assert_eq!(tanimoto_bitset(&a, &b), 0.5);
5133
}
52-
53-
#[test]
54-
fn test_tanimoto_array_033() {
55-
let a: [u64; 4] = [0b00000101, 0, 0, 0];
56-
let b = [0b00000011, 0, 0, 0];
57-
58-
unsafe {
59-
assert_eq!(tanimoto_array(&a, &b), 0.33333334);
60-
}
61-
}
62-
63-
#[test]
64-
fn test_tanimoto_array_05() {
65-
let a: [u64; 4] = [0b00000001, 0, 0, 0];
66-
let b = [0b00000011, 0, 0, 0];
67-
68-
unsafe {
69-
assert_eq!(tanimoto_array(&a, &b), 0.5);
70-
}
71-
}
72-
}
34+
}

src/ringo/molecule/model/molecule.rs

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,16 @@
11
use crate::ringo::molecule::model::atom::Atom;
22
use crate::ringo::molecule::model::bond::Bond;
33
use crate::ringo::molecule::model::element::atomic_weight;
4-
use bit_set::BitSet;
5-
use bit_vec::BitVec;
4+
use crate::ringo::ringo::fingerprint::Fingerprint;
65
use petgraph::stable_graph::{EdgeIndex, NodeIndex, StableGraph};
76
use petgraph::visit::EdgeRef;
87
use petgraph::Undirected;
98
use std::borrow::Borrow;
109
use std::collections::{BTreeSet};
1110
use std::collections::hash_map::DefaultHasher;
12-
use std::fmt::Debug;
1311
use std::hash::Hasher;
14-
use crate::ringo::math::similarity::tanimoto::tanimoto_vec;
12+
use fixedbitset::FixedBitSet;
13+
use crate::ringo::math::similarity::tanimoto::tanimoto_bitset;
1514
use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
1615

1716
pub struct Molecule {
@@ -89,14 +88,14 @@ impl Molecule {
8988
}
9089

9190
// TODO: move to Descriptors crate
92-
pub fn ecfp(&self, radius: usize, fp_length: usize) -> BitVec {
93-
let mut fp = BitSet::new();
91+
pub fn ecfp(&self, radius: usize, fp_length: usize) -> Fingerprint {
92+
let mut fp = FixedBitSet::new();
9493

9594
for node in self.graph.node_indices() {
9695
ecfp_recursive(&self.graph, radius, 1, node, &mut fp, fp_length, &mut DefaultHasher::new());
9796
}
9897

99-
BitVec::from_fn(fp_length, |idx| fp.contains(idx))
98+
Fingerprint(fp)
10099
}
101100
}
102101

@@ -105,7 +104,7 @@ fn ecfp_recursive(
105104
radius: usize,
106105
depth: usize,
107106
node: NodeIndex,
108-
fp: &mut BitSet,
107+
fp: &mut FixedBitSet,
109108
fp_length: usize,
110109
hasher: &mut DefaultHasher,
111110
) {
@@ -142,6 +141,6 @@ fn ecfp_recursive(
142141
fn test_ecfp() {
143142
let ecfp_ibuprofen = parse_molecule("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O").unwrap().1.ecfp(2, 128);
144143
let ecfp_naproxen = parse_molecule("CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O").unwrap().1.ecfp(2, 128);
145-
let sim = tanimoto_vec(&ecfp_ibuprofen, &ecfp_naproxen);
144+
let sim = tanimoto_bitset(&ecfp_ibuprofen.0, &ecfp_naproxen.0);
146145
assert!(0.53 < sim && sim < 0.54);
147146
}

src/ringo/molecule/smiles/reader/molecule.rs

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -388,10 +388,7 @@ mod tests {
388388
println!("{}: ", smiles);
389389
let m = parse_molecule(smiles).unwrap().1;
390390
let result = m.ecfp(2, 512);
391-
for bit in result {
392-
print!("{}", if bit { 1 } else { 0 });
393-
}
394-
println!("");
391+
println!("{:?}", result);
395392
}
396393
}
397394
}

src/ringo/ringo.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
11
mod index;
22
mod search;
33
mod index_item;
4+
pub(crate) mod fingerprint;

src/ringo/ringo/fingerprint.rs

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
use bincode::de::BorrowDecoder;
2+
use bincode::error::{DecodeError, EncodeError};
3+
use fixedbitset::{Block, FixedBitSet};
4+
5+
pub const FINGERPRINT_SIZE: usize = 512;
6+
7+
#[derive(Debug)]
8+
pub struct Fingerprint(pub FixedBitSet);
9+
10+
impl bincode::Encode for Fingerprint {
11+
fn encode<E: bincode::enc::Encoder>(&self, encoder: &mut E) -> Result<(), EncodeError> {
12+
self.0.as_slice().encode(encoder)?;
13+
Ok(())
14+
}
15+
}
16+
17+
impl bincode::Decode for Fingerprint {
18+
fn decode<D: bincode::de::Decoder>(decoder: &mut D) -> Result<Self, DecodeError> {
19+
let slice = Vec::<Block>::decode(decoder)?;
20+
let fp = FixedBitSet::with_capacity_and_blocks(FINGERPRINT_SIZE, slice);
21+
Ok(Fingerprint(fp))
22+
}
23+
}
24+
25+
impl<'de> bincode::BorrowDecode<'de> for Fingerprint {
26+
fn borrow_decode<D: BorrowDecoder<'de>>(decoder: &mut D) -> Result<Self, DecodeError> {
27+
let slice = Vec::<Block>::borrow_decode(decoder)?;
28+
let fp = FixedBitSet::with_capacity_and_blocks(FINGERPRINT_SIZE, slice);
29+
Ok(Fingerprint(fp))
30+
}
31+
}
32+
33+
#[cfg(test)]
34+
mod tests {
35+
use fixedbitset::{FixedBitSet};
36+
use crate::ringo::ringo::fingerprint::{Fingerprint, FINGERPRINT_SIZE};
37+
38+
#[test]
39+
fn test_fingerprint_encode_decode() {
40+
let mut fp = Fingerprint(FixedBitSet::with_capacity(FINGERPRINT_SIZE));
41+
fp.0.set(1, true);
42+
fp.0.set(17, true);
43+
44+
let encoded = bincode::encode_to_vec(&fp, bincode::config::standard()).unwrap();
45+
let decoded: Fingerprint = bincode::decode_from_slice(&encoded, bincode::config::standard()).unwrap().0;
46+
assert_eq!(decoded.0.ones().collect::<Vec<usize>>(), vec![1, 17]);
47+
}
48+
}

src/ringo/ringo/index.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,12 @@ fn index(smiles_file: &str) {
1313
for line in std::io::BufReader::new(fi).lines() {
1414
let line = line.unwrap();
1515
let molecule = parse_molecule(&line).unwrap().1;
16-
IndexItem::new(offset, molecule.ecfp(2, 512));
16+
IndexItem{position: offset, fingerprint: molecule.ecfp(2, 512)};
1717
offset += line.len() + 1;
1818
}
1919
}
2020

21+
2122
#[test]
2223
fn test_index() {
2324
index("molecules.smi");

src/ringo/ringo/index_item.rs

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,30 @@
1-
use bit_vec::BitVec;
1+
use bincode::{Decode, Encode};
2+
use crate::ringo::ringo::fingerprint::Fingerprint;
23

4+
#[derive(Debug, Encode, Decode)]
35
pub struct IndexItem {
46
pub position: usize,
5-
pub fingerprint: Vec<u8>
7+
pub fingerprint: Fingerprint
68
}
79

8-
impl IndexItem {
9-
pub fn new(position: usize, fingerprint: BitVec) -> IndexItem {
10-
IndexItem {
11-
position,
12-
fingerprint: fingerprint.to_bytes()
13-
}
10+
#[cfg(test)]
11+
mod tests {
12+
use bincode::config::standard;
13+
use bincode::{decode_from_slice, encode_to_vec};
14+
use fixedbitset::FixedBitSet;
15+
use crate::ringo::ringo::index_item::IndexItem;
16+
use crate::ringo::ringo::fingerprint::Fingerprint;
17+
18+
#[test]
19+
fn test_index_item_encode_decode() {
20+
let fp = Fingerprint(FixedBitSet::with_capacity(512));
21+
let mut ii = IndexItem {position: 0, fingerprint: fp};
22+
ii.position = 0;
23+
ii.fingerprint.0.set(1, true);
24+
ii.fingerprint.0.set(17, true);
25+
26+
let encoded = encode_to_vec(&ii, standard()).unwrap();
27+
let decoded: IndexItem = decode_from_slice(&encoded, standard()).unwrap().0;
28+
assert_eq!(decoded.fingerprint.0.ones().collect::<Vec<usize>>(), vec![1, 17]);
1429
}
15-
}
30+
}

0 commit comments

Comments
 (0)