Skip to content

Commit bf86e2c

Browse files
committed
#3: proper fingerprints writing with fixed size, #5: fingerprints similarity search
1 parent 14763ba commit bf86e2c

File tree

5 files changed

+89
-10
lines changed

5 files changed

+89
-10
lines changed

molecules.smi

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
C
2-
CC
1+
CC(C)CC1=CC=C(C=C1)C(C)C(=O)O Ibuprofen
2+
CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O Naproxen

src/ringo/molecule/model/molecule.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use crate::ringo::molecule::model::atom::Atom;
33
use crate::ringo::molecule::model::bond::Bond;
44
use crate::ringo::molecule::model::element::atomic_weight;
55
use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
6-
use crate::ringo::ringo::fingerprint::Fingerprint;
6+
use crate::ringo::ringo::fingerprint::{Fingerprint, FINGERPRINT_SIZE};
77
use fixedbitset::FixedBitSet;
88
use petgraph::stable_graph::{EdgeIndex, NodeIndex, StableGraph};
99
use petgraph::visit::EdgeRef;
@@ -89,7 +89,7 @@ impl Molecule {
8989

9090
// TODO: move to Descriptors crate
9191
pub fn ecfp(&self, radius: usize, fp_length: usize) -> Fingerprint {
92-
let mut fp = FixedBitSet::new();
92+
let mut fp = FixedBitSet::with_capacity(FINGERPRINT_SIZE);
9393

9494
for node in self.graph.node_indices() {
9595
ecfp_recursive(

src/ringo/ringo/fingerprint.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,11 @@ mod tests {
4141
fp.0.set(1, true);
4242
fp.0.set(17, true);
4343

44-
let encoded = bincode::encode_to_vec(&fp, bincode::config::standard()).unwrap();
44+
let mut buf = vec![0u8; FINGERPRINT_SIZE / 8];
45+
let encoded = bincode::encode_into_slice(&fp, buf.as_mut_slice(), bincode::config::standard()).unwrap();
46+
4547
let decoded: Fingerprint =
46-
bincode::decode_from_slice(&encoded, bincode::config::standard())
48+
bincode::decode_from_slice(&buf, bincode::config::standard())
4749
.unwrap()
4850
.0;
4951
assert_eq!(decoded.0.ones().collect::<Vec<usize>>(), vec![1, 17]);

src/ringo/ringo/index.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,37 @@
11
use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
22
use crate::ringo::ringo::index_item::IndexItem;
33
use std::fs::File;
4-
use std::io::BufRead;
4+
use std::io::{BufRead, BufWriter, Write};
5+
use bincode::{encode_into_slice};
6+
use crate::ringo::ringo::fingerprint::FINGERPRINT_SIZE;
7+
8+
#[cfg(windows)]
9+
const LINE_ENDING_LENGTH: usize = 2;
10+
#[cfg(not(windows))]
11+
const LINE_ENDING_LENGTH: usize = 1;
512

613
fn index(smiles_file: &str) {
714
// open file for reading
815
let fi = File::open(smiles_file).expect("Could not open file");
916

1017
// open binary file for index
1118
let mut offset = 0;
12-
// let mut fo = File::create(smiles_file.to_owned() + ".fp");
19+
let mut fo = File::create(smiles_file.to_owned() + ".fp");
20+
let mut writer = BufWriter::new(fo.unwrap());
21+
1322
for line in std::io::BufReader::new(fi).lines() {
1423
let line = line.unwrap();
1524
let molecule = parse_molecule(&line).unwrap().1;
16-
IndexItem {
25+
let index_item = IndexItem {
1726
position: offset,
1827
fingerprint: molecule.ecfp(2, 512),
1928
};
20-
offset += line.len() + 1;
29+
offset += line.len() + LINE_ENDING_LENGTH;
30+
31+
let mut buf = vec![0u8; FINGERPRINT_SIZE / 8 + 8];
32+
33+
encode_into_slice(&index_item, buf.as_mut_slice(), bincode::config::standard()).unwrap();
34+
writer.write(&buf).unwrap();
2135
}
2236
}
2337

src/ringo/ringo/search.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,64 @@
1+
use std::fs::File;
2+
use std::io::{BufRead, BufReader, Read, Seek};
3+
use crate::ringo::math::similarity::tanimoto::tanimoto_bitset;
4+
use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
5+
use crate::ringo::ringo::index_item::IndexItem;
16

7+
pub struct SearchResult {
8+
pub line: String,
9+
pub similarity: f32
10+
}
11+
12+
pub fn similarity_search(smiles_file: &str, query: &str, min_similarity: f32, limit: usize) -> Vec<SearchResult> {
13+
let query = parse_molecule(query).unwrap().1;
14+
let query_fp = query.ecfp(2, 512);
15+
16+
// smiles file
17+
let fis = File::open(&smiles_file).expect("Could not open file");
18+
let mut reader = BufReader::new(fis);
19+
20+
// fingerprints file
21+
let fif = File::open(smiles_file.to_owned() + ".fp").expect("Could not open file");
22+
let file_len = fif.metadata().unwrap().len();
23+
let index_item_size = 72u8;
24+
let index_count = file_len / index_item_size as u64;
25+
let mut buf_reader = BufReader::new(fif);
26+
27+
let mut results = Vec::new();
28+
29+
for i in 0..index_count {
30+
// read index item from file
31+
let mut buf = vec![0u8; index_item_size as usize];
32+
buf_reader.read_exact(&mut buf).unwrap();
33+
34+
// decode index item
35+
let index_item: IndexItem = bincode::decode_from_slice(&buf, bincode::config::standard()).unwrap().0;
36+
37+
// calculate similarity
38+
let similarity = tanimoto_bitset(&index_item.fingerprint.0, &query_fp.0);
39+
40+
// collect the matching line if similarity is at least min_similarity
41+
if similarity >= min_similarity {
42+
let position = index_item.position;
43+
reader.seek(std::io::SeekFrom::Start(position as u64)).unwrap();
44+
45+
let mut line = String::new();
46+
reader.read_line(&mut line).unwrap();
47+
// println!("{i} {similarity} {position} {line}");
48+
results.push(SearchResult {
49+
line: line,
50+
similarity: similarity
51+
});
52+
}
53+
}
54+
55+
results
56+
}
57+
58+
#[test]
59+
fn test_similarity_search() {
60+
let results = similarity_search("molecules.smi", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 0.7, 100);
61+
assert_eq!(results.len(), 1);
62+
let results = similarity_search("molecules.smi", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 0.5, 100);
63+
assert_eq!(results.len(), 2);
64+
}

0 commit comments

Comments
 (0)