|
| 1 | +use std::fs::File; |
| 2 | +use std::io::{BufRead, BufReader, Read, Seek}; |
| 3 | +use crate::ringo::math::similarity::tanimoto::tanimoto_bitset; |
| 4 | +use crate::ringo::molecule::smiles::reader::molecule::parse_molecule; |
| 5 | +use crate::ringo::ringo::index_item::IndexItem; |
1 | 6 |
|
/// A single hit produced by [`similarity_search`].
#[derive(Debug, Clone, PartialEq)]
pub struct SearchResult {
    /// The line read from the SMILES file for the matching entry, exactly as
    /// returned by `read_line` (any line terminator is kept).
    pub line: String,
    /// Similarity score between the query and this entry.
    pub similarity: f32,
}
| 11 | + |
| 12 | +pub fn similarity_search(smiles_file: &str, query: &str, min_similarity: f32, limit: usize) -> Vec<SearchResult> { |
| 13 | + let query = parse_molecule(query).unwrap().1; |
| 14 | + let query_fp = query.ecfp(2, 512); |
| 15 | + |
| 16 | + // smiles file |
| 17 | + let fis = File::open(&smiles_file).expect("Could not open file"); |
| 18 | + let mut reader = BufReader::new(fis); |
| 19 | + |
| 20 | + //fingerprings file |
| 21 | + let fif = File::open(smiles_file.to_owned() + ".fp").expect("Could not open file"); |
| 22 | + let file_len = fif.metadata().unwrap().len(); |
| 23 | + let index_item_size = 72u8; |
| 24 | + let index_count = file_len / index_item_size as u64; |
| 25 | + let mut buf_reader = BufReader::new(fif); |
| 26 | + |
| 27 | + let mut results = Vec::new(); |
| 28 | + |
| 29 | + for i in 0..index_count { |
| 30 | + // read index item from file |
| 31 | + let mut buf = vec![0u8; index_item_size as usize]; |
| 32 | + buf_reader.read_exact(&mut buf).unwrap(); |
| 33 | + |
| 34 | + // decode index item |
| 35 | + let index_item: IndexItem = bincode::decode_from_slice(&buf, bincode::config::standard()).unwrap().0; |
| 36 | + |
| 37 | + // calculate similarity |
| 38 | + let similarity = tanimoto_bitset(&index_item.fingerprint.0, &query_fp.0); |
| 39 | + |
| 40 | + // print similarity if it is greater than min_similarity |
| 41 | + if similarity >= min_similarity { |
| 42 | + let position = index_item.position; |
| 43 | + reader.seek(std::io::SeekFrom::Start(position as u64)).unwrap(); |
| 44 | + |
| 45 | + let mut line = String::new(); |
| 46 | + reader.read_line(&mut line).unwrap(); |
| 47 | + // println!("{i} {similarity} {position} {line}"); |
| 48 | + results.push(SearchResult { |
| 49 | + line: line, |
| 50 | + similarity: similarity |
| 51 | + }); |
| 52 | + } |
| 53 | + } |
| 54 | + |
| 55 | + results |
| 56 | +} |
| 57 | + |
/// Runs the same query against `molecules.smi` at two similarity thresholds
/// and checks the number of hits returned for each.
///
/// Relies on `molecules.smi` and its `.fp` index being present relative to
/// the working directory the test runs in.
#[test]
fn test_similarity_search() {
    let query = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O";

    // (minimum similarity, expected number of hits), checked in this order.
    for (threshold, expected_hits) in [(0.7, 1), (0.5, 2)] {
        let hits = similarity_search("molecules.smi", query, threshold, 100);
        assert_eq!(hits.len(), expected_hits);
    }
}
0 commit comments