From df4ad3eca275b678951a41f8040a093b3af19451 Mon Sep 17 00:00:00 2001 From: Mikhail Kviatkovskii Date: Sun, 7 Apr 2024 21:48:13 +0400 Subject: [PATCH] Fixed SMILES reader to handle bonds --- src/indigo/molecule/model/molecule.rs | 12 ++ src/indigo/molecule/smiles/reader/molecule.rs | 196 +++++++++++++++++- 2 files changed, 207 insertions(+), 1 deletion(-) diff --git a/src/indigo/molecule/model/molecule.rs b/src/indigo/molecule/model/molecule.rs index fe824e0..479e242 100644 --- a/src/indigo/molecule/model/molecule.rs +++ b/src/indigo/molecule/model/molecule.rs @@ -38,6 +38,18 @@ impl Molecule { return self.graph.edge_weight(edge); } + pub fn get_bond_by_atoms(&self, ni1: NodeIndex, ni2: NodeIndex) -> Option<&Bond> { + let e = self.graph.find_edge_undirected(ni1, ni2); + if e.is_none() { + return None; + } + return self.graph.edge_weight(e.unwrap().0); + } + + pub fn has_bond(&self, ni1: NodeIndex, ni2: NodeIndex) -> bool { + return self.graph.find_edge_undirected(ni1, ni2).is_some(); + } + pub fn count_atoms(&self) -> usize { return self.graph.node_count(); } diff --git a/src/indigo/molecule/smiles/reader/molecule.rs b/src/indigo/molecule/smiles/reader/molecule.rs index a379312..b7418b8 100644 --- a/src/indigo/molecule/smiles/reader/molecule.rs +++ b/src/indigo/molecule/smiles/reader/molecule.rs @@ -19,6 +19,7 @@ fn parse_molecule(input: &str) -> IResult<&str, Molecule> { let mut molecule = Molecule::new(); let mut open_cycles: HashMap = HashMap::new(); let mut stack: Vec<(NodeIndex, BondOrder)> = Vec::new(); + let mut pending_bonds: Vec<(NodeIndex, NodeIndex, Bond)> = Vec::new(); let mut parse_atoms_and_bonds = many0(alt(( map(parse_atom, |atom| (Some(atom), None, None, None)), @@ -33,7 +34,6 @@ fn parse_molecule(input: &str) -> IResult<&str, Molecule> { let mut prev_node = NodeIndex::end(); let mut prev_bond = BondOrder::Single; - let mut pending_bonds: Vec<(NodeIndex, NodeIndex, Bond)> = Vec::new(); for (atom, bond, cycle_digit, open_paren) in atoms_and_bonds { if let Some(open) = open_paren { @@ -54,6 +54,7 @@ fn parse_molecule(input: &str) -> IResult<&str, Molecule> { } else if let Some(digit) = cycle_digit { if let Some(open_node) = open_cycles.remove(&digit) { pending_bonds.push((prev_node, open_node, Bond { order: prev_bond })); + prev_bond = BondOrder::Single; } else { open_cycles.insert(digit, prev_node); } @@ -131,6 +132,33 @@ mod tests { ); } + #[test] + fn parse_molecule_branch() { + let m = parse_molecule("C(O)N").unwrap().1; + assert_eq!(m.count_atoms(), 3); + assert_eq!(m.count_bonds(), 2); + assert_eq!( + m.get_atom(NodeIndex::new(0)).unwrap().element.atomic_number, + 6 + ); + assert_eq!( + m.get_atom(NodeIndex::new(1)).unwrap().element.atomic_number, + 8 + ); + assert_eq!( + m.get_atom(NodeIndex::new(2)).unwrap().element.atomic_number, + 7 + ); + assert_eq!( + m.get_bond(EdgeIndex::new(0)).unwrap().order, + BondOrder::Single + ); + assert_eq!( + m.get_bond(EdgeIndex::new(1)).unwrap().order, + BondOrder::Single + ); + } + #[test] fn parse_molecule_c1cc1() { let m = parse_molecule("C1P=N#1").unwrap().1; @@ -177,6 +205,172 @@ mod tests { ); } + #[test] + fn parse_molecule_branch_double_bond() { + let m = parse_molecule("C(=O)N").unwrap().1; + assert_eq!(m.count_atoms(), 3); + assert_eq!(m.count_bonds(), 2); + assert_eq!( + m.get_atom(NodeIndex::new(0)).unwrap().element.atomic_number, + 6 + ); + assert_eq!( + m.get_atom(NodeIndex::new(1)).unwrap().element.atomic_number, + 8 + ); + assert_eq!( + m.get_atom(NodeIndex::new(2)).unwrap().element.atomic_number, + 7 + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(1)).unwrap().order, + BondOrder::Double + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(2)).unwrap().order, + BondOrder::Single + ); + } + + #[test] + fn parse_molecule_branch_double_bonds() { + let m = parse_molecule("C(=O)=N").unwrap().1; + assert_eq!(m.count_atoms(), 3); + assert_eq!(m.count_bonds(), 2); + assert_eq!( + m.get_atom(NodeIndex::new(0)).unwrap().element.atomic_number, + 6 + ); + assert_eq!( + m.get_atom(NodeIndex::new(1)).unwrap().element.atomic_number, + 8 + ); + assert_eq!( + m.get_atom(NodeIndex::new(2)).unwrap().element.atomic_number, + 7 + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(1)).unwrap().order, + BondOrder::Double + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(2)).unwrap().order, + BondOrder::Double + ); + } + + #[test] + fn parse_molecule_branch_recursive() { + let m = parse_molecule("C(=S(=O)P)N").unwrap().1; + assert_eq!(m.count_atoms(), 5); + assert_eq!(m.count_bonds(), 4); + assert_eq!( + m.get_atom(NodeIndex::new(0)).unwrap().element.atomic_number, + 6 + ); + assert_eq!( + m.get_atom(NodeIndex::new(1)).unwrap().element.atomic_number, + 16 + ); + assert_eq!( + m.get_atom(NodeIndex::new(2)).unwrap().element.atomic_number, + 8 + ); + assert_eq!( + m.get_atom(NodeIndex::new(3)).unwrap().element.atomic_number, + 15 + ); + assert_eq!( + m.get_atom(NodeIndex::new(4)).unwrap().element.atomic_number, + 7 + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(1)).unwrap().order, + BondOrder::Double + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(1), NodeIndex::new(2)).unwrap().order, + BondOrder::Double + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(1), NodeIndex::new(3)).unwrap().order, + BondOrder::Single + ); + assert_eq!( + m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(4)).unwrap().order, + BondOrder::Single + ); + } + + #[test] + fn parse_molecule_cycle_double() { + let m = parse_molecule("N1OC=1S").unwrap().1; + assert_eq!(m.count_atoms(), 4); + assert_eq!(m.count_bonds(), 4); + assert_eq!( + m.get_atom(NodeIndex::new(0)).unwrap().element.atomic_number, + 7 + ); + assert_eq!( + m.get_atom(NodeIndex::new(1)).unwrap().element.atomic_number, + 8 + ); + assert_eq!( + m.get_atom(NodeIndex::new(2)).unwrap().element.atomic_number, + 6 + ); + assert_eq!( + m.get_atom(NodeIndex::new(3)).unwrap().element.atomic_number, + 16 + ); + assert!(m.has_bond(NodeIndex::new(0), NodeIndex::new(1))); + assert!(m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(1)).unwrap().order == BondOrder::Single); + assert!(m.has_bond(NodeIndex::new(1), NodeIndex::new(2))); + assert!(m.get_bond_by_atoms(NodeIndex::new(1), NodeIndex::new(2)).unwrap().order == BondOrder::Single); + assert!(m.has_bond(NodeIndex::new(0), NodeIndex::new(2))); + assert!(m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(2)).unwrap().order == BondOrder::Double); + assert!(m.has_bond(NodeIndex::new(2), NodeIndex::new(3))); + assert!(m.get_bond_by_atoms(NodeIndex::new(2), NodeIndex::new(3)).unwrap().order == BondOrder::Single); + } + + + #[test] + fn parse_molecule_cycle_branch() { + let m = parse_molecule("N1C(=P)S=1O").unwrap().1; + assert_eq!(m.count_atoms(), 5); + assert_eq!(m.count_bonds(), 5); + assert_eq!( + m.get_atom(NodeIndex::new(0)).unwrap().element.atomic_number, + 7 + ); + assert_eq!( + m.get_atom(NodeIndex::new(1)).unwrap().element.atomic_number, + 6 + ); + assert_eq!( + m.get_atom(NodeIndex::new(2)).unwrap().element.atomic_number, + 15 + ); + assert_eq!( + m.get_atom(NodeIndex::new(3)).unwrap().element.atomic_number, + 16 + ); + assert_eq!( + m.get_atom(NodeIndex::new(4)).unwrap().element.atomic_number, + 8 + ); + assert!(m.has_bond(NodeIndex::new(0), NodeIndex::new(1))); + assert!(m.get_bond_by_atoms(NodeIndex::new(0), NodeIndex::new(1)).unwrap().order == BondOrder::Single); + assert!(m.has_bond(NodeIndex::new(1), NodeIndex::new(2))); + assert!(m.get_bond_by_atoms(NodeIndex::new(1), NodeIndex::new(2)).unwrap().order == BondOrder::Double); + assert!(m.has_bond(NodeIndex::new(1), NodeIndex::new(3))); + assert!(m.get_bond_by_atoms(NodeIndex::new(1), NodeIndex::new(3)).unwrap().order == BondOrder::Single); + assert!(m.has_bond(NodeIndex::new(3), NodeIndex::new(4))); + assert!(m.get_bond_by_atoms(NodeIndex::new(3), NodeIndex::new(4)).unwrap().order == BondOrder::Single); + assert!(m.has_bond(NodeIndex::new(3), NodeIndex::new(0))); + assert!(m.get_bond_by_atoms(NodeIndex::new(3), NodeIndex::new(0)).unwrap().order == BondOrder::Double); + } + #[test] fn parse_molecule_c1cc() { assert!(parse_molecule("C1CC").is_err())