From 88a4ad698887db1a1558fd14864438ee1aa02202 Mon Sep 17 00:00:00 2001 From: Aliaksandr Dziarkach <18146690+AliaksandrDziarkach@users.noreply.github.com> Date: Thu, 14 Sep 2023 20:32:37 +0300 Subject: [PATCH] #1254 SMARTS with component-level grouping saved without '()' (#1266) Co-authored-by: Aliaksandr Dziarkach --- api/cpp/tests/basic/reaction.cpp | 4 +- .../integration/ref/formats/smarts.py.out | 3 + .../ref/rsmiles/rsmiles_smarts.py.out | 1 + api/tests/integration/tests/formats/smarts.py | 15 +++++ .../tests/rsmiles/rsmiles_smarts.py | 20 ++++-- core/indigo-core/molecule/query_molecule.h | 2 + .../molecule/src/query_molecule.cpp | 20 ++++++ .../indigo-core/molecule/src/smiles_saver.cpp | 64 ++++++++++++++++++- .../reaction/src/rsmiles_loader.cpp | 25 +------- .../reaction/src/rsmiles_saver.cpp | 2 +- core/indigo-core/tests/tests/formats.cpp | 18 ++++++ core/indigo-core/tests/tests/reaction.cpp | 11 +++- 12 files changed, 152 insertions(+), 33 deletions(-) diff --git a/api/cpp/tests/basic/reaction.cpp b/api/cpp/tests/basic/reaction.cpp index d046228d65..cd1b21e78c 100644 --- a/api/cpp/tests/basic/reaction.cpp +++ b/api/cpp/tests/basic/reaction.cpp @@ -29,7 +29,7 @@ TEST(Reaction, automapReactionSmarts) const auto& session = IndigoSession::create(); auto reaction = session->loadReaction("[C:1]=[C:2][C:3].[C:4]=[C:5][C:6]>>[C:3][C:2]=[C:5][C:6]"); reaction.automap(IndigoAutomapMode::CLEAR); - ASSERT_STREQ("[#6]=[#6]-[#6].[#6]=[#6]-[#6]>>[#6]-[#6]=[#6]-[#6] |^3:0,3,^1:1,4,7,8|", reaction.smarts().c_str()); + ASSERT_STREQ("[#6]=[#6]-[#6].[#6]=[#6]-[#6]>>[#6]-[#6]=[#6]-[#6]", reaction.smarts().c_str()); } TEST(Reaction, automapQueryReactionSmarts) @@ -37,5 +37,5 @@ TEST(Reaction, automapQueryReactionSmarts) const auto& session = IndigoSession::create(); auto reaction = session->loadQueryReaction("[C:1]=[C:2][C:3].[C:4]=[C:5][C:6]>>[C:3][C:2]=[C:5][C:6] |$;;R1;;;R2;R1;;;R2$|"); reaction.automap(IndigoAutomapMode::CLEAR); - ASSERT_STREQ("[#6]=[#6]-[#6].[#6]=[#6]-[#6]>>[#6]-[#6]=[#6]-[#6] |$;;R1;;;R2;R1;;;R2$|", reaction.smarts().c_str()); + ASSERT_STREQ("[#6]=[#6]-[#6].[#6]=[#6]-[#6]>>[#6]-[#6]=[#6]-[#6]", reaction.smarts().c_str()); } diff --git a/api/tests/integration/ref/formats/smarts.py.out b/api/tests/integration/ref/formats/smarts.py.out index 605a3dc506..c61ce65b05 100644 --- a/api/tests/integration/ref/formats/smarts.py.out +++ b/api/tests/integration/ref/formats/smarts.py.out @@ -6,3 +6,6 @@ CC[C+5]CCCCC CC[C+5]CCCCC **** Load and Save as Query with not list **** [#6]-[#6]-[!#5!#6!#7]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6] +**** Load and Save as Query with component-level grouping **** +([#8].[#6]) is ok. smarts_in==smarts_out +([#8].[#6]).([#8].[#6]) is ok. smarts_in==smarts_out diff --git a/api/tests/integration/ref/rsmiles/rsmiles_smarts.py.out b/api/tests/integration/ref/rsmiles/rsmiles_smarts.py.out index f883375dca..5b18061691 100644 --- a/api/tests/integration/ref/rsmiles/rsmiles_smarts.py.out +++ b/api/tests/integration/ref/rsmiles/rsmiles_smarts.py.out @@ -1 +1,2 @@ SMARTS component-level grouping load ok +SMARTS component-level grouping save ok diff --git a/api/tests/integration/tests/formats/smarts.py b/api/tests/integration/tests/formats/smarts.py index d16c49c348..6d4c4b039a 100644 --- a/api/tests/integration/tests/formats/smarts.py +++ b/api/tests/integration/tests/formats/smarts.py @@ -16,6 +16,17 @@ def testSmarts(m): print(m.smiles()) +def test_smarts_load_save(smarts_in): + m = indigo.loadSmarts(smarts_in) + smarts_out = m.smarts() + if smarts_in == smarts_out: + print("%s is ok. smarts_in==smarts_out" % smarts_in) + else: + print("smarts_in!=smarts_out") + print(" smarts_in=%s", smarts_in) + print("smarts_out=%s", smarts_out) + + molstr = """ Ketcher 11241617102D 1 1.00000 0.00000 0 @@ -81,3 +92,7 @@ def testSmarts(m): print("**** Load and Save as Query with not list ****") m = indigo.loadQueryMolecule(notlist) print(m.smarts()) + +print("**** Load and Save as Query with component-level grouping ****") +test_smarts_load_save("([#8].[#6])") +test_smarts_load_save("([#8].[#6]).([#8].[#6])") diff --git a/api/tests/integration/tests/rsmiles/rsmiles_smarts.py b/api/tests/integration/tests/rsmiles/rsmiles_smarts.py index 8fd4dcaa75..f564e5c536 100644 --- a/api/tests/integration/tests/rsmiles/rsmiles_smarts.py +++ b/api/tests/integration/tests/rsmiles/rsmiles_smarts.py @@ -10,8 +10,18 @@ indigo = Indigo() - -rxn1 = indigo.loadReactionSmarts("([#8:1].[#6:2])>>([#8:1].[#6:2])") -assert rxn1.countReactants() == 1 -assert rxn1.countProducts() == 1 -print("SMARTS component-level grouping load ok") +smarts_in = "([#8:1].[#6:2])>>([#8:1].[#6:2])" +rxn1 = indigo.loadReactionSmarts(smarts_in) +if rxn1.countReactants() == 1 and rxn1.countProducts() == 1: + print("SMARTS component-level grouping load ok") +else: + print("SMARTS component-level grouping load failed") + print("rxn1.countReactants()=%s" % rxn1.countReactants()) + print("rxn1.countProducts()=%s" % rxn1.countProducts()) +smarts_out = rxn1.smarts() +if smarts_in == smarts_out: + print("SMARTS component-level grouping save ok") +else: + print("SMARTS component-level grouping save failed") + print("smart_in=%s" % smarts_in) + print("smart_ou=%s" % smarts_out) diff --git a/core/indigo-core/molecule/query_molecule.h b/core/indigo-core/molecule/query_molecule.h index dc6c51fd39..cd98fa75fd 100644 --- a/core/indigo-core/molecule/query_molecule.h +++ b/core/indigo-core/molecule/query_molecule.h @@ -372,6 +372,8 @@ namespace indigo // must belong to different connected components of the target molecule Array components; + void getComponentNeighbors(std::list>& componentNeighbors); + void invalidateAtom(int index, int mask) override; int getAtomMaxExteralConnectivity(int idx); diff --git a/core/indigo-core/molecule/src/query_molecule.cpp b/core/indigo-core/molecule/src/query_molecule.cpp index d32df22ec7..6a46a3175c 100644 --- a/core/indigo-core/molecule/src/query_molecule.cpp +++ b/core/indigo-core/molecule/src/query_molecule.cpp @@ -2140,3 +2140,23 @@ void QueryMolecule::getQueryAtomLabel(int qa, Array& result) if (it != query_atom_labels.end()) result.readString(it->second.c_str(), true); } + +void QueryMolecule::getComponentNeighbors(std::list>& componentNeighbors) +{ + std::unordered_map> componentAtoms; + for (int i = 0; i < components.size(); ++i) + { + int componentId = components[i]; + if (componentId > 0) + { // vertice[i] belongs to component #Id + componentAtoms[componentId].insert(i); + } + } + componentNeighbors.clear(); + for (auto elem : componentAtoms) + { + auto atoms = elem.second; + if (atoms.size() > 1) + componentNeighbors.emplace_back(atoms); + } +} \ No newline at end of file diff --git a/core/indigo-core/molecule/src/smiles_saver.cpp b/core/indigo-core/molecule/src/smiles_saver.cpp index 1bc2397e7f..c91ffc0af4 100644 --- a/core/indigo-core/molecule/src/smiles_saver.cpp +++ b/core/indigo-core/molecule/src/smiles_saver.cpp @@ -185,6 +185,47 @@ void SmilesSaver::_saveMolecule() walk.walk(); const Array& v_seq = walk.getSequence(); + Array v_to_comp_group; + v_to_comp_group.resize(v_seq.size()); + v_to_comp_group.fffill(); + + if (_qmol != nullptr && smarts_mode && _qmol->components.size() >= v_seq.size()) + { + if (v_seq.size() < 1) + return; // No atoms to save + std::set components; + int cur_component = -1; + for (int i = 0; i < v_seq.size(); ++i) + { + // In v_seq each fragment started with vertex which parent == -1 + // In SMARTS some fragments could be grouped (component-level grouping) + // In QueryMolecule group number stored in "".components" member. GroupId == 0 means no group defined. + // Each fragment - connected graph, so all vertexes should belong to one group. + // All group fragments should go one by one - in SMARTS its inside "()". + if (v_seq[i].parent_vertex < 0) // New Fragment + { + int new_component = _qmol->components[v_seq[i].idx]; + // if component defined for new fragment(id>0) and its different from previous and seen before + if (new_component > 0 && new_component != cur_component && components.count(new_component)) + { + // According to the DfsWalk code, the groups components should be neighbors. + // If will be found case when it wrong - add code to rearrange fragments + throw Error("SMARTS fragments need to reaarange."); + } + components.emplace(new_component); + cur_component = new_component; + } + else + { + if (cur_component != _qmol->components[v_seq[i].idx]) + { + // Fragment contains atoms from different components - something went wrong + throw Error("Fragment contains atoms from different components."); + } + } + v_to_comp_group[i] = cur_component; + } + } // fill up neighbor lists for the stereocenters calculation for (i = 0; i < v_seq.size(); i++) @@ -525,8 +566,25 @@ void SmilesSaver::_saveMolecule() else { if (!first_component) + { + // group == 0 means no group set. + int prev_group = v_to_comp_group[i - 1]; + int new_group = v_to_comp_group[i]; + bool different_groups = new_group != prev_group; + if (smarts_mode && prev_group && different_groups) // if component group ended + _output.writeChar(')'); + _output.writeChar('.'); - first_component = false; + + if (smarts_mode && new_group && different_groups) // if new group started + _output.writeChar('('); + } + else + { + if (smarts_mode && v_to_comp_group[i] > 0) // component level grouping set for this fragment + _output.writeChar('('); + first_component = false; + } _written_components++; } if (write_atom) @@ -599,8 +657,10 @@ void SmilesSaver::_saveMolecule() _output.writeString("{+n}"); } } + if (smarts_mode && v_to_comp_group[i - 1] > 0) // if group set for last fragment - add finish ) + _output.writeChar(')'); - if (write_extra_info && chemaxon) + if (write_extra_info && chemaxon && !smarts_mode) // no extended block in SMARTS { // Before we write the |...| block (ChemAxon's Extended SMILES), // we must clean up the mess we did with the attachment points diff --git a/core/indigo-core/reaction/src/rsmiles_loader.cpp b/core/indigo-core/reaction/src/rsmiles_loader.cpp index b2eed45a2f..40b695ccaa 100644 --- a/core/indigo-core/reaction/src/rsmiles_loader.cpp +++ b/core/indigo-core/reaction/src/rsmiles_loader.cpp @@ -75,25 +75,6 @@ void RSmilesLoader::loadQueryReaction(QueryReaction& rxn) _loadReaction(); } -static void _createComponenetsExternalNeighbors(QueryMolecule& qmol, std::list>& externNeib) -{ - std::unordered_map> componentAtoms; - for (int i = 0; i < qmol.components.size(); ++i) - { - int componentId = qmol.components[i]; - if (componentId > 0) - { // vertice[i] belongs to component #Id - componentAtoms[componentId].insert(i); - } - } - for (auto elem : componentAtoms) - { - auto atoms = elem.second; - if (atoms.size() > 1) - externNeib.emplace_back(atoms); - } -} - void RSmilesLoader::_loadReaction() { _brxn->clear(); @@ -147,7 +128,7 @@ void RSmilesLoader::_loadReaction() { rcnt = std::make_unique(); r_loader.loadQueryMolecule(static_cast(*rcnt)); - _createComponenetsExternalNeighbors(static_cast(*rcnt), rcnt_extNeibs); + static_cast(*rcnt).getComponentNeighbors(rcnt_extNeibs); } rcnt_aam.copy(rcnt->reaction_atom_mapping); @@ -181,7 +162,7 @@ void RSmilesLoader::_loadReaction() { ctlt = std::make_unique(); c_loader.loadQueryMolecule(static_cast(*ctlt)); - _createComponenetsExternalNeighbors(static_cast(*ctlt), ctlt_extNeibs); + static_cast(*ctlt).getComponentNeighbors(ctlt_extNeibs); } ctlt_aam.copy(ctlt->reaction_atom_mapping); @@ -221,7 +202,7 @@ void RSmilesLoader::_loadReaction() { prod = std::make_unique(); p_loader.loadQueryMolecule(static_cast(*prod)); - _createComponenetsExternalNeighbors(static_cast(*prod), prod_extNeibs); + static_cast(*prod).getComponentNeighbors(prod_extNeibs); } prod_aam.copy(prod->reaction_atom_mapping); diff --git a/core/indigo-core/reaction/src/rsmiles_saver.cpp b/core/indigo-core/reaction/src/rsmiles_saver.cpp index cafcc18f54..a103eafd79 100644 --- a/core/indigo-core/reaction/src/rsmiles_saver.cpp +++ b/core/indigo-core/reaction/src/rsmiles_saver.cpp @@ -146,7 +146,7 @@ void RSmilesSaver::_saveReaction() _writeMolecule(i); } - if (chemaxon) + if (chemaxon && !smarts_mode) { _comma = false; _writeFragmentsInfo(); diff --git a/core/indigo-core/tests/tests/formats.cpp b/core/indigo-core/tests/tests/formats.cpp index 9d4c960daa..b06cb89372 100644 --- a/core/indigo-core/tests/tests/formats.cpp +++ b/core/indigo-core/tests/tests/formats.cpp @@ -323,3 +323,21 @@ M END saver.saveMolecule(t_mol); ASSERT_EQ(t_mol.sgroups.getSGroupCount(), 0); } + +TEST_F(IndigoCoreFormatsTest, smarts_load_save) +{ + QueryMolecule q_mol; + + std::string smarts_in{"([#8].[#6]).([#6].[#8])"}; + BufferScanner scanner(smarts_in.c_str()); + SmilesLoader loader(scanner); + loader.smarts_mode = true; + loader.loadQueryMolecule(q_mol); + Array out; + ArrayOutput std_out(out); + SmilesSaver saver(std_out); + saver.smarts_mode = true; + saver.saveQueryMolecule(q_mol); + std::string smarts_out{out.ptr(), static_cast(out.size())}; + ASSERT_EQ(smarts_in, smarts_out); +} diff --git a/core/indigo-core/tests/tests/reaction.cpp b/core/indigo-core/tests/tests/reaction.cpp index 41b378f4bc..81c5320ab1 100644 --- a/core/indigo-core/tests/tests/reaction.cpp +++ b/core/indigo-core/tests/tests/reaction.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "common.h" @@ -65,7 +66,7 @@ TEST_F(IndigoCoreReactionTest, aliases_complex) QueryReaction reaction; loadQueryReaction("[#6:1]=[#6:2][#6:3].[#6:4]=[#6:5][#6:6]>>[#6:3][#6:2]=[#6:5][#6:6] |$;;R1;;;R2;R1;;;R2$|", reaction); reaction.clearAAM(); - ASSERT_STREQ("[#6]=[#6]-[#6].[#6]=[#6]-[#6]>>[#6]-[#6]=[#6]-[#6] |$;;R1;;;R2;R1;;;R2$|", saveReactionSmiles(reaction, true).c_str()); + ASSERT_STREQ("[#6]=[#6]-[#6].[#6]=[#6]-[#6]>>[#6]-[#6]=[#6]-[#6]", saveReactionSmiles(reaction, true).c_str()); ASSERT_STREQ("$RXN\n\n -INDIGO- 0100000000\n\n 2 1\n$MOL\n\n -INDIGO-01000000002D\n\n 3 2 0 0 0 0 0 0 0 0999 V2000\n 0.0000 0.0000 " "0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 0.0000 " " 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 2 0 0 0 0\n 2 3 1 0 0 0 0\nA 3\nR1\nM END\n$MOL\n\n " @@ -99,4 +100,12 @@ TEST_F(IndigoCoreReactionTest, smarts_reaction) loadQueryReaction(smarts_in.c_str(), qr); ASSERT_EQ(qr.reactantsCount(), 1); ASSERT_EQ(qr.productsCount(), 1); + Array out; + ArrayOutput std_out(out); + RSmilesSaver saver(std_out); + saver.smarts_mode = true; + saver.saveQueryReaction(qr); + out.push(0); + std::string smarts_out{out.ptr()}; + ASSERT_EQ(smarts_in, smarts_out); } \ No newline at end of file