Overhaul protomer enumeration (#1779)

* Enumerate input molecule with protomers * Drop annotation * Isolate state enumeration tests * Test enumeration of crazy acids * Update docstrings * Simplify protomer enumeration * Strip tests * Update protomer tests * Update/tag tests * Add test with bad input state * Update docstrings, release history * Band-aid inconsistent annotations * Typo fix Co-authored-by: Josh A. Mitchell <[email protected]> * Add back tests that were only moved * Use simpler SMILES patterns * Apply suggestions from code review * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Josh A. Mitchell <[email protected]> Co-authored-by: Jeff Wagner <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jeffrey Wagner <[email protected]>
openforcefield · Feb 5, 2024 · b08243e · b08243e
1 parent 344c83f
commit b08243e
Show file tree

Hide file tree

Showing 6 changed files with 383 additions and 322 deletions.
diff --git a/docs/releasehistory.md b/docs/releasehistory.md
@@ -43,6 +43,10 @@ This release adds compatibility with QCFractal >=0.50.0, but removes compatibilt
 
 ### Behavior changes
 
+- [PR #1779](https://github.com/openforcefield/openff-toolkit/pull/1779): Overhauls `Molecule.enumerate_protomers`:
+  * By default, `max_states=0` and there is no limit to the number of returned protomers
+  * The input molecule is returned if it is enumerated with default Quacpac formal charge enumeration settings
+
 ### Bugfixes
 
 - [PR #1778](https://github.com/openforcefield/openff-toolkit/pull/1778): Ensures SD data tags are preserved in `Molecule.from_openeye` if the input is of type `oechem.OEGraphMol`.

diff --git a/openff/toolkit/_tests/molecule/__init__.py b/openff/toolkit/_tests/molecule/__init__.py
@@ -0,0 +1 @@
+"""Tests of the Molecule API."""
diff --git a/openff/toolkit/_tests/molecule/test_state_enumeration.py b/openff/toolkit/_tests/molecule/test_state_enumeration.py
@@ -0,0 +1,340 @@
+import pytest
+
+from openff.toolkit import Molecule
+from openff.toolkit._tests.utils import requires_openeye
+from openff.toolkit.utils.toolkits import OpenEyeToolkitWrapper, RDKitToolkitWrapper
+
+
+class TestTautomerEnumeration:
+    tautomer_data = [
+        {"molecule": "Oc1c(cccc3)c3nc2ccncc12", "tautomers": 2},
+        {"molecule": "CN=c1nc[nH]cc1", "tautomers": 2},
+        {"molecule": "c1[nH]c2c(=O)[nH]c(nc2n1)N", "tautomers": 14},
+    ]
+
+    @pytest.mark.parametrize(
+        "toolkit_class", [OpenEyeToolkitWrapper, RDKitToolkitWrapper]
+    )
+    @pytest.mark.parametrize("molecule_data", tautomer_data)
+    def test_enumerating_tautomers(self, molecule_data, toolkit_class):
+        """Test the ability of each toolkit to produce tautomers of an input molecule."""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            mol = Molecule.from_smiles(
+                molecule_data["molecule"],
+                allow_undefined_stereo=True,
+                toolkit_registry=toolkit,
+            )
+
+            tautomers = mol.enumerate_tautomers(toolkit_registry=toolkit)
+
+            assert len(tautomers) == molecule_data["tautomers"]
+            assert mol not in tautomers
+            # check that the molecules are not isomorphic of the input
+            for taut in tautomers:
+                assert taut.n_conformers == 0
+                assert mol.is_isomorphic_with(taut) is False
+
+        else:
+            pytest.skip("Required toolkit is unavailable")
+
+    @pytest.mark.parametrize(
+        "toolkit_class", [OpenEyeToolkitWrapper, RDKitToolkitWrapper]
+    )
+    def test_enumerating_tautomers_options(self, toolkit_class):
+        """Test the enumeration options"""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            # test the max molecules option
+            mol = Molecule.from_smiles(
+                "c1[nH]c2c(=O)[nH]c(nc2n1)N",
+                toolkit_registry=toolkit,
+                allow_undefined_stereo=True,
+            )
+
+            tauts_no = 5
+            tautomers = mol.enumerate_tautomers(
+                max_states=tauts_no, toolkit_registry=toolkit
+            )
+            assert len(tautomers) <= tauts_no
+            assert mol not in tautomers
+
+    @pytest.mark.parametrize(
+        "toolkit_class", [RDKitToolkitWrapper, OpenEyeToolkitWrapper]
+    )
+    def test_enumerating_no_tautomers(self, toolkit_class):
+        """Test that the toolkits return an empty list if there are no tautomers to enumerate."""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            mol = Molecule.from_smiles("CC", toolkit_registry=toolkit)
+
+            tautomers = mol.enumerate_tautomers(toolkit_registry=toolkit)
+            assert tautomers == []
+
+        else:
+            pytest.skip("Required toolkit is unavailable")
+
+
+class TestProtomerEnumeration:
+    @requires_openeye
+    def test_enumerating_no_protomers(self):
+        """Make sure the input molecule is returned when there is only one protomer."""
+
+        mol = Molecule.from_smiles("CC")
+
+        assert len(mol.enumerate_protomers()) == 1
+        assert mol.enumerate_protomers()[0] == mol
+
+    @requires_openeye
+    def test_enumerating_protomers(self):
+        """Test enumerating the formal charges."""
+
+        # there should be three protomers, in addition to the input state
+        mol = Molecule.from_smiles("Oc2ccc(c1ccncc1)cc2")
+
+        protomers = mol.enumerate_protomers()
+
+        assert mol in protomers
+        assert len(protomers) == 4
+
+        # make sure generating extra states produces the same result
+        assert len(protomers) == len(mol.enumerate_protomers(max_states=10))
+
+        # make sure each protomer is unique
+        assert len(protomers) == len(set(protomers))
+
+    @requires_openeye
+    def test_tetracarboxylic_acid(self):
+        acid = Molecule.from_smiles("C(C(=O)O)(C(=O)O)=C(C(=O)O)(C(=O)O)")
+
+        protomers = acid.enumerate_protomers()
+
+        assert len(protomers) == 16
+        assert acid in protomers
+
+    @requires_openeye
+    def test_bad_state_not_necessarily_in_output(self):
+        # Quacpac considers NH4+ and NH3 to be valid states with default charge enumeration settings,
+        # but (reasonably enough) does not consider NH2- to be a valid state. Using any of these states
+        # as an input, the result should be 2 protomers (cation and neutral) unless `max_states=1`
+        cation = Molecule.from_smiles("[NH4+]")
+        neutral = Molecule.from_smiles("[NH3]")
+        anion = Molecule.from_smiles("[NH2-]")
+
+        for molecule in [cation, neutral, anion]:
+            assert len(molecule.enumerate_protomers(max_states=3)) == 2
+            assert len(molecule.enumerate_protomers(max_states=1)) == 1
+
+            protomers = molecule.enumerate_protomers()
+
+            assert len(protomers) == 2
+
+            assert cation in molecule.enumerate_protomers()
+            assert neutral in molecule.enumerate_protomers()
+            assert anion not in molecule.enumerate_protomers()
+
+    @pytest.mark.slow
+    @requires_openeye
+    def test_many_protomers(self):
+        acid = Molecule.from_smiles(5 * "C(C(=O)O)(C(=O)O)")
+
+        protomers = acid.enumerate_protomers()
+
+        assert acid in protomers
+        assert len(protomers) == 1600
+
+
+class TestStereoisomerEnumeration:
+    @pytest.mark.parametrize(
+        "toolkit_class", [OpenEyeToolkitWrapper, RDKitToolkitWrapper]
+    )
+    def test_enumerating_stereobonds(self, toolkit_class):
+        """Test the backend toolkits in enumerating the stereo bonds in a molecule."""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            mol = Molecule.from_smiles(
+                "ClC=CCl", allow_undefined_stereo=True, toolkit_registry=toolkit
+            )
+
+            # use the default options
+            isomers = mol.enumerate_stereoisomers()
+            assert len(isomers) == 2
+
+            assert mol not in isomers
+            # make sure the input molecule is only different by bond stereo
+            for ismol in isomers:
+                assert (
+                    Molecule.are_isomorphic(
+                        mol,
+                        ismol,
+                        return_atom_map=False,
+                        bond_stereochemistry_matching=False,
+                    )[0]
+                    is True
+                )
+                assert mol.is_isomorphic_with(ismol) is False
+
+            # make sure the isomers are different
+            assert isomers[0].is_isomorphic_with(isomers[1]) is False
+
+        else:
+            pytest.skip("Required toolkit is unavailable")
+
+    @pytest.mark.parametrize(
+        "toolkit_class", [OpenEyeToolkitWrapper, RDKitToolkitWrapper]
+    )
+    def test_enumerating_stereocenters(self, toolkit_class):
+        """Test the backend toolkits in enumerating the stereo centers in a molecule."""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            mol = Molecule.from_smiles(
+                "NC(Cl)(F)O", toolkit_registry=toolkit, allow_undefined_stereo=True
+            )
+
+            isomers = mol.enumerate_stereoisomers(toolkit_registry=toolkit)
+
+            assert len(isomers) == 2
+            # make sure the mol is not in the isomers and that they only differ by stereo chem
+            assert mol not in isomers
+            for ismol in isomers:
+                assert ismol.n_conformers != 0
+                assert (
+                    Molecule.are_isomorphic(
+                        mol,
+                        ismol,
+                        return_atom_map=False,
+                        atom_stereochemistry_matching=False,
+                    )[0]
+                    is True
+                )
+                assert mol.is_isomorphic_with(ismol) is False
+
+            # make sure the two isomers are different
+            assert isomers[0].is_isomorphic_with(isomers[1]) is False
+
+        else:
+            pytest.skip("Required toolkit is unavailable")
+
+    @pytest.mark.parametrize(
+        "toolkit_class", [OpenEyeToolkitWrapper, RDKitToolkitWrapper]
+    )
+    def test_enumerating_stereo_options(self, toolkit_class):
+        """Test the enumerating stereo chem options"""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+
+            # test undefined only
+            mol = Molecule.from_smiles(
+                "ClC=CCl", toolkit_registry=toolkit, allow_undefined_stereo=True
+            )
+            isomers = mol.enumerate_stereoisomers(
+                undefined_only=True, rationalise=False
+            )
+
+            assert len(isomers) == 2
+            for isomer in isomers:
+                assert isomer.n_conformers == 0
+
+            mol = Molecule.from_smiles(
+                r"Cl/C=C\Cl", toolkit_registry=toolkit, allow_undefined_stereo=True
+            )
+            isomers = mol.enumerate_stereoisomers(
+                undefined_only=True, rationalise=False
+            )
+
+            assert isomers == []
+
+            mol = Molecule.from_smiles(
+                r"Cl/C=C\Cl", toolkit_registry=toolkit, allow_undefined_stereo=True
+            )
+            isomers = mol.enumerate_stereoisomers(
+                undefined_only=False, rationalise=False
+            )
+
+            assert len(isomers) == 1
+
+            # test max isomers
+            mol = Molecule.from_smiles(
+                "BrC=C[C@H]1OC(C2)(F)C2(Cl)C1",
+                toolkit_registry=toolkit,
+                allow_undefined_stereo=True,
+            )
+            isomers = mol.enumerate_stereoisomers(
+                max_isomers=5,
+                undefined_only=True,
+                toolkit_registry=toolkit,
+                rationalise=True,
+            )
+
+            assert len(isomers) <= 5
+            for isomer in isomers:
+                assert isomer.n_conformers == 1
+
+        else:
+            pytest.skip("Required toolkit is unavailable")
+
+    @pytest.mark.parametrize(
+        "toolkit_class", [OpenEyeToolkitWrapper, RDKitToolkitWrapper]
+    )
+    @pytest.mark.parametrize(
+        "smiles, undefined_only, expected",
+        [
+            (
+                "FC(Br)(Cl)[C@@H](Br)(Cl)",
+                False,
+                [
+                    "F[C@](Br)(Cl)[C@@H](Br)(Cl)",
+                    "F[C@](Br)(Cl)[C@H](Br)(Cl)",
+                    "F[C@@](Br)(Cl)[C@@H](Br)(Cl)",
+                    "F[C@@](Br)(Cl)[C@H](Br)(Cl)",
+                ],
+            ),
+            (
+                "FC(Br)(Cl)[C@@H](Br)(Cl)",
+                True,
+                ["F[C@](Br)(Cl)[C@@H](Br)(Cl)", "F[C@@](Br)(Cl)[C@@H](Br)(Cl)"],
+            ),
+            ("F[C@H](Cl)Br", False, ["F[C@@H](Cl)Br"]),
+            ("F[C@H](Cl)Br", True, []),
+        ],
+    )
+    def test_enumerating_stereo_partially_defined(
+        self, toolkit_class, smiles, undefined_only, expected
+    ):
+        """Test the enumerating stereo of molecules with partially defined chirality"""
+
+        if not toolkit_class.is_available():
+            pytest.skip("Required toolkit is unavailable")
+
+        toolkit = toolkit_class()
+
+        # test undefined only
+        mol = Molecule.from_smiles(
+            smiles, toolkit_registry=toolkit, allow_undefined_stereo=True
+        )
+        stereoisomers = mol.enumerate_stereoisomers(
+            undefined_only=undefined_only, rationalise=False
+        )
+
+        # Ensure that the results of the enumeration are what the test expects.
+        # This roundtrips the expected output from SMILES --> OFFMol --> SMILES,
+        # since the SMILES for stereoisomers generated in this test may change depending
+        # on which cheminformatics toolkit is used.
+        expected = {
+            Molecule.from_smiles(stereoisomer, allow_undefined_stereo=True).to_smiles(
+                explicit_hydrogens=True, isomeric=True, mapped=False
+            )
+            for stereoisomer in expected
+        }
+        actual = {
+            stereoisomer.to_smiles(explicit_hydrogens=True, isomeric=True, mapped=False)
+            for stereoisomer in stereoisomers
+        }
+
+        assert expected == actual