diff --git a/docs/releasehistory.md b/docs/releasehistory.md index da2dd3963..e4c73d725 100644 --- a/docs/releasehistory.md +++ b/docs/releasehistory.md @@ -10,7 +10,7 @@ Releases follow the `major.minor.micro` scheme recommended by [PEP440](https://w ### New features - [PR #1567](https://github.com/openforcefield/openff-toolkit/pull/1567): Allows setting `Molecule.name` in `Molecule.from_smiles`, `from_inchi`, `from_polymer_pdb`, and `from_pdb_and_smiles`. -- [PR #1565](https://github.com/openforcefield/openff-toolkit/pull/1565): Adds `Topology.from_pdb` +- [PR #1565](https://github.com/openforcefield/openff-toolkit/pull/1565): Adds `Topology.from_pdb`. ### Behavior changes - [PR #1569](https://github.com/openforcefield/openff-toolkit/pull/1569): Several instances of `Exception` being raised are now replaced with other exceptions being raised. diff --git a/openff/toolkit/data/proteins/ace-ZZZ-gly-nme.pdb b/openff/toolkit/data/proteins/ace-ZZZ-gly-nme.pdb new file mode 100644 index 000000000..e5055b423 --- /dev/null +++ b/openff/toolkit/data/proteins/ace-ZZZ-gly-nme.pdb @@ -0,0 +1,78 @@ +REMARK 1 CREATED WITH OPENMM 8.0, 2023-04-25 +ATOM 1 C ACE A 1 -2.066 -2.592 0.175 1.00 0.00 C +ATOM 2 CH3 ACE A 1 -2.551 -3.997 0.367 1.00 0.00 C +ATOM 3 O ACE A 1 -2.742 -1.857 -0.621 1.00 0.00 O +ATOM 4 H1 ACE A 1 -3.605 -4.130 0.022 1.00 0.00 H +ATOM 5 H2 ACE A 1 -1.889 -4.674 -0.185 1.00 0.00 H +ATOM 6 H3 ACE A 1 -2.443 -4.229 1.460 1.00 0.00 H +HETATM 7 C3x ZZZ A 2 0.958 -0.804 0.130 1.00 0.00 C +HETATM 8 O2x ZZZ A 2 1.422 0.410 -0.180 1.00 0.00 O +HETATM 9 P1x ZZZ A 2 2.782 0.984 0.562 1.00 0.00 P +HETATM 10 O3x ZZZ A 2 3.223 2.281 -0.117 1.00 0.00 O +HETATM 11 O4x ZZZ A 2 4.036 -0.203 0.428 1.00 0.00 O +HETATM 12 C4x ZZZ A 2 5.229 0.481 0.703 1.00 0.00 C +HETATM 13 N1x ZZZ A 2 5.451 1.459 -0.299 1.00 0.00 N +HETATM 14 C5x ZZZ A 2 6.447 2.391 0.081 1.00 0.00 C +HETATM 15 O5x ZZZ A 2 7.549 2.337 -0.776 1.00 0.00 O +HETATM 16 O6x ZZZ A 2 2.467 1.269 2.209 1.00 0.00 O +HETATM 
17 C6x ZZZ A 2 -0.465 -0.731 0.692 1.00 0.00 C +HETATM 18 H4x ZZZ A 2 1.563 -1.211 0.974 1.00 0.00 H +HETATM 19 C7x ZZZ A 2 0.969 -1.745 -1.031 1.00 0.00 C +HETATM 20 N2x ZZZ A 2 -0.925 -2.088 0.827 1.00 0.00 N +HETATM 21 H5x ZZZ A 2 -0.519 -0.217 1.646 1.00 0.00 H +HETATM 22 C8x ZZZ A 2 -1.233 0.066 -0.254 1.00 0.00 C +HETATM 23 H6x ZZZ A 2 2.010 -1.912 -1.357 1.00 0.00 H +HETATM 24 H7x ZZZ A 2 0.425 -1.369 -1.907 1.00 0.00 H +HETATM 25 H8x ZZZ A 2 0.577 -2.754 -0.739 1.00 0.00 H +HETATM 26 H9x ZZZ A 2 -0.386 -2.741 1.455 1.00 0.00 H +HETATM 27 O7x ZZZ A 2 -0.724 0.367 -1.381 1.00 0.00 O +HETATM 28 H10x ZZZ A 2 5.158 0.829 1.726 1.00 0.00 H +HETATM 29 H11x ZZZ A 2 6.021 -0.306 0.563 1.00 0.00 H +HETATM 30 H12x ZZZ A 2 5.170 1.366 -1.230 1.00 0.00 H +HETATM 31 H13x ZZZ A 2 6.754 2.406 1.128 1.00 0.00 H +HETATM 32 H14x ZZZ A 2 5.983 3.478 -0.107 1.00 0.00 H +HETATM 33 H15x ZZZ A 2 8.354 2.068 -0.209 1.00 0.00 H +ATOM 34 N GLY A 3 -2.558 0.532 -0.002 1.00 0.00 N +ATOM 35 CA GLY A 3 -3.219 1.371 -0.989 1.00 0.00 C +ATOM 36 H GLY A 3 -3.047 0.271 0.916 1.00 0.00 H +ATOM 37 C GLY A 3 -4.631 1.568 -0.767 1.00 0.00 C +ATOM 38 HA1 GLY A 3 -2.631 2.367 -0.989 1.00 0.00 H +ATOM 39 HA2 GLY A 3 -3.015 0.934 -1.970 1.00 0.00 H +ATOM 40 O GLY A 3 -5.267 2.435 -1.509 1.00 0.00 O +ATOM 41 N NME A 4 -5.391 0.878 0.228 1.00 0.00 N +ATOM 42 C NME A 4 -6.816 1.155 0.367 1.00 0.00 C +ATOM 43 H NME A 4 -5.003 0.147 0.835 1.00 0.00 H +ATOM 44 HH31 NME A 4 -7.186 1.620 -0.590 1.00 0.00 H +ATOM 45 HH32 NME A 4 -6.902 1.861 1.199 1.00 0.00 H +ATOM 46 HH33 NME A 4 -7.332 0.226 0.536 1.00 0.00 H +TER 47 NME A 4 +CONECT 6 2 +CONECT 7 8 17 18 19 +CONECT 8 7 9 +CONECT 9 8 10 11 16 +CONECT 10 9 +CONECT 11 9 12 +CONECT 12 11 13 28 29 +CONECT 13 12 14 30 +CONECT 14 13 15 31 32 +CONECT 15 14 33 +CONECT 16 9 +CONECT 17 7 20 21 22 +CONECT 18 7 +CONECT 19 7 23 24 25 +CONECT 20 1 17 26 +CONECT 21 17 +CONECT 22 17 27 34 +CONECT 23 19 +CONECT 24 19 +CONECT 25 19 +CONECT 26 20 +CONECT 27 22 +CONECT 28 12 
+CONECT 29 12 +CONECT 30 13 +CONECT 31 14 +CONECT 32 14 +CONECT 33 15 +CONECT 34 22 +END diff --git a/openff/toolkit/data/proteins/ace-ZZZ-gly-nme.sdf b/openff/toolkit/data/proteins/ace-ZZZ-gly-nme.sdf new file mode 100644 index 000000000..70c8a1462 --- /dev/null +++ b/openff/toolkit/data/proteins/ace-ZZZ-gly-nme.sdf @@ -0,0 +1,98 @@ + + RDKit 3D + + 46 45 0 0 0 0 0 0 0 0999 V2000 + 0.3304 -3.1891 -0.0011 C 0 0 0 0 0 0 0 0 0 0 0 0 + 0.1741 -4.6801 -0.0964 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.2672 -2.5771 -0.5521 O 0 0 0 0 0 0 0 0 0 0 0 0 + -0.9097 -4.8221 -0.3619 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.3001 -5.0996 0.9179 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.8050 -5.1289 -0.8713 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.1336 -0.1285 0.2580 C 0 0 1 0 0 0 0 0 0 0 0 0 + 1.4706 -0.2969 0.4981 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2.3559 0.7366 -0.5599 P 0 0 2 0 0 5 0 0 0 0 0 0 + 1.4404 1.3216 -1.6193 O 0 0 0 0 0 0 0 0 0 0 0 0 + 3.5581 -0.0903 -1.3039 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8210 0.0033 -0.8250 C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4547 1.2719 -0.7892 N 0 0 0 0 0 0 0 0 0 0 0 0 + 6.8337 1.1928 -0.3195 C 0 0 0 0 0 0 0 0 0 0 0 0 + 7.4169 2.4351 -0.2557 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2.9095 1.9785 0.4905 O 0 0 0 0 0 1 0 0 0 0 0 0 + -0.7328 -1.0893 0.9908 C 0 0 2 0 0 0 0 0 0 0 0 0 + -0.1291 1.2859 0.9765 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.1316 0.0777 -0.7485 H 0 0 0 0 0 0 0 0 0 0 0 0 + -0.6635 -2.4723 0.7661 N 0 0 0 0 0 0 0 0 0 0 0 0 + -2.1195 -0.6056 1.2094 C 0 0 0 0 0 0 0 0 0 0 0 0 + -0.2330 -1.0071 2.0751 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.0941 1.0204 2.0208 H 0 0 0 0 0 0 0 0 0 0 0 0 + 0.4747 2.0365 0.5326 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.2269 1.3969 0.8838 H 0 0 0 0 0 0 0 0 0 0 0 0 + -1.4060 -3.1368 1.2072 H 0 0 0 0 0 0 0 0 0 0 0 0 + -2.4522 -0.1233 2.3086 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8640 -0.4320 0.2332 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4822 -0.7262 -1.3981 H 0 0 0 0 0 0 0 0 0 0 0 0 + 4.9861 1.9578 -0.1668 H 0 0 0 0 0 0 0 0 0 0 0 0 + 7.3873 0.6244 -1.1562 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.9138 0.5503 0.5556 H 0 0 0 0 0 0 0 0 0 0 0 0 + 
7.0636 2.9092 0.5212 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.0415 -0.7172 0.1714 N 0 0 0 0 0 0 0 0 0 0 0 0 + -4.4044 -0.3150 0.1018 C 0 0 0 0 0 0 0 0 0 0 0 0 + -2.6396 -1.2160 -0.7218 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.6428 1.1657 0.0527 C 0 0 0 0 0 0 0 0 0 0 0 0 + -4.9443 -0.6883 1.0864 H 0 0 0 0 0 0 0 0 0 0 0 0 + -4.9708 -0.8216 -0.6749 H 0 0 0 0 0 0 0 0 0 0 0 0 + -3.7263 1.9719 0.1167 O 0 0 0 0 0 0 0 0 0 0 0 0 + -5.9810 1.5745 -0.0860 N 0 0 0 0 0 0 0 0 0 0 0 0 + -6.2972 2.9715 -0.1969 C 0 0 0 0 0 0 0 0 0 0 0 0 + -6.7606 0.8731 -0.1164 H 0 0 0 0 0 0 0 0 0 0 0 0 + -5.5368 3.5799 0.3512 H 0 0 0 0 0 0 0 0 0 0 0 0 + -6.2938 3.2727 -1.2837 H 0 0 0 0 0 0 0 0 0 0 0 0 + -7.2938 3.1548 0.2649 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 + 1 3 2 0 + 1 20 1 0 + 2 4 1 0 + 2 5 1 0 + 2 6 1 0 + 7 8 1 0 + 7 17 1 0 + 7 18 1 0 + 7 19 1 6 + 8 9 1 0 + 9 10 2 0 + 9 11 1 0 + 9 16 1 1 + 11 12 1 0 + 12 13 1 0 + 12 28 1 0 + 12 29 1 0 + 13 14 1 0 + 13 30 1 0 + 14 15 1 0 + 14 31 1 0 + 14 32 1 0 + 15 33 1 0 + 17 20 1 0 + 17 21 1 0 + 17 22 1 1 + 18 23 1 0 + 18 24 1 0 + 18 25 1 0 + 20 26 1 0 + 21 27 2 0 + 21 34 1 0 + 34 35 1 0 + 34 36 1 0 + 35 37 1 0 + 35 38 1 0 + 35 39 1 0 + 37 40 2 0 + 37 41 1 0 + 41 42 1 0 + 41 43 1 0 + 42 44 1 0 + 42 45 1 0 + 42 46 1 0 +M CHG 1 16 -1 +M END +$$$$ diff --git a/openff/toolkit/tests/test_topology.py b/openff/toolkit/tests/test_topology.py index 0fbd6039e..48399c2bb 100644 --- a/openff/toolkit/tests/test_topology.py +++ b/openff/toolkit/tests/test_topology.py @@ -835,6 +835,36 @@ def test_from_pdb_overlapping_unique_mols(self): assert po4.is_isomorphic_with(top2.molecule(0)) assert phenylphosphate.is_isomorphic_with(top2.molecule(1)) + @requires_rdkit + def test_from_pdb_additional_substructures(self): + """Test that the _additional_substructures arg is wired up correctly""" + with pytest.raises(UnassignedChemistryInPDBError): + Topology.from_pdb(get_data_file_path("proteins/ace-ZZZ-gly-nme.pdb")) + + # Make unnatural AA + mol = 
Molecule.from_smiles("N[C@@H]([C@@H](C)O[P@](=O)(OCNCO)[O-])C(=O)") + # Get the indices of an N term and C term hydrogen for removal + leaving_atoms = mol.chemical_environment_matches("[H:1]N([H])CC(=O)[H:2]")[0] + + # Label the atoms with whether they're leaving + for atom in mol.atoms: + if atom.molecule_atom_index not in leaving_atoms: + atom.metadata["substructure_atom"] = True + else: + atom.metadata["substructure_atom"] = False + + top = Topology.from_pdb( + get_data_file_path("proteins/ace-ZZZ-gly-nme.pdb"), + _additional_substructures=[mol], + ) + + expected_mol = Molecule.from_file( + get_data_file_path("proteins/ace-ZZZ-gly-nme.sdf") + ) + assert top.molecule(0).is_isomorphic_with( + expected_mol, atom_stereochemistry_matching=False + ) + @requires_pkg("mdtraj") def test_from_mdtraj(self): """Test construction of an OpenFF Topology from an MDTraj Topology object""" diff --git a/openff/toolkit/topology/molecule.py b/openff/toolkit/topology/molecule.py index 8d8b24cf3..ce0c6972c 100644 --- a/openff/toolkit/topology/molecule.py +++ b/openff/toolkit/topology/molecule.py @@ -3558,6 +3558,7 @@ def chemical_environment_matches( self, smirks, unique=unique, + raise_exception_types=[], ) elif isinstance(toolkit_registry, ToolkitWrapper): matches = toolkit_registry.find_smarts_matches( # type: ignore[attr-defined] diff --git a/openff/toolkit/topology/topology.py b/openff/toolkit/topology/topology.py index b02b86c25..97adffa4e 100644 --- a/openff/toolkit/topology/topology.py +++ b/openff/toolkit/topology/topology.py @@ -12,6 +12,7 @@ """ import itertools +import re import warnings from collections import defaultdict from collections.abc import MutableMapping @@ -1535,6 +1536,7 @@ def from_pdb( file_path: Union[str, TextIO], unique_molecules: Optional[Iterable[Molecule]] = None, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY, + _additional_substructures: Optional[Iterable[Molecule]] = None, ): """ Loads supported or user-specified molecules from a PDB file. 
@@ -1614,6 +1616,9 @@ def from_pdb( PDB. See above for details. toolkit_registry : ToolkitRegistry. Default = None The ToolkitRegistry to use as the cheminformatics backend. + _additional_substructures : Iterable of Molecule, Default = None + Experimental and unstable. Each Molecule must set atom.metadata["substructure_atom"] to + True or False on every atom. Returns ------- @@ -1682,6 +1687,28 @@ def from_pdb( a.name for a in unique_molecule.atoms ] + substructure_dictionary["ADDITIONAL_SUBSTRUCTURE"] = {} + + if _additional_substructures: + for mol in _additional_substructures: + label_mol = Molecule(mol) + c = 0 + label_mol.properties["atom_map"] = {} + for atom in label_mol.atoms: + if atom.metadata["substructure_atom"]: + label_mol.properties["atom_map"][atom.molecule_atom_index] = c + c += 1 + smi = label_mol.to_smiles(mapped=True) + # remove unmapped atoms from mapped smiles. This will catch things like + # `[H]` and `[Cl]` but not anything with 3 characters like `[H:1]` + smi = re.sub(r"\[[A-Za-z]{1,2}\]", "", smi) + # Remove any orphaned () that remain + smi = smi.replace("()", "") + + substructure_dictionary["ADDITIONAL_SUBSTRUCTURE"][smi] = [] + + substructure_dictionary["ADDITIONAL_SUBSTRUCTURE_OVERLAP"] = {} + coords_angstrom = np.array( [[*vec3.value_in_unit(openmm_unit.angstrom)] for vec3 in pdb.getPositions()] ) diff --git a/openff/toolkit/utils/__init__.py b/openff/toolkit/utils/__init__.py index d49a2c3fc..3c3e80474 100644 --- a/openff/toolkit/utils/__init__.py +++ b/openff/toolkit/utils/__init__.py @@ -1,29 +1,10 @@ -from openff.toolkit.utils.utils import ( - all_subclasses, - convert_0_1_smirnoff_to_0_2, - convert_0_2_smirnoff_to_0_3, - convert_all_quantities_to_string, - convert_all_strings_to_quantity, - deserialize_numpy, - get_data_file_path, - get_molecule_parameterIDs, - inherit_docstrings, - object_to_quantity, - quantity_to_string, - requires_package, - serialize_numpy, - string_to_quantity, - string_to_unit, - temporary_cd, - unit_to_string, -) 
from openff.toolkit.utils.constants import ( - DEFAULT_AROMATICITY_MODEL, ALLOWED_AROMATICITY_MODELS, - DEFAULT_FRACTIONAL_BOND_ORDER_MODEL, + ALLOWED_CHARGE_MODELS, ALLOWED_FRACTIONAL_BOND_ORDER_MODELS, + DEFAULT_AROMATICITY_MODEL, DEFAULT_CHARGE_MODEL, - ALLOWED_CHARGE_MODELS, + DEFAULT_FRACTIONAL_BOND_ORDER_MODEL, ) from openff.toolkit.utils.toolkits import ( AMBERTOOLS_AVAILABLE, @@ -53,3 +34,22 @@ ToolkitWrapper, UndefinedStereochemistryError, ) +from openff.toolkit.utils.utils import ( + all_subclasses, + convert_0_1_smirnoff_to_0_2, + convert_0_2_smirnoff_to_0_3, + convert_all_quantities_to_string, + convert_all_strings_to_quantity, + deserialize_numpy, + get_data_file_path, + get_molecule_parameterIDs, + inherit_docstrings, + object_to_quantity, + quantity_to_string, + requires_package, + serialize_numpy, + string_to_quantity, + string_to_unit, + temporary_cd, + unit_to_string, +) diff --git a/openff/toolkit/utils/rdkit_wrapper.py b/openff/toolkit/utils/rdkit_wrapper.py index c4bae64e3..3f928f579 100644 --- a/openff/toolkit/utils/rdkit_wrapper.py +++ b/openff/toolkit/utils/rdkit_wrapper.py @@ -407,7 +407,13 @@ def _polymer_openmm_topology_to_rdmol( # Some special residues are allowed to overlap/override previous matches if any(m in already_assigned_nodes for m in match) and ( - res_name not in ["PEPTIDE_BOND", "DISULFIDE", "UNIQUE_MOLECULE"] + res_name + not in [ + "PEPTIDE_BOND", + "DISULFIDE", + "UNIQUE_MOLECULE", + "ADDITIONAL_SUBSTRUCTURE", + ] ): continue already_assigned_nodes.update(match)