From 12d17a281f4e240d72e3c10368a3c083ffb0c280 Mon Sep 17 00:00:00 2001 From: Jeffrey Wagner Date: Mon, 17 Apr 2023 14:59:05 -0700 Subject: [PATCH 1/4] first steps toward implementing a warning --- openff/toolkit/topology/topology.py | 2 ++ openff/toolkit/utils/rdkit_wrapper.py | 28 ++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/openff/toolkit/topology/topology.py b/openff/toolkit/topology/topology.py index 5cc304fa1..9e3dce29a 100644 --- a/openff/toolkit/topology/topology.py +++ b/openff/toolkit/topology/topology.py @@ -1531,6 +1531,7 @@ def from_pdb( file_path: Union[str, TextIO], unique_molecules: Optional[Iterable[Molecule]] = None, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY, + _ignore_stereo: bool = False ): """ Loads supported or user-specified molecules from a PDB file. @@ -1688,6 +1689,7 @@ def from_pdb( pdb, substructure_dictionary, coords_angstrom, + _ignore_stereo=_ignore_stereo ) for off_atom, atom in zip([*topology.atoms], pdb.topology.atoms()): diff --git a/openff/toolkit/utils/rdkit_wrapper.py b/openff/toolkit/utils/rdkit_wrapper.py index c41563f99..b7800d5db 100644 --- a/openff/toolkit/utils/rdkit_wrapper.py +++ b/openff/toolkit/utils/rdkit_wrapper.py @@ -266,7 +266,7 @@ def _polymer_openmm_topology_to_offmol( return offmol def _polymer_openmm_pdbfile_to_offtop( - self, topology_class, pdbfile, substructure_dictionary, coords_angstrom + self, topology_class, pdbfile, substructure_dictionary, coords_angstrom, _ignore_stereo: bool = False ): from openff.units.openmm import from_openmm from rdkit import Chem, Geometry @@ -276,6 +276,10 @@ def _polymer_openmm_pdbfile_to_offtop( omm_top, substructure_dictionary ) + # The above method will have set stereo on the atoms directly from the substructures and unique_molecules. + # In order to check that the same stereo is perceived from 3D as is assigned from the template, make + # a copy here, then assign stereo from 3D, and then raise an error if there's a disagreement. + mol_with_stereo_from_substr = Chem.Mol(rdkit_mol) rdmol_conformer = Chem.Conformer() for atom_idx in range(rdkit_mol.GetNumAtoms()): x, y, z = coords_angstrom[atom_idx, :] @@ -289,6 +293,28 @@ def _polymer_openmm_pdbfile_to_offtop( Chem.Kekulize(rdkit_mol, clearAromaticFlags=True) Chem.SetAromaticity(rdkit_mol, Chem.AromaticityModel.AROMATICITY_MDL) + # Check stereo assigned from 3D against the stereo assigned by substructures. + + # Note that this DOES NOT compare global stereo (R vs. S) because the global stereo + # of a substructure may be undefined (it could change depending on which substructures connect to it). + # Instead, since we know that the order of atoms and bonds are the same between + # mol_with_stereo_from_substr and rdkit_mol are the same, we compare "local stereo" (CW vs CCW). + msg = "" + for orig_atom, new_atom in zip(mol_with_stereo_from_substr.GetAtoms(), rdkit_mol.GetAtoms()): + if _ignore_stereo: + continue + orig_chi = orig_atom.GetChiralTag() + new_chi = new_atom.GetChiralTag() + # If either the original atom or the newly perceived atom isn't a chiral center, skip this check. + # This means that users can avoid this check by adding unique molecules with undefined stereo + if (orig_chi == Chem.CHI_UNSPECIFIED) or (new_chi == Chem.CHI_UNSPECIFIED): + continue + if orig_chi != new_chi: + msg += f"atom {new_atom.GetIdx()} \n" # TODO Improve this error message with things like atom element and residue and atom name + if msg != "": + msg = "Some atoms loaded from PDB have a 3D geometry that corresponds to a different " \ + "stereochemistry than the substructure template or unique molecule." + # Don't sanitize or we risk assigning non-MDL aromaticity rdmols = Chem.GetMolFrags(rdkit_mol, asMols=True, sanitizeFrags=False) top = topology_class() From 884bbdd7b6a47b356a1691086be57abf8c79b404 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 17 Apr 2023 22:00:09 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openff/toolkit/topology/topology.py | 4 ++-- openff/toolkit/utils/rdkit_wrapper.py | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/openff/toolkit/topology/topology.py b/openff/toolkit/topology/topology.py index 9e3dce29a..73643a2d1 100644 --- a/openff/toolkit/topology/topology.py +++ b/openff/toolkit/topology/topology.py @@ -1531,7 +1531,7 @@ def from_pdb( file_path: Union[str, TextIO], unique_molecules: Optional[Iterable[Molecule]] = None, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY, - _ignore_stereo: bool = False + _ignore_stereo: bool = False, ): """ Loads supported or user-specified molecules from a PDB file. @@ -1689,7 +1689,7 @@ def from_pdb( pdb, substructure_dictionary, coords_angstrom, - _ignore_stereo=_ignore_stereo + _ignore_stereo=_ignore_stereo, ) for off_atom, atom in zip([*topology.atoms], pdb.topology.atoms()): diff --git a/openff/toolkit/utils/rdkit_wrapper.py b/openff/toolkit/utils/rdkit_wrapper.py index b7800d5db..7116a8a60 100644 --- a/openff/toolkit/utils/rdkit_wrapper.py +++ b/openff/toolkit/utils/rdkit_wrapper.py @@ -266,7 +266,12 @@ def _polymer_openmm_topology_to_offmol( return offmol def _polymer_openmm_pdbfile_to_offtop( - self, topology_class, pdbfile, substructure_dictionary, coords_angstrom, _ignore_stereo: bool = False + self, + topology_class, + pdbfile, + substructure_dictionary, + coords_angstrom, + _ignore_stereo: bool = False, ): from openff.units.openmm import from_openmm from rdkit import Chem, Geometry @@ -300,7 +305,9 @@ def _polymer_openmm_pdbfile_to_offtop( # Instead, since we know that the order of atoms and bonds are the same between # mol_with_stereo_from_substr and rdkit_mol are the same, we compare "local stereo" (CW vs CCW). msg = "" - for orig_atom, new_atom in zip(mol_with_stereo_from_substr.GetAtoms(), rdkit_mol.GetAtoms()): + for orig_atom, new_atom in zip( + mol_with_stereo_from_substr.GetAtoms(), rdkit_mol.GetAtoms() + ): if _ignore_stereo: continue orig_chi = orig_atom.GetChiralTag() @@ -310,10 +317,12 @@ def _polymer_openmm_pdbfile_to_offtop( if (orig_chi == Chem.CHI_UNSPECIFIED) or (new_chi == Chem.CHI_UNSPECIFIED): continue if orig_chi != new_chi: - msg += f"atom {new_atom.GetIdx()} \n" # TODO Improve this error message with things like atom element and residue and atom name + msg += f"atom {new_atom.GetIdx()} \n" # TODO Improve this error message with things like atom element and residue and atom name if msg != "": - msg = "Some atoms loaded from PDB have a 3D geometry that corresponds to a different " \ - "stereochemistry than the substructure template or unique molecule." + msg = ( + "Some atoms loaded from PDB have a 3D geometry that corresponds to a different " + "stereochemistry than the substructure template or unique molecule." + ) # Don't sanitize or we risk assigning non-MDL aromaticity rdmols = Chem.GetMolFrags(rdkit_mol, asMols=True, sanitizeFrags=False) From e595c212d73a51bce49fe8ca7af784af804019b2 Mon Sep 17 00:00:00 2001 From: Jeffrey Wagner Date: Thu, 20 Apr 2023 21:59:28 -0700 Subject: [PATCH 3/4] update with progress, add big TODO with findings --- openff/toolkit/topology/topology.py | 1 + openff/toolkit/utils/rdkit_wrapper.py | 32 +++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/openff/toolkit/topology/topology.py b/openff/toolkit/topology/topology.py index 73643a2d1..204c6c832 100644 --- a/openff/toolkit/topology/topology.py +++ b/openff/toolkit/topology/topology.py @@ -1692,6 +1692,7 @@ def from_pdb( _ignore_stereo=_ignore_stereo, ) + # TODO: This is redundant with the RDKit backend - see RDKTKW._get_connectivity_from_openmm_top for off_atom, atom in zip([*topology.atoms], pdb.topology.atoms()): off_atom.metadata["residue_name"] = atom.residue.name off_atom.metadata["residue_number"] = atom.residue.id diff --git a/openff/toolkit/utils/rdkit_wrapper.py b/openff/toolkit/utils/rdkit_wrapper.py index 7116a8a60..c59307433 100644 --- a/openff/toolkit/utils/rdkit_wrapper.py +++ b/openff/toolkit/utils/rdkit_wrapper.py @@ -11,6 +11,7 @@ import logging import pathlib import tempfile +import warnings from collections import defaultdict from typing import TYPE_CHECKING, Dict, List, Optional, Tuple @@ -285,6 +286,7 @@ def _polymer_openmm_pdbfile_to_offtop( # In order to check that the same stereo is perceived from 3D as is assigned from the template, make # a copy here, then assign stereo from 3D, and then raise an error if there's a disagreement. mol_with_stereo_from_substr = Chem.Mol(rdkit_mol) + Chem.AssignStereochemistry(mol_with_stereo_from_substr) rdmol_conformer = Chem.Conformer() for atom_idx in range(rdkit_mol.GetNumAtoms()): x, y, z = coords_angstrom[atom_idx, :] @@ -294,9 +296,9 @@ def _polymer_openmm_pdbfile_to_offtop( rdkit_mol, Chem.SANITIZE_ALL ^ Chem.SANITIZE_ADJUSTHS, ) - Chem.AssignStereochemistryFrom3D(rdkit_mol) Chem.Kekulize(rdkit_mol, clearAromaticFlags=True) Chem.SetAromaticity(rdkit_mol, Chem.AromaticityModel.AROMATICITY_MDL) + Chem.AssignStereochemistryFrom3D(rdkit_mol) # Check stereo assigned from 3D against the stereo assigned by substructures. @@ -311,18 +313,31 @@ def _polymer_openmm_pdbfile_to_offtop( if _ignore_stereo: continue orig_chi = orig_atom.GetChiralTag() + orig_abs_chi = orig_atom.GetPropsAsDict().get("_CIPCode", "") new_chi = new_atom.GetChiralTag() + new_abs_chi = new_atom.GetPropsAsDict().get("_CIPCode", "") + # If either the original atom or the newly perceived atom isn't a chiral center, skip this check. # This means that users can avoid this check by adding unique molecules with undefined stereo if (orig_chi == Chem.CHI_UNSPECIFIED) or (new_chi == Chem.CHI_UNSPECIFIED): continue if orig_chi != new_chi: - msg += f"atom {new_atom.GetIdx()} \n" # TODO Improve this error message with things like atom element and residue and atom name + print(f"{orig_chi=} {new_chi=} {orig_abs_chi=} {new_abs_chi=}") + residue = orig_atom.GetPDBResidueInfo() + atom_name = orig_atom.GetPropsAsDict().get("_Name", "") + atom_symbol = orig_atom.GetSymbol() + atom_index = orig_atom.GetIdx() + residue_name = residue.GetResidueName() + residue_number = residue.GetResidueNumber() + chain_id = residue.GetChainId() + + msg += f"{atom_index=} {atom_symbol=} {atom_name=} {residue_name=} {residue_number=} {chain_id=} \n" if msg != "": msg = ( "Some atoms loaded from PDB have a 3D geometry that corresponds to a different " - "stereochemistry than the substructure template or unique molecule." + "stereochemistry than the substructure template or unique molecule: \n" + msg ) + warnings.warn(msg) # Don't sanitize or we risk assigning non-MDL aromaticity rdmols = Chem.GetMolFrags(rdkit_mol, asMols=True, sanitizeFrags=False) @@ -436,8 +451,16 @@ def _polymer_openmm_topology_to_rdmol( for atom_i, j in zip(ref.GetAtoms(), match): atom_j = mol.GetAtomWithIdx(j) # copy over chirality + # TODO: This is most likely wrong. This will copy over CW/CCW tags but there's no + # checking that the substituent order in the molecule and substructure are the same, + # so this is basically random assignment. Thankfully Molecule.from_polymer_pdb and + # Topology.from_pdb both overwrite this assignment based on 3D coordinates. The only + # way I can currently think of doing this right would be to arbitrarily assign CW, + # then try to run a strict substructure match on it, and if that fails then assign + # CCW. This would be monstrously inefficient and all it would let us do is compare + # the substructure stereo to 3D stereo, which is of dubious value anyway. if atom_i.GetChiralTag(): - mol.GetAtomWithIdx(j).SetChiralTag(atom_i.GetChiralTag()) + atom_j.SetChiralTag(atom_i.GetChiralTag()) atom_j.SetFormalCharge(atom_i.GetFormalCharge()) for b in ref.GetBonds(): @@ -498,6 +521,7 @@ def _get_connectivity_from_openmm_top(self, omm_top): res.SetChainId(atom.residue.chain.id) rwatom = rwmol.GetAtomWithIdx(idx) rwatom.SetPDBResidueInfo(res) + rwatom.SetProp("_Name", atom.name) # we're fully explicit for atom in rwmol.GetAtoms(): atom.SetNoImplicit(True) From e42b1620163775626d4ed3b87811e3043f91e822 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 04:59:57 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openff/toolkit/utils/rdkit_wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openff/toolkit/utils/rdkit_wrapper.py b/openff/toolkit/utils/rdkit_wrapper.py index c59307433..35ce9c7b1 100644 --- a/openff/toolkit/utils/rdkit_wrapper.py +++ b/openff/toolkit/utils/rdkit_wrapper.py @@ -335,7 +335,8 @@ def _polymer_openmm_pdbfile_to_offtop( if msg != "": msg = ( "Some atoms loaded from PDB have a 3D geometry that corresponds to a different " - "stereochemistry than the substructure template or unique molecule: \n" + msg + "stereochemistry than the substructure template or unique molecule: \n" + + msg ) warnings.warn(msg)