Merge pull request #1024 from openforcefield/more-gromacs-residue-id

Test more cases of handling GROMACS residue IDs
openforcefield · Aug 30, 2024 · 57dd53a · 57dd53a
2 parents 9947373 + 197240d
commit 57dd53a
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 58 deletions.
diff --git a/devtools/conda-envs/dev_env.yaml b/devtools/conda-envs/dev_env.yaml
@@ -4,7 +4,7 @@ channels:
   - openeye
 dependencies:
   # Core
-  - python =3.10
+  - python
   - pip
   - numpy
   - pydantic =2
@@ -34,6 +34,8 @@ dependencies:
   # Drivers
   - gromacs
   - lammps >=2023.08.02
+  # https://github.com/conda-forge/lammps-feedstock/issues/207
+  - openmpi =4
   - panedr
   # Typing
   - mypy

diff --git a/openff/interchange/_tests/interoperability_tests/internal/test_gromacs.py b/openff/interchange/_tests/interoperability_tests/internal/test_gromacs.py
@@ -98,57 +98,6 @@ def test_vaccum_warning(self, sage):
         with pytest.warns(UserWarning, match="gitlab"):
             out.to_gro("tmp.gro")
 
-    @pytest.mark.slow
-    @skip_if_missing("openmm")
-    def test_residue_info(self, sage):
-        """Test that residue information is passed through to .gro files."""
-        import mdtraj
-
-        protein = Molecule.from_polymer_pdb(
-            get_data_file_path(
-                "proteins/MainChain_HIE.pdb",
-                "openff.toolkit",
-            ),
-        )
-
-        ff14sb = ForceField("ff14sb_off_impropers_0.0.3.offxml")
-
-        out = Interchange.from_smirnoff(
-            force_field=ff14sb,
-            topology=[protein],
-        )
-
-        out.to_gro("tmp.gro")
-
-        mdtraj_topology = mdtraj.load("tmp.gro").topology
-
-        for found_residue, original_residue in zip(
-            mdtraj_topology.residues,
-            out.topology.hierarchy_iterator("residues"),
-        ):
-            assert found_residue.name == original_residue.residue_name
-            assert str(found_residue.resSeq) == original_residue.residue_number
-
-    @pytest.mark.slow
-    def test_atom_names_pdb(self):
-        peptide = Molecule.from_polymer_pdb(
-            get_data_file_path("proteins/MainChain_ALA_ALA.pdb", "openff.toolkit"),
-        )
-        ff14sb = ForceField("ff14sb_off_impropers_0.0.3.offxml")
-
-        Interchange.from_smirnoff(ff14sb, peptide.to_topology()).to_gro(
-            "atom_names.gro",
-        )
-
-        pdb_object = openmm.app.PDBFile(
-            get_data_file_path("proteins/MainChain_ALA_ALA.pdb", "openff.toolkit"),
-        )
-        pdb_atom_names = [atom.name for atom in pdb_object.topology.atoms()]
-
-        openmm_atom_names = openmm.app.GromacsGroFile("atom_names.gro").atomNames
-
-        assert openmm_atom_names == pdb_atom_names
-
 
 @needs_gmx
 class TestGROMACS:

diff --git a/openff/interchange/_tests/unit_tests/interop/gromacs/export/test_export.py b/openff/interchange/_tests/unit_tests/interop/gromacs/export/test_export.py
@@ -1,3 +1,4 @@
+import random
 from importlib import resources
 from math import exp
 
@@ -41,6 +42,14 @@ class _NeedsGROMACS:
     pass
 
 
+def parse_residue_ids(file) -> list[str]:
+    """Parse residue IDs, and only the residue IDs, from a GROMACS .gro file."""
+    with open(file) as f:
+        ids = [line[:5].strip() for line in f.readlines()[2:-1]]
+
+    return ids
+
+
 class TestToGro(_NeedsGROMACS):
     def test_residue_names(self, sage):
         """Reproduce issue #642."""
@@ -143,8 +152,9 @@ def test_vaccum_warning(self, sage):
             out.to_gro("tmp.gro")
 
     @pytest.mark.slow
+    @pytest.mark.parametrize("offset_residue_ids", [True, False])
     @skip_if_missing("openmm")
-    def test_residue_info(self, sage):
+    def test_residue_info(self, offset_residue_ids):
         """Test that residue information is passed through to .gro files."""
         import mdtraj
 
@@ -155,6 +165,19 @@ def test_residue_info(self, sage):
             ),
         )
 
+        if offset_residue_ids:
+            offset = random.randint(10, 20)
+
+            for atom in protein.atoms:
+                atom.metadata["residue_number"] = str(
+                    int(atom.metadata["residue_number"]) + offset,
+                )
+
+            # Need to manually update residues _and_ atoms
+            # https://github.com/openforcefield/openff-toolkit/issues/1921
+            for residue in protein.residues:
+                residue.residue_number = str(int(residue.residue_number) + offset)
+
         ff14sb = ForceField("ff14sb_off_impropers_0.0.3.offxml")
 
         out = Interchange.from_smirnoff(
@@ -171,7 +194,54 @@ def test_residue_info(self, sage):
             out.topology.hierarchy_iterator("residues"),
         ):
             assert found_residue.name == original_residue.residue_name
-            assert str(found_residue.resSeq) == original_residue.residue_number
+            found_index = [*original_residue.atoms][0].metadata["residue_number"]
+            assert str(found_residue.resSeq) == found_index
+
+    def test_fill_in_residue_ids(self, sage):
+        """Ensure that, if inputs have no residue_number, they are generated on-the-fly."""
+        sage.create_interchange(
+            Topology.from_molecules(
+                [MoleculeWithConformer.from_smiles(smi) for smi in ["CC", "O", "C"]],
+            ),
+        ).to_gro("fill.gro")
+
+        expected_residue_ids = 8 * ["1"] + 3 * ["2"] + 5 * ["3"]
+
+        found_residue_ids = parse_residue_ids("fill.gro")
+
+        for expected, found in zip(expected_residue_ids, found_residue_ids):
+            assert expected == found
+
+    def test_atom_index_gt_100_000(self, water, sage):
+        """Ensure that atom indices are written correctly for large numbers."""
+        water.add_hierarchy_scheme(
+            ("residue_name", "residue_number"),
+            "residues",
+        )
+
+        topology = Topology.from_molecules(2 * [water])
+
+        topology.box_vectors = unit.Quantity([4, 4, 4], unit.nanometer)
+
+        # Can't just update atoms' metadata, neeed to create these scheme/element objects
+        # and need to also modify the residue objects themselves
+        for molecule_index, molecule in enumerate(topology.molecules):
+            for atom in molecule.atoms:
+                atom.metadata["residue_number"] = str(molecule_index + 123_456)
+
+        for residue_index, residue in enumerate(topology.residues):
+            residue.residue_number = str(residue_index + 123_456)
+
+        interchange = sage.create_interchange(topology)
+
+        interchange.to_gro("large.gro")
+
+        expected_residue_ids = 3 * ["23456"] + 3 * ["23457"]
+
+        found_residue_ids = parse_residue_ids("large.gro")
+
+        for expected, found in zip(expected_residue_ids, found_residue_ids):
+            assert expected == found
 
     @pytest.mark.slow
     def test_atom_names_pdb(self):

diff --git a/openff/interchange/components/interchange.py b/openff/interchange/components/interchange.py
@@ -446,6 +446,8 @@ def to_gromacs(
         Molecule names in written files are not guaranteed to match the `Moleclue.name` attribute of the
         molecules in the topology, especially if they are empty strings or not unique.
 
+        See `to_gro` and `to_top` for more details.
+
         """
         from openff.interchange.interop.gromacs.export._export import GROMACSWriter
         from openff.interchange.smirnoff._gromacs import _convert
@@ -505,6 +507,16 @@ def to_gro(self, file_path: Path | str, decimal: int = 3):
         decimal: int, default=3
             The number of decimal places to use when writing the GROMACS coordinate file.
 
+        Residue IDs must be positive integers (or string representations thereof).
+
+        Residue IDs greater than 99,999 are reduced modulo 100,000 in line with common GROMACS practice.
+
+        Residue names and IDs from the topology are used, if present, and otherwise are generated internally.
+
+        Behavior when some, but not all, residue names and IDs are present in the topology is undefined.
+
+        If residue IDs are generated internally, they are assigned sequentially to each molecule starting at 1.
+
         """
         from openff.interchange.interop.gromacs.export._export import GROMACSWriter
         from openff.interchange.smirnoff._gromacs import _convert

diff --git a/openff/interchange/interop/gromacs/export/_export.py b/openff/interchange/interop/gromacs/export/_export.py
@@ -200,7 +200,7 @@ def _write_atoms(
                 top.write(
                     f"{atom.index :6d} "
                     f"{mapping_to_reduced_atom_types[atom.atom_type] :6s}"
-                    f"{atom.residue_index :8d} "
+                    f"{atom.residue_index % 100_000 :8d} "
                     f"{atom.residue_name :8s} "
                     f"{atom.name :6s}"
                     f"{atom.charge_group_number :6d}"
@@ -211,7 +211,7 @@ def _write_atoms(
                 top.write(
                     f"{atom.index :6d} "
                     f"{atom.atom_type :6s}"
-                    f"{atom.residue_index :8d} "
+                    f"{atom.residue_index % 100_000 :8d} "
                     f"{atom.residue_name :8s} "
                     f"{atom.name :6s}"
                     f"{atom.charge_group_number :6d}"
@@ -448,15 +448,17 @@ def _write_gro(self, gro, decimal: int):
         for molecule_name, molecule in self.system.molecule_types.items():
             n_copies = self.system.molecules[molecule_name]
 
-            for _ in range(n_copies):
+            for copy_index in range(n_copies):
+
                 for atom in molecule.atoms:
+
                     gro.write(
                         f"%5d%-5s%5s%5d"
                         f"%{decimal + 5}.{decimal}f"
                         f"%{decimal + 5}.{decimal}f"
                         f"%{decimal + 5}.{decimal}f\n"
                         % (
-                            atom.residue_index,  # This needs to be looked up from a different data structure
+                            (atom.residue_index + copy_index) % 100_000,
                             atom.residue_name[:5],
                             atom.name[:5],
                             (count + 1) % 100000,