From 2a4c999b5be700c20f583b154a65a30cbde4864a Mon Sep 17 00:00:00 2001
From: Timotej Bernat <52677403+timbernat@users.noreply.github.com>
Date: Tue, 16 Jul 2024 11:32:04 -0600
Subject: [PATCH] LAMMPS non-unique molecule ID hotfix (#1000)

* Salted hash in _write_atoms() with molecule ID to guarantee unique IDs for Molecules with identical chemical tables

* Wrote unit test to check unique molecule IDs in Interchange.to_lammps() output

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Apply suggestions from code review

Incorporated suggestions for unique mol ID unit test improvements

Co-authored-by: Matt Thompson <mattwthompson@protonmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Removed unnecessary typing imports

* Consolidated pilot_mol init and conformer generation using MoleculeWithConformer

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Updated sidelen default, added assertion to enforce test for multiple Molecules

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Wrapped mol ID + CTAB hash with hash() to produce more literal value for molecule_hash

* Re-implemented LAMMPS file molecule ID extraction with in-Python lammps interface

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Added pytest parametrization of auxiliary args in test_unique_lammps_mol_ids()

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* MAINT: Streamline, skip generating positions

* FIX: Use benzene, not fulvene

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Matt Thompson <mattwthompson@protonmail.com>
---
 .../interop/lammps/export/test_lammps.py      | 78 ++++++++++++++++++-
 .../interop/lammps/export/export.py           |  8 +-
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/openff/interchange/_tests/unit_tests/interop/lammps/export/test_lammps.py b/openff/interchange/_tests/unit_tests/interop/lammps/export/test_lammps.py
index a179aaa3c..ad5fa3330 100644
--- a/openff/interchange/_tests/unit_tests/interop/lammps/export/test_lammps.py
+++ b/openff/interchange/_tests/unit_tests/interop/lammps/export/test_lammps.py
@@ -1,11 +1,18 @@
+from pathlib import Path
+
+import lammps
 import numpy
 import pytest
-from openff.toolkit import Topology, unit
+from openff.toolkit import ForceField, Quantity, Topology, unit
+from openff.utilities import temporary_cd
 
 from openff.interchange import Interchange
 from openff.interchange._tests import MoleculeWithConformer, needs_lmp
+from openff.interchange.components.mdconfig import MDConfig
 from openff.interchange.drivers import get_lammps_energies, get_openmm_energies
 
+rng = numpy.random.default_rng(821)
+
 
 @needs_lmp
 class TestLammps:
@@ -21,7 +28,12 @@ class TestLammps:
             "C1COC(=O)O1",  # This adds an improper, i2
         ],
     )
-    def test_to_lammps_single_mols(self, mol, sage_unconstrained, n_mols):
+    def test_to_lammps_single_mols(
+        self,
+        mol: str,
+        sage_unconstrained: ForceField,
+        n_mols: int,
+    ) -> None:
         """
         Test that Interchange.to_openmm Interchange.to_lammps report sufficiently similar energies.
 
@@ -62,3 +74,65 @@ def test_to_lammps_single_mols(self, mol, sage_unconstrained, n_mols):
                 "Torsion": 0.02 * unit.kilojoule_per_mole,
             },
         )
+
+    @pytest.mark.parametrize("n_mols", [2, 3])
+    @pytest.mark.parametrize(
+        "smiles",
+        [
+            "CCO",
+            "N1CCCC1",
+            "c1ccccc1",
+        ],
+    )
+    def test_unique_lammps_mol_ids(
+        self,
+        smiles,
+        sage_unconstrained,
+        n_mols,
+    ) -> bool:
+        """Test to see if interop.lammps.export._write_atoms() writes unique ids for each distinct Molecule"""
+
+        molecule = MoleculeWithConformer.from_smiles(smiles)
+        topology = Topology.from_molecules(n_mols * [molecule])
+
+        # Just use random positions since we're testing molecule IDs, not physics
+        topology.set_positions(
+            Quantity(
+                rng.random((topology.n_atoms, 3)),
+                "nanometer",
+            ),
+        )
+        topology.box_vectors = Quantity([4, 4, 4], "nanometer")
+
+        with temporary_cd():
+            lammps_input_path = Path.cwd() / "temp.in"
+            lammps_data_path = Path.cwd() / "out.lmp"
+
+            interchange = sage_unconstrained.create_interchange(topology)
+            interchange.to_lammps(lammps_data_path)
+
+            mdconfig = MDConfig.from_interchange(interchange)
+            mdconfig.write_lammps_input(
+                interchange=interchange,
+                input_file=lammps_input_path,
+            )
+
+            # Extract molecule IDs from data file
+            with lammps.lammps(
+                cmdargs=["-screen", "none", "-log", "none"],
+            ) as lmp:
+                lmp.file(
+                    "temp.in",
+                )
+                written_mol_ids = {
+                    mol_id
+                    for _, mol_id in zip(
+                        range(lmp.get_natoms()),
+                        lmp.extract_atom("molecule"),
+                    )
+                }
+
+        # these are expected to be [1...N] for each of N molecules
+        expected_mol_ids = {i + 1 for i in range(interchange.topology.n_molecules)}
+
+        assert expected_mol_ids == written_mol_ids
diff --git a/openff/interchange/interop/lammps/export/export.py b/openff/interchange/interop/lammps/export/export.py
index e633a9681..8d2b5927c 100644
--- a/openff/interchange/interop/lammps/export/export.py
+++ b/openff/interchange/interop/lammps/export/export.py
@@ -294,7 +294,13 @@ def _write_atoms(lmp_file: IO, interchange: Interchange, atom_type_map: dict):
         atom_map[atom_index] = atom
 
     for molecule_index, molecule in enumerate(interchange.topology.molecules):
-        molecule_hash = molecule.ordered_connection_table_hash()
+        # inject mol ID into hash to allow chemically-identical Molecules
+        # to be labelled with distinct IDs
+        molecule_hash = hash(
+            f"{molecule_index}_{molecule.ordered_connection_table_hash()}",
+        )
+        # TODO: see if this fix also applied for the same issue in GROMACS/AMBER writers
+
         molecule_map[molecule_hash] = molecule_index
         for atom in molecule.atoms:
             atom_molecule_map[atom] = molecule_hash