Merge pull request #4884 from tanishy7777/parallize_rdf

p-j-smith · web-flow · commit c0685ac92b7d · 2025-10-08T18:59:54.000+01:00
Parallelizes `MDAnalysis.analysis.rdf.InterRDF` and `MDAnalysis.analysis.rdf.InterRDF_s`
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -115,9 +115,13 @@ Enhancements
    (Issue #4677, PR #4729)
  * Enables parallelization for analysis.contacts.Contacts (Issue #4660)
  * Enable parallelization for analysis.nucleicacids.NucPairDist (Issue #4670)
+ * Add check and warning for empty (all zero) coordinates in RDKit converter (PR #4824)
+ * Added `precision` for XYZWriter (Issue #4775, PR #4771)
+ * Parallelize `analysis.rdf.InterRDF` and `analysis.rdf.InterRDF_s` (Issue #4675)
+ * Added `precision` for XYZWriter (Issue #4775, PR #4771)
  * Add check and warning for empty (all zero) coordinates in RDKit converter
    (PR #4824)
- * Added `precision` for XYZWriter (Issue #4775, PR #4771) 
+ * Added `precision` for XYZWriter (Issue #4775, PR #4771)
 
 Changes
  * MDAnalysis.analysis.psa, MDAnalysis.analysis.waterdynamics and
diff --git a/package/MDAnalysis/analysis/rdf.py b/package/MDAnalysis/analysis/rdf.py
@@ -80,7 +80,43 @@
 import numpy as np
 
 from ..lib import distances
-from .base import AnalysisBase
+from .base import AnalysisBase, ResultsGroup
+
+
+def nested_array_sum(arrs):
+    r"""Custom aggregator for nested arrays
+
+    This function takes a nested list or tuple of NumPy arrays, flattens it
+    into a single list, and aggregates the elements at alternating indices
+    into two separate arrays. The first array accumulates elements at even
+    indices, while the second accumulates elements at odd indices.
+
+    Parameters
+    ----------
+    arrs : list
+        List of arrays or nested lists of arrays
+
+    Returns
+    -------
+    list of ndarray
+        A list containing two NumPy arrays:
+        - The first array is the sum of all elements at even indices
+          (0, 2, 4, ...) of the flattened list.
+        - The second array is the sum of all elements at odd indices
+          (1, 3, 5, ...) of the flattened list.
+    """
+
+    def flatten(arr):
+        if isinstance(arr, (list, tuple)):
+            return [item for sublist in arr for item in flatten(sublist)]
+        return [arr]
+
+    flat = flatten(arrs)
+    aggregated_arr = [np.zeros_like(flat[0]), np.zeros_like(flat[1])]
+    for i in range(len(flat) // 2):
+        aggregated_arr[0] += flat[2 * i]  # 0, 2, 4, ...
+        aggregated_arr[1] += flat[2 * i + 1]  # 1, 3, 5, ...
+    return aggregated_arr
 
 
 class InterRDF(AnalysisBase):
@@ -221,8 +257,23 @@ class InterRDF(AnalysisBase):
        Store results as attributes `bins`, `edges`, `rdf` and `count`
        of the `results` attribute of
        :class:`~MDAnalysis.analysis.AnalysisBase`.
+
+    .. versionchanged:: 2.9.0
+       Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
+       backends; use the new method :meth:`get_supported_backends` to see all
+       supported backends.
     """
 
+    @classmethod
+    def get_supported_backends(cls):
+        return (
+            "serial",
+            "multiprocessing",
+            "dask",
+        )
+
+    _analysis_algorithm_is_parallelizable = True
+
     def __init__(
         self,
         g1,
@@ -281,7 +332,7 @@ def _prepare(self):
 
         if self.norm == "rdf":
             # Cumulative volume for rdf normalization
-            self.volume_cum = 0
+            self.results.volume_cum = 0
         # Set the max range to filter the search radius
         self._maxrange = self.rdf_settings["range"][1]
 
@@ -311,7 +362,17 @@ def _single_frame(self):
         self.results.count += count
 
         if self.norm == "rdf":
-            self.volume_cum += self._ts.volume
+            self.results.volume_cum += self._ts.volume
+
+    def _get_aggregator(self):
+        return ResultsGroup(
+            lookup={
+                "count": ResultsGroup.ndarray_sum,
+                "volume_cum": ResultsGroup.ndarray_sum,
+                "bins": ResultsGroup.ndarray_sum,
+                "edges": ResultsGroup.ndarray_mean,
+            }
+        )
 
     def _conclude(self):
         norm = self.n_frames
@@ -333,6 +394,7 @@ def _conclude(self):
                 N -= xA * xB * nblocks
 
             # Average number density
+            self.volume_cum = self.results.volume_cum
             box_vol = self.volume_cum / self.n_frames
             norm *= N / box_vol
 
@@ -576,8 +638,32 @@ class InterRDF_s(AnalysisBase):
        Instead of `density=True` use `norm='density'`
     .. deprecated:: 2.3.0
        The `universe` parameter is superflous.
+    .. versionchanged:: 2.9.0
+       Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
+       backends; use the new method :meth:`get_supported_backends` to see all
+       supported backends.
     """
 
+    @classmethod
+    def get_supported_backends(cls):
+        return (
+            "serial",
+            "multiprocessing",
+            "dask",
+        )
+
+    _analysis_algorithm_is_parallelizable = True
+
+    def _get_aggregator(self):
+        return ResultsGroup(
+            lookup={
+                "count": nested_array_sum,
+                "volume_cum": ResultsGroup.ndarray_sum,
+                "bins": ResultsGroup.ndarray_mean,
+                "edges": ResultsGroup.ndarray_mean,
+            }
+        )
+
     def __init__(
         self,
         u,
@@ -632,7 +718,7 @@ def _prepare(self):
 
         if self.norm == "rdf":
             # Cumulative volume for rdf normalization
-            self.volume_cum = 0
+            self.results.volume_cum = 0
         self._maxrange = self.rdf_settings["range"][1]
 
     def _single_frame(self):
@@ -650,7 +736,7 @@ def _single_frame(self):
                 self.results.count[i][idx1, idx2, :] += count
 
         if self.norm == "rdf":
-            self.volume_cum += self._ts.volume
+            self.results.volume_cum += self._ts.volume
 
     def _conclude(self):
         norm = self.n_frames
@@ -661,6 +747,7 @@ def _conclude(self):
 
         if self.norm == "rdf":
             # Average number density
+            self.volume_cum = self.results.volume_cum
             norm *= 1 / (self.volume_cum / self.n_frames)
 
         # Empty lists to restore indices, RDF
diff --git a/testsuite/MDAnalysisTests/analysis/conftest.py b/testsuite/MDAnalysisTests/analysis/conftest.py
@@ -19,6 +19,7 @@
 from MDAnalysis.analysis.density import DensityAnalysis
 from MDAnalysis.analysis.lineardensity import LinearDensity
 from MDAnalysis.analysis.polymer import PersistenceLength
+from MDAnalysis.analysis.rdf import InterRDF, InterRDF_s
 from MDAnalysis.lib.util import is_installed
 
 
@@ -194,3 +195,16 @@ def client_LinearDensity(request):
 @pytest.fixture(scope="module", params=params_for_cls(PersistenceLength))
 def client_PersistenceLength(request):
     return request.param
+
+
+# MDAnalysis.analysis.rdf
+
+
+@pytest.fixture(scope="module", params=params_for_cls(InterRDF))
+def client_InterRDF(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=params_for_cls(InterRDF_s))
+def client_InterRDF_s(request):
+    return request.param
diff --git a/testsuite/MDAnalysisTests/analysis/test_rdf.py b/testsuite/MDAnalysisTests/analysis/test_rdf.py
@@ -49,83 +49,85 @@ def sels(u):
     return s1, s2
 
 
-def test_nbins(u):
+def test_nbins(u, client_InterRDF):
     s1 = u.atoms[:3]
     s2 = u.atoms[3:]
-    rdf = InterRDF(s1, s2, nbins=412).run()
+    rdf = InterRDF(s1, s2, nbins=412).run(**client_InterRDF)
 
     assert len(rdf.results.bins) == 412
 
 
-def test_range(u):
+def test_range(u, client_InterRDF):
     s1 = u.atoms[:3]
     s2 = u.atoms[3:]
     rmin, rmax = 1.0, 13.0
-    rdf = InterRDF(s1, s2, range=(rmin, rmax)).run()
+    rdf = InterRDF(s1, s2, range=(rmin, rmax)).run(**client_InterRDF)
 
     assert rdf.results.edges[0] == rmin
     assert rdf.results.edges[-1] == rmax
 
 
-def test_count_sum(sels):
+def test_count_sum(sels, client_InterRDF):
     # OW vs HW
     # should see 8 comparisons in count
     s1, s2 = sels
-    rdf = InterRDF(s1, s2).run()
+    rdf = InterRDF(s1, s2).run(**client_InterRDF)
     assert rdf.results.count.sum() == 8
 
 
-def test_count(sels):
+def test_count(sels, client_InterRDF):
     # should see two distances with 4 counts each
     s1, s2 = sels
-    rdf = InterRDF(s1, s2).run()
+    rdf = InterRDF(s1, s2).run(**client_InterRDF)
     assert len(rdf.results.count[rdf.results.count == 4]) == 2
 
 
-def test_double_run(sels):
+def test_double_run(sels, client_InterRDF):
     # running rdf twice should give the same result
     s1, s2 = sels
-    rdf = InterRDF(s1, s2).run()
-    rdf.run()
+    rdf = InterRDF(s1, s2).run(**client_InterRDF)
+    rdf.run(**client_InterRDF)
     assert len(rdf.results.count[rdf.results.count == 4]) == 2
 
 
-def test_exclusion(sels):
+def test_exclusion(sels, client_InterRDF):
     # should see two distances with 4 counts each
     s1, s2 = sels
-    rdf = InterRDF(s1, s2, exclusion_block=(1, 2)).run()
+    rdf = InterRDF(s1, s2, exclusion_block=(1, 2)).run(**client_InterRDF)
     assert rdf.results.count.sum() == 4
 
 
 @pytest.mark.parametrize(
     "attr, count", [("residue", 8), ("segment", 0), ("chain", 8)]
 )
-def test_ignore_same_residues(sels, attr, count):
+def test_ignore_same_residues(sels, attr, count, client_InterRDF):
     # should see two distances with 4 counts each
     s1, s2 = sels
-    rdf = InterRDF(s2, s2, exclude_same=attr).run()
+    rdf = InterRDF(s2, s2, exclude_same=attr).run(**client_InterRDF)
     assert rdf.rdf[0] == 0
     assert rdf.results.count.sum() == count
 
 
-def test_ignore_same_residues_fails(sels):
+def test_ignore_same_residues_fails(sels, client_InterRDF):
     s1, s2 = sels
     with pytest.raises(
         ValueError, match="The exclude_same argument to InterRDF must be"
     ):
-        InterRDF(s2, s2, exclude_same="unsupported").run()
+        InterRDF(s2, s2, exclude_same="unsupported").run(**client_InterRDF)
 
     with pytest.raises(
         ValueError,
         match="The exclude_same argument to InterRDF cannot be used with",
     ):
-        InterRDF(s2, s2, exclude_same="residue", exclusion_block=tuple()).run()
+        InterRDF(s2, s2, exclude_same="residue", exclusion_block=tuple()).run(
+            **client_InterRDF
+        )
 
 
 @pytest.mark.parametrize("attr", ("rdf", "bins", "edges", "count"))
-def test_rdf_attr_warning(sels, attr):
+def test_rdf_attr_warning(sels, attr, client_InterRDF):
     s1, s2 = sels
-    rdf = InterRDF(s1, s2).run()
+    rdf = InterRDF(s1, s2).run(**client_InterRDF)
     wmsg = f"The `{attr}` attribute was deprecated in MDAnalysis 2.0.0"
     with pytest.warns(DeprecationWarning, match=wmsg):
         getattr(rdf, attr) is rdf.results[attr]
@@ -134,18 +136,18 @@ def test_rdf_attr_warning(sels, attr):
 @pytest.mark.parametrize(
     "norm, value", [("density", 1.956823), ("rdf", 244602.88385), ("none", 4)]
 )
-def test_norm(sels, norm, value):
+def test_norm(sels, norm, value, client_InterRDF):
     s1, s2 = sels
-    rdf = InterRDF(s1, s2, norm=norm).run()
+    rdf = InterRDF(s1, s2, norm=norm).run(**client_InterRDF)
     assert_allclose(max(rdf.results.rdf), value)
 
 
 @pytest.mark.parametrize(
     "norm, norm_required", [("Density", "density"), (None, "none")]
 )
-def test_norm_values(sels, norm, norm_required):
+def test_norm_values(sels, norm, norm_required, client_InterRDF):
     s1, s2 = sels
-    rdf = InterRDF(s1, s2, norm=norm).run()
+    rdf = InterRDF(s1, s2, norm=norm).run(**client_InterRDF)
     assert rdf.norm == norm_required
 
 
diff --git a/testsuite/MDAnalysisTests/analysis/test_rdf_s.py b/testsuite/MDAnalysisTests/analysis/test_rdf_s.py