Merge pull request #100 from PyLops/cupy
Enable use of cupy arrays
mrava87 authored Jun 2, 2024
2 parents f628bd3 + 45186ef commit fbf892e
Showing 11 changed files with 216 additions and 65 deletions.
85 changes: 85 additions & 0 deletions docs/source/gpu.rst
@@ -0,0 +1,85 @@
.. _gpu:

GPU Support
===========

Overview
--------
PyLops-mpi supports computations on GPUs by leveraging the GPU backend of PyLops. Under the hood,
`CuPy <https://cupy.dev/>`_ (``cupy-cudaXX>=v13.0.0``) is used to perform all of the operations.
This library must be installed *before* PyLops-mpi is installed.
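
A quick way to verify that CuPy is installed correctly and can see a GPU is the following
minimal sanity check (not part of the PyLops-mpi installation procedure itself):

.. code-block:: python

    import cupy as cp

    # number of visible CUDA devices; raises if the CUDA runtime is unusable
    print(cp.cuda.runtime.getDeviceCount())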

.. note::

    Set the environment variable ``CUPY_PYLOPS=0`` to force PyLops to ignore the ``cupy`` backend.
    This can also be useful when a previous (or faulty) version of ``cupy`` is installed on your
    system, which would otherwise raise an error when importing PyLops.
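
For instance, the variable can be set directly from Python, as long as this is done before
PyLops is first imported; a minimal sketch:

.. code-block:: python

    import os

    # must be set before the first import of pylops / pylops_mpi
    os.environ["CUPY_PYLOPS"] = "0"

    import pylops
    import pylops_mpi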


The :class:`pylops_mpi.DistributedArray` and :class:`pylops_mpi.StackedDistributedArray` objects can be
created from both ``numpy``- and ``cupy``-based local arrays, and all of the operators and solvers in
PyLops-mpi can handle both scenarios. Note that, since most operators in PyLops-mpi are thin wrappers
around PyLops operators, PyLops operators that lack a GPU implementation cannot be used in PyLops-mpi
either when working with ``cupy`` arrays.
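
Whether a given array lives on the CPU or on the GPU can be checked via its ``engine``
property; the engine is also inferred automatically when an existing local array is
distributed with ``to_dist``. A minimal sketch, assuming a CuPy-enabled installation:

.. code-block:: python

    import cupy as cp
    import pylops_mpi

    x = pylops_mpi.DistributedArray(global_shape=100, engine="cupy")
    print(x.engine)  # "cupy"

    y = pylops_mpi.DistributedArray.to_dist(cp.ones(100))
    print(y.engine)  # "cupy"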


Example
-------

Let's now briefly look at an example. First, we write a code snippet using
``numpy`` arrays, which PyLops-mpi will run on your CPU:

.. code-block:: python

    import numpy as np
    import pylops
    import pylops_mpi
    from mpi4py import MPI

    # MPI helpers
    comm = MPI.COMM_WORLD
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    # Create distributed data (broadcast)
    nxl, nt = 20, 20
    dtype = np.float32
    d_dist = pylops_mpi.DistributedArray(global_shape=nxl * nt,
                                         partition=pylops_mpi.Partition.BROADCAST,
                                         engine="numpy", dtype=dtype)
    d_dist[:] = np.ones(d_dist.local_shape, dtype=dtype)

    # Create and apply VStack operator
    Sop = pylops.MatrixMult(np.ones((nxl, nxl)), otherdims=(nt, ))
    HOp = pylops_mpi.MPIVStack(ops=[Sop, ])
    y_dist = HOp @ d_dist

Now we write a code snippet using ``cupy`` arrays, which PyLops-mpi will run on
your GPU:

.. code-block:: python

    import cupy as cp
    import numpy as np
    import pylops
    import pylops_mpi
    from mpi4py import MPI

    # MPI helpers
    comm = MPI.COMM_WORLD
    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    # Define the GPU to use for this rank
    cp.cuda.Device(device=rank).use()

    # Create distributed data (broadcast)
    nxl, nt = 20, 20
    dtype = np.float32
    d_dist = pylops_mpi.DistributedArray(global_shape=nxl * nt,
                                         partition=pylops_mpi.Partition.BROADCAST,
                                         engine="cupy", dtype=dtype)
    d_dist[:] = cp.ones(d_dist.local_shape, dtype=dtype)

    # Create and apply VStack operator
    Sop = pylops.MatrixMult(cp.ones((nxl, nxl)), otherdims=(nt, ))
    HOp = pylops_mpi.MPIVStack(ops=[Sop, ])
    y_dist = HOp @ d_dist

The code is almost unchanged apart from the fact that we now use ``cupy`` arrays;
PyLops-mpi will figure this out!
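
Once the computation is done, the result on each rank lives on its GPU; if needed, it can be
inspected through the ``local_array`` view and copied back to the host, for example:

.. code-block:: python

    y_local = y_dist.local_array   # cupy array on this rank's GPU
    y_host = cp.asnumpy(y_local)   # numpy copy on the CPU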

.. note::

    The CuPy backend is under active development, and many examples have not yet made it into the
    documentation. You can find several `other examples <https://github.com/PyLops/pylops_notebooks/tree/master/developement-mpi/Cupy_MPI>`_
    in the `PyLops Notebooks repository <https://github.com/PyLops/pylops_notebooks>`_.
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -71,6 +71,7 @@ class and implementing the ``_matvec`` and ``_rmatvec``.

self
installation.rst
gpu.rst

.. toctree::
:maxdepth: 2
2 changes: 1 addition & 1 deletion examples/plot_stacked_array.py
@@ -1,6 +1,6 @@
"""
Stacked Array
=========================
=============
This example shows how to use the :py:class:`pylops_mpi.StackedDistributedArray`.
This class provides a way to combine and act on multiple :py:class:`pylops_mpi.DistributedArray`
within the same program. This is very useful in scenarios where an array can be logically
25 changes: 24 additions & 1 deletion pylops_mpi/DistributedArray.py
@@ -5,6 +5,7 @@
from enum import Enum

from pylops.utils import DTypeLike, NDArray
from pylops.utils.backend import get_module, get_array_module, get_module_name


class Partition(Enum):
@@ -78,6 +79,8 @@ class DistributedArray:
Axis along which distribution occurs. Defaults to ``0``.
local_shapes : :obj:`list`, optional
List of tuples representing local shapes at each rank.
engine : :obj:`str`, optional
Engine used to store array (``numpy`` or ``cupy``). Defaults to ``numpy``.
dtype : :obj:`str`, optional
Type of elements in input array. Defaults to ``numpy.float64``.
"""
@@ -86,6 +89,7 @@ def __init__(self, global_shape: Union[Tuple, Integral],
base_comm: Optional[MPI.Comm] = MPI.COMM_WORLD,
partition: Partition = Partition.SCATTER, axis: int = 0,
local_shapes: Optional[List[Tuple]] = None,
engine: Optional[str] = "numpy",
dtype: Optional[DTypeLike] = np.float64):
if isinstance(global_shape, Integral):
global_shape = (global_shape,)
@@ -103,7 +107,8 @@ def __init__(self, global_shape: Union[Tuple, Integral],
self._check_local_shapes(local_shapes)
self._local_shape = local_shapes[base_comm.rank] if local_shapes else local_split(global_shape, base_comm,
partition, axis)
self._local_array = np.empty(shape=self.local_shape, dtype=self.dtype)
self._engine = engine
self._local_array = get_module(engine).empty(shape=self.local_shape, dtype=self.dtype)

def __getitem__(self, index):
return self.local_array[index]
@@ -160,6 +165,16 @@ def local_shape(self):
"""
return self._local_shape

@property
def engine(self):
"""Engine of the Distributed array
Returns
-------
engine : :obj:`str`
"""
return self._engine

@property
def local_array(self):
"""View of the Local Array
@@ -269,6 +284,7 @@ def to_dist(cls, x: NDArray,
Axis of Distribution
local_shapes : :obj:`list`, optional
Local Shapes at each rank.
Returns
----------
dist_array : :obj:`DistributedArray`
@@ -279,6 +295,7 @@
partition=partition,
axis=axis,
local_shapes=local_shapes,
engine=get_module_name(get_array_module(x)),
dtype=x.dtype)
if partition == Partition.BROADCAST:
dist_array[:] = x
@@ -334,6 +351,7 @@ def __neg__(self):
partition=self.partition,
axis=self.axis,
local_shapes=self.local_shapes,
engine=self.engine,
dtype=self.dtype)
arr[:] = -self.local_array
return arr
@@ -365,6 +383,7 @@ def add(self, dist_array):
dtype=self.dtype,
partition=self.partition,
local_shapes=self.local_shapes,
engine=self.engine,
axis=self.axis)
SumArray[:] = self.local_array + dist_array.local_array
return SumArray
@@ -387,6 +406,7 @@ def multiply(self, dist_array):
dtype=self.dtype,
partition=self.partition,
local_shapes=self.local_shapes,
engine=self.engine,
axis=self.axis)
if isinstance(dist_array, DistributedArray):
# multiply two DistributedArray
@@ -480,6 +500,7 @@ def conj(self):
partition=self.partition,
axis=self.axis,
local_shapes=self.local_shapes,
engine=self.engine,
dtype=self.dtype)
conj[:] = self.local_array.conj()
return conj
@@ -492,6 +513,7 @@ def copy(self):
partition=self.partition,
axis=self.axis,
local_shapes=self.local_shapes,
engine=self.engine,
dtype=self.dtype)
arr[:] = self.local_array
return arr
@@ -514,6 +536,7 @@ def ravel(self, order: Optional[str] = "C"):
arr = DistributedArray(global_shape=np.prod(self.global_shape),
local_shapes=local_shapes,
partition=self.partition,
engine=self.engine,
dtype=self.dtype)
local_array = np.ravel(self.local_array, order=order)
x = local_array.copy()
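
Taken together, the changes above make ``engine`` part of a ``DistributedArray``'s state and
forward it through ``to_dist``, ``__neg__``, ``add``, ``multiply``, ``conj``, ``copy`` and
``ravel``, so arrays derived from a GPU-backed array stay on the GPU. A small sketch of the
intended behaviour, assuming only the API shown in this diff:

.. code-block:: python

    import numpy as np
    import pylops_mpi

    x = pylops_mpi.DistributedArray(global_shape=100, engine="numpy", dtype=np.float32)
    x[:] = np.ones(x.local_shape, dtype=np.float32)

    # each operation creates a new DistributedArray with engine=self.engine
    y = (-x).conj().copy()
    assert y.engine == x.engine == "numpy"
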
2 changes: 2 additions & 0 deletions pylops_mpi/LinearOperator.py
@@ -81,6 +81,7 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
base_comm=self.base_comm,
partition=x.partition,
axis=x.axis,
engine=x.engine,
dtype=self.dtype)
y[:] = self.Op._matvec(x.local_array)
return y
@@ -117,6 +118,7 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
base_comm=self.base_comm,
partition=x.partition,
axis=x.axis,
engine=x.engine,
dtype=self.dtype)
y[:] = self.Op._rmatvec(x.local_array)
return y
15 changes: 10 additions & 5 deletions pylops_mpi/basicoperators/BlockDiag.py
@@ -3,8 +3,9 @@
from mpi4py import MPI
from typing import Optional, Sequence

from pylops.utils import DTypeLike
from pylops import LinearOperator
from pylops.utils import DTypeLike
from pylops.utils.backend import get_module

from pylops_mpi import MPILinearOperator, MPIStackedLinearOperator
from pylops_mpi import DistributedArray, StackedDistributedArray
@@ -113,22 +114,26 @@ def __init__(self, ops: Sequence[LinearOperator],

@reshaped(forward=True, stacking=True)
def _matvec(self, x: DistributedArray) -> DistributedArray:
y = DistributedArray(global_shape=self.shape[0], local_shapes=self.local_shapes_n, dtype=self.dtype)
ncp = get_module(x.engine)
y = DistributedArray(global_shape=self.shape[0], local_shapes=self.local_shapes_n,
engine=x.engine, dtype=self.dtype)
y1 = []
for iop, oper in enumerate(self.ops):
y1.append(oper.matvec(x.local_array[self.mmops[iop]:
self.mmops[iop + 1]]))
y[:] = np.concatenate(y1)
y[:] = ncp.concatenate(y1)
return y

@reshaped(forward=False, stacking=True)
def _rmatvec(self, x: DistributedArray) -> DistributedArray:
y = DistributedArray(global_shape=self.shape[1], local_shapes=self.local_shapes_m, dtype=self.dtype)
ncp = get_module(x.engine)
y = DistributedArray(global_shape=self.shape[1], local_shapes=self.local_shapes_m,
engine=x.engine, dtype=self.dtype)
y1 = []
for iop, oper in enumerate(self.ops):
y1.append(oper.rmatvec(x.local_array[self.nnops[iop]:
self.nnops[iop + 1]]))
y[:] = np.concatenate(y1)
y[:] = ncp.concatenate(y1)
return y


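
The ``ncp = get_module(x.engine)`` pattern used in ``_matvec`` and ``_rmatvec`` above is
PyLops' device-dispatch idiom: ``get_module`` returns either the ``numpy`` or the ``cupy``
module, so the same ``concatenate`` call works for both backends. A standalone sketch of the
idea (the helper function is hypothetical, not part of this diff):

.. code-block:: python

    from pylops.utils.backend import get_module

    def concatenate_chunks(chunks, engine="numpy"):
        # get_module("numpy") -> numpy module, get_module("cupy") -> cupy module
        ncp = get_module(engine)
        return ncp.concatenate(chunks)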