scverse · flying-sheep · Dec 19, 2024 · Dec 2, 2019 · Apr 20, 2020 · Apr 20, 2020
diff --git a/docs/api/deprecated.md b/docs/api/deprecated.md
@@ -11,4 +11,5 @@
 
    pp.filter_genes_dispersion
    pp.normalize_per_cell
+   pp.subsample
 ```
diff --git a/docs/api/preprocessing.md b/docs/api/preprocessing.md
@@ -31,7 +31,7 @@ For visual quality control, see {func}`~scanpy.pl.highest_expr_genes` and
    pp.normalize_total
    pp.regress_out
    pp.scale
-   pp.subsample
+   pp.sample
    pp.downsample_counts
 ```
 

diff --git a/docs/release-notes/943.feature.md b/docs/release-notes/943.feature.md
@@ -0,0 +1 @@
+{func}`~scanpy.pp.sample` supports both upsampling and downsampling of observations and variables. {func}`~scanpy.pp.subsample` is now deprecated. {smaller}`G Eraslan`
diff --git a/src/scanpy/preprocessing/__init__.py b/src/scanpy/preprocessing/__init__.py
@@ -17,6 +17,7 @@
     log1p,
     normalize_per_cell,
     regress_out,
+    sample,
     sqrt,
     subsample,
 )
@@ -40,6 +41,7 @@
     "log1p",
     "normalize_per_cell",
     "regress_out",
+    "sample",
     "scale",
     "sqrt",
     "subsample",

diff --git a/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py b/src/scanpy/preprocessing/_deprecated/highly_variable_genes.py
@@ -48,18 +48,18 @@ def filter_genes_dispersion(
     """\
     Extract highly variable genes :cite:p:`Satija2015,Zheng2017`.
 
-    .. warning::
-        .. deprecated:: 1.3.6
-            Use :func:`~scanpy.pp.highly_variable_genes`
-            instead. The new function is equivalent to the present
-            function, except that
+    .. deprecated:: 1.3.6
 
-            * the new function always expects logarithmized data
-            * `subset=False` in the new function, it suffices to
-              merely annotate the genes, tools like `pp.pca` will
-              detect the annotation
-            * you can now call: `sc.pl.highly_variable_genes(adata)`
-            * `copy` is replaced by `inplace`
+       Use :func:`~scanpy.pp.highly_variable_genes`
+       instead. The new function is equivalent to the present
+       function, except that
+
+       * the new function always expects logarithmized data
+       * `subset=False` in the new function, it suffices to
+         merely annotate the genes, tools like `pp.pca` will
+         detect the annotation
+       * you can now call: `sc.pl.highly_variable_genes(adata)`
+       * `copy` is replaced by `inplace`
 
     If trying out parameters, pass the data matrix instead of AnnData.
 

diff --git a/src/scanpy/preprocessing/_simple.py b/src/scanpy/preprocessing/_simple.py
@@ -8,7 +8,7 @@
 import warnings
 from functools import singledispatch
 from itertools import repeat
-from typing import TYPE_CHECKING, TypeVar
+from typing import TYPE_CHECKING, TypeVar, overload
 
 import numba
 import numpy as np
@@ -22,6 +22,7 @@
 from .._settings import settings as sett
 from .._utils import (
     _check_array_function_arguments,
+    _resolve_axis,
     axis_sum,
     is_backed_type,
     raise_not_implemented_error_if_backed_type,
@@ -33,7 +34,6 @@
 from ._distributed import materialize_as_ndarray
 from ._utils import _to_dense
 
-# install dask if available
 try:
     import dask.array as da
 except ImportError:
@@ -49,10 +49,13 @@
 
     import pandas as pd
     from numpy.typing import NDArray
+    from scipy.sparse import csc_matrix
 
     from .._compat import DaskArray
     from .._utils import AnyRandom
 
+    CSMatrix = csr_matrix | csc_matrix
+
 
 @old_positionals(
     "min_counts", "min_genes", "max_counts", "max_genes", "inplace", "copy"
@@ -498,16 +501,16 @@
     """\
     Normalize total counts per cell.
 
-    .. warning::
-        .. deprecated:: 1.3.7
-            Use :func:`~scanpy.pp.normalize_total` instead.
-            The new function is equivalent to the present
-            function, except that
+    .. deprecated:: 1.3.7
+
+       Use :func:`~scanpy.pp.normalize_total` instead.
+       The new function is equivalent to the present
+       function, except that
 
-            * the new function doesn't filter cells based on `min_counts`,
-              use :func:`~scanpy.pp.filter_cells` if filtering is needed.
-            * some arguments were renamed
-            * `copy` is replaced by `inplace`
+       * the new function doesn't filter cells based on `min_counts`,
+         use :func:`~scanpy.pp.filter_cells` if filtering is needed.
+       * some arguments were renamed
+       * `copy` is replaced by `inplace`
 
     Normalize each cell by total counts over all genes, so that every cell has
     the same total count after normalization.
@@ -825,18 +828,144 @@
     return np.vstack(responses_chunk_list)
 
 
+@overload
+def sample(
+    data: AnnData,
+    fraction: float | None = None,
+    *,
+    n: int | None = None,
+    random_state: AnyRandom = 0,
+    copy: Literal[False] = False,
+    replace: bool = False,
+    axis: Literal["obs", 0, "var", 1] = "obs",
+) -> None: ...
+@overload
+def sample(
+    data: AnnData,
+    fraction: float | None = None,
+    *,
+    n: int | None = None,
+    random_state: AnyRandom = 0,
+    copy: Literal[True],
+    replace: bool = False,
+    axis: Literal["obs", 0, "var", 1] = "obs",
+) -> AnnData: ...
+@overload
+def sample(
+    data: np.ndarray | CSMatrix,
+    fraction: float | None = None,
+    *,
+    n: int | None = None,
+    random_state: AnyRandom = 0,
+    copy: bool = False,
+    replace: bool = False,
+    axis: Literal["obs", 0, "var", 1] = "obs",
+) -> tuple[np.ndarray | CSMatrix, NDArray[np.int64]]: ...
+def sample(
+    data: AnnData | np.ndarray | CSMatrix,
+    fraction: float | None = None,
+    *,
+    n: int | None = None,
+    random_state: AnyRandom = 0,
+    copy: bool = False,
+    replace: bool = False,
+    axis: Literal["obs", 0, "var", 1] = "obs",
+) -> AnnData | None | tuple[np.ndarray | CSMatrix, NDArray[np.int64]]:
+    """\
+    Sample observations or variables with or without replacement.
+
+    Parameters
+    ----------
+    data
+        The (annotated) data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
+    fraction
+        Sample to this `fraction` of the number of observations or variables.
+        This can be larger than 1.0, if `replace=True`.
+        Exactly one of `fraction` and `n` must be set. See `axis` and `replace`.
+    n
+        Sample to this number of observations or variables. See `axis`.
+    random_state
+        Seed passed to :func:`numpy.random.seed`; change it to draw a different sample.
+    copy
+        If an :class:`~anndata.AnnData` is passed, return a sampled copy instead of
+        sampling in place. For arrays, additionally `.copy()` the returned subset.
+    replace
+        If True, samples are drawn with replacement.
+    axis
+        Sample `obs`\\ ervations (axis 0) or `var`\\ iables (axis 1).
+
+    Returns
+    -------
+    If `isinstance(data, AnnData)` and `copy=False`,
+    this function returns `None`. Otherwise:
+
+    `data[indices, :]` | `data[:, indices]` (depending on `axis`)
+        If `data` is array-like or `copy=True`, returns the subset.
+    `indices` : numpy.ndarray
+        If `data` is array-like, also returns the indices into the original.
+    """
+    axis, axis_name = _resolve_axis(axis)  # NOTE(review): presumably (0|1, "obs"|"var")
+    old_n = data.shape[axis]  # current length of the axis being sampled
+    match (fraction, n):
+        case (None, None):
+            msg = "Either `fraction` or `n` must be set."
+            raise TypeError(msg)
+        case (None, _):
+            pass  # only `n` was given: use it unchanged
+        case (_, None):
+            if fraction < 0:
+                msg = f"`{fraction=}` needs to be nonnegative."
+                raise ValueError(msg)
+            if not replace and fraction > 1:
+                msg = f"If `replace=False`, `{fraction=}` needs to be within [0, 1]."
+                raise ValueError(msg)
+            n = int(fraction * old_n)  # `int` truncates the fractional part
+            logg.debug(f"... sampled to {n} {axis_name}")
+        case _:
+            msg = "Providing both `fraction` and `n` is not allowed."
+            raise TypeError(msg)
+    del fraction  # from here on only `n` is used
+
+    np.random.seed(random_state)  # NOTE(review): reseeds NumPy's global RNG
+    indices = np.random.choice(old_n, size=n, replace=replace)
+    subset = data[indices] if axis_name == "obs" else data[:, indices]
+
+    if not isinstance(data, AnnData):  # array input: return subset and indices
+        assert not isinstance(subset, AnnData)
+        if copy:
+            subset = subset.copy()
+        return subset, indices
+    assert isinstance(subset, AnnData)
+    if copy:  # AnnData with `copy=True`: return the sampled object
+        return subset.to_memory() if data.isbacked else subset.copy()
+
+    # in-place: subset the passed AnnData itself and implicitly return None
+    if data.isbacked:
+        msg = "Inplace sampling (`copy=False`) is not implemented for backed objects."
+        raise NotImplementedError(msg)
+    if axis_name == "obs":
+        data._inplace_subset_obs(indices)
+    else:
+        data._inplace_subset_var(indices)
+
+
 @old_positionals("n_obs", "random_state", "copy")
 def subsample(
-    data: AnnData | np.ndarray | spmatrix,
+    data: AnnData | np.ndarray | CSMatrix,
     fraction: float | None = None,
     *,
     n_obs: int | None = None,
     random_state: AnyRandom = 0,
     copy: bool = False,
-) -> AnnData | tuple[np.ndarray | spmatrix, NDArray[np.int64]] | None:
+) -> AnnData | tuple[np.ndarray | CSMatrix, NDArray[np.int64]] | None:
     """\
     Subsample to a fraction of the number of observations.
 
+    .. deprecated:: 1.11.0
+
+       Use :func:`~scanpy.pp.sample` instead.
+
     Parameters
     ----------
     data
@@ -858,34 +987,15 @@
     subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or
     returns a subsampled copy of it (`copy == True`).
     """
-    np.random.seed(random_state)
-    old_n_obs = data.n_obs if isinstance(data, AnnData) else data.shape[0]
-    if n_obs is not None:
-        new_n_obs = n_obs
-    elif fraction is not None:
-        if fraction > 1 or fraction < 0:
-            raise ValueError(f"`fraction` needs to be within [0, 1], not {fraction}")
-        new_n_obs = int(fraction * old_n_obs)
-        logg.debug(f"... subsampled to {new_n_obs} data points")
-    else:
-        raise ValueError("Either pass `n_obs` or `fraction`.")
-    obs_indices = np.random.choice(old_n_obs, size=new_n_obs, replace=False)
-    if isinstance(data, AnnData):
-        if data.isbacked:
-            if copy:
-                return data[obs_indices].to_memory()
-            else:
-                raise NotImplementedError(
-                    "Inplace subsampling is not implemented for backed objects."
-                )
-        else:
-            if copy:
-                return data[obs_indices].copy()
-            else:
-                data._inplace_subset_obs(obs_indices)
-    else:
-        X = data
-        return X[obs_indices], obs_indices
+    return sample(
+        data=data,
+        fraction=fraction,
+        n=n_obs,
+        random_state=random_state,
+        copy=copy,
+        replace=False,
+        axis=0,
+    )
 
 
 @renamed_arg("target_counts", "counts_per_cell")

diff --git a/tests/test_package_structure.py b/tests/test_package_structure.py
@@ -138,6 +138,7 @@ class ExpectedSig(TypedDict):
 copy_sigs["sc.pp.filter_cells"] = None  # unclear `inplace` situation
 copy_sigs["sc.pp.filter_genes"] = None  # unclear `inplace` situation
 copy_sigs["sc.pp.subsample"] = None  # returns indices along matrix
+copy_sigs["sc.pp.sample"] = None  # returns indices along matrix
 # partial exceptions: “data” instead of “adata”
 copy_sigs["sc.pp.log1p"]["first_name"] = "data"
 copy_sigs["sc.pp.normalize_per_cell"]["first_name"] = "data"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{func}`~scanpy.pp.sample` supports both upsampling and downsampling of observations and variables. {func}`~scanpy.pp.subsample` is now deprecated. {smaller}`G Eraslan`