Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add replace option to subsample and rename function to sample #943

Merged
merged 41 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
bf922e1
Add replace option to subsample.
gokceneraslan Dec 2, 2019
32baba1
Merge branch 'master' into withreplacement
gokceneraslan Apr 20, 2020
671ec71
Add sc.pp.sample with axis argument.
gokceneraslan Apr 20, 2020
9e0739b
Fix fraction doc
gokceneraslan Apr 20, 2020
8ec8cf3
Add to release notes
gokceneraslan Apr 20, 2020
cdf4c65
Merge branch 'main' into withreplacement
flying-sheep Nov 14, 2024
fdf524a
refactor
flying-sheep Nov 14, 2024
061a19d
Refactor tests
flying-sheep Nov 14, 2024
8528f2d
Merge branch 'main' into withreplacement
flying-sheep Nov 14, 2024
06d4280
handle array case in test
flying-sheep Nov 14, 2024
6eeab2e
Test errors
flying-sheep Nov 14, 2024
b1f5061
prettier deprecations
flying-sheep Nov 14, 2024
cec8aff
docs
flying-sheep Nov 14, 2024
daa147e
ignore dask warning correctly
flying-sheep Nov 14, 2024
3c31abd
sig exception
flying-sheep Nov 14, 2024
d350411
WIP
flying-sheep Nov 18, 2024
f02725a
Merge branch 'main' into withreplacement
flying-sheep Nov 19, 2024
c24e9b2
remove duplicate _LegacyRandom
flying-sheep Nov 19, 2024
e246f02
undo compat thing
flying-sheep Nov 19, 2024
4ad40b7
fix backwards compat
flying-sheep Nov 19, 2024
1b8c81e
Use fake Generator
flying-sheep Nov 19, 2024
594d961
backwards compat test
flying-sheep Nov 19, 2024
00fdd77
Merge branch 'main' into withreplacement
flying-sheep Nov 21, 2024
59a171c
Fix tests for old Pythons
flying-sheep Nov 21, 2024
59adc76
test that random state is modified
flying-sheep Nov 21, 2024
ef27db0
Fix util
flying-sheep Nov 21, 2024
c471e94
types
flying-sheep Nov 21, 2024
3028dff
move deprecated stuff
flying-sheep Nov 21, 2024
f11b6ba
Use deprecation decorator
flying-sheep Nov 21, 2024
735f00a
relnote
flying-sheep Nov 21, 2024
4d54700
Merge branch 'pa/deprecated' into withreplacement
flying-sheep Nov 21, 2024
0a5b284
fix dask warning stuff
flying-sheep Nov 21, 2024
0ca9411
oops
flying-sheep Nov 21, 2024
f587cdf
Merge branch 'main' into withreplacement
flying-sheep Nov 22, 2024
396b21a
Bump numpy to version that has get_bit_generator
flying-sheep Nov 22, 2024
e3831bd
Merge branch 'main' into withreplacement
flying-sheep Dec 12, 2024
1ab8c97
update to compatible numba version
flying-sheep Dec 12, 2024
10db5be
front-load validation
flying-sheep Dec 19, 2024
8927554
add test for n=1
flying-sheep Dec 19, 2024
f163014
Merge branch 'main' into withreplacement
flying-sheep Dec 19, 2024
ce02426
use typevar
flying-sheep Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/release-latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
On master
~~~~~~~~~~

.. rubric:: New functionality

- :func:`~scanpy.pp.sample` supports both upsampling and downsampling of observations and variables. :func:`~scanpy.pp.subsample` is now deprecated.

.. rubric:: Performance

- :func:`~scanpy.pp.pca` now uses efficient implicit centering for sparse matrices. This can lead to significantly improved performance for large datasets :pr:`1066` :smaller:`A Tarashansky`
Expand Down
2 changes: 1 addition & 1 deletion scanpy/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from ._simple import filter_cells, filter_genes
from ._deprecated.highly_variable_genes import filter_genes_dispersion
from ._highly_variable_genes import highly_variable_genes
from ._simple import log1p, sqrt, scale, subsample
from ._simple import log1p, sqrt, scale, subsample, sample
from ._simple import normalize_per_cell, regress_out, downsample_counts
from ._pca import pca
from ._qc import calculate_qc_metrics
Expand Down
109 changes: 88 additions & 21 deletions scanpy/preprocessing/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,59 +743,127 @@ def scale(
return X if copy else None


def subsample(
def sample(
data: Union[AnnData, np.ndarray, spmatrix],
fraction: Optional[float] = None,
n_obs: Optional[int] = None,
n: Optional[int] = None,
random_state: AnyRandom = 0,
copy: bool = False,
replace: bool = False,
axis: int = 0,
) -> Optional[AnnData]:
"""\
Subsample to a fraction of the number of observations.
Sample observations or variables with or without replacement.

Parameters
----------
data
The (annotated) data matrix of shape `n_obs` × `n_vars`.
Rows correspond to cells and columns to genes.
fraction
Subsample to this `fraction` of the number of observations.
n_obs
Subsample to this number of observations.
Sample to this `fraction` of the number of observations or variables.
This can be larger than 1.0, if replace=True.
See `axis` and `replace`.
n
Sample to this number of observations or variables. See `axis`.
random_state
Random seed to change subsampling.
copy
If an :class:`~anndata.AnnData` is passed,
determines whether a copy is returned.
replace
If True, samples are drawn with replacement.
axis
Sample observations (axis=0) or variables (axis=1). Default is 0.

Returns
-------
Returns `X[obs_indices], obs_indices` if data is array-like, otherwise
subsamples the passed :class:`~anndata.AnnData` (`copy == False`) or
returns a subsampled copy of it (`copy == True`).
Returns `X[indices] or X[:, indices], indices` depending on the axis
argument if data is array-like, otherwise samples the passed
:class:`~anndata.AnnData` (`copy == False`) or returns a sampled
copy of it (`copy == True`).
"""
np.random.seed(random_state)
old_n_obs = data.n_obs if isinstance(data, AnnData) else data.shape[0]
if n_obs is not None:
new_n_obs = n_obs
old_n = data.shape[axis]
if axis not in (0, 1):
raise ValueError(f'`axis` must be either 0 or 1.')
if fraction is None and n is None:
raise ValueError(f'Either `fraction` or `n` must be set.')
if fraction is not None and n is not None:
raise ValueError(f'Providing both `fraction` and `n` is not allowed.')
if n is not None:
new_n = n
elif fraction is not None:
if fraction > 1 or fraction < 0:
if fraction < 0:
raise ValueError(f'`fraction needs to be nonnegative`, not {fraction}')
if not replace and fraction > 1:
raise ValueError(
f'`fraction` needs to be within [0, 1], not {fraction}'
f'If replace=False, `fraction` needs to be within [0, 1], not {fraction}'
)
new_n_obs = int(fraction * old_n_obs)
logg.debug(f'... subsampled to {new_n_obs} data points')
new_n = int(fraction * old_n)
obs_or_var_str = 'observations' if axis == 0 else 'variables'
logg.debug(f'... sampled to {new_n} {obs_or_var_str}')
else:
raise ValueError('Either pass `n_obs` or `fraction`.')
obs_indices = np.random.choice(old_n_obs, size=new_n_obs, replace=False)
indices = np.random.choice(old_n, size=new_n, replace=replace)
if isinstance(data, AnnData):
if copy:
return data[obs_indices].copy()
view = data[indices] if axis == 0 else data[:, indices]
return view.copy()
else:
data._inplace_subset_obs(obs_indices)
if axis == 0:
data._inplace_subset_obs(indices)
else:
data._inplace_subset_var(indices)
else:
X = data
return X[obs_indices], obs_indices
return X[indices] if axis == 0 else X[:, indices], indices


def subsample(
    data: Union[AnnData, np.ndarray, spmatrix],
    fraction: Optional[float] = None,
    n_obs: Optional[int] = None,
    random_state: AnyRandom = 0,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Subsample observations without replacement.

    .. warning::
       .. deprecated:: 1.4.7
          Use :func:`~scanpy.pp.sample` instead.

    This is a thin wrapper: every argument is forwarded to
    :func:`~scanpy.pp.sample`, with `n_obs` passed as `n` and with
    `replace=False` and `axis=0` fixed.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    fraction
        Fraction of the number of observations to keep.
    n_obs
        Number of observations to keep.
    random_state
        Random seed to change subsampling.
    copy
        If an :class:`~anndata.AnnData` is passed,
        determines whether a copy is returned.

    Returns
    -------
    The result of :func:`~scanpy.pp.sample` for the forwarded arguments:
    `X[obs_indices], obs_indices` if data is array-like, otherwise the
    passed :class:`~anndata.AnnData` is subsampled (`copy == False`) or
    a subsampled copy of it is returned (`copy == True`).
    """
    # Legacy behaviour is pinned here: observations only, never with replacement.
    legacy_settings = dict(replace=False, axis=0)
    return sample(
        data,
        fraction=fraction,
        n=n_obs,
        random_state=random_state,
        copy=copy,
        **legacy_settings,
    )


@deprecated_arg_names({"target_counts": "counts_per_cell"})
Expand Down Expand Up @@ -969,7 +1037,6 @@ def _downsample_array(
return col



def zscore_deprecated(X: np.ndarray) -> np.ndarray:
"""\
Z-score standardize each variable/gene in X.
Expand Down
31 changes: 25 additions & 6 deletions scanpy/tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,37 @@ def test_normalize_per_cell():
axis=1).A1.tolist()


def test_subsample():
def test_sample():
flying-sheep marked this conversation as resolved.
Show resolved Hide resolved
adata = AnnData(np.ones((200, 10)))
sc.pp.subsample(adata, n_obs=40)
sc.pp.sample(adata, n=40)
assert adata.n_obs == 40
sc.pp.subsample(adata, fraction=0.1)
sc.pp.sample(adata, fraction=0.1)
assert adata.n_obs == 4
sc.pp.sample(adata, n=201, replace=True)
assert adata.n_obs == 201
sc.pp.sample(adata, n=10, axis=1)
assert adata.n_vars == 10
sc.pp.sample(adata, n=11, axis=1, replace=True)
assert adata.n_vars == 11
sc.pp.sample(adata, fraction=2.0, axis=1, replace=True)
assert adata.n_vars == 22

adata = AnnData(sp.csr_matrix(np.ones((200, 10))))
sc.pp.sample(adata, fraction=2.0, axis=1, replace=True)
assert adata.n_vars == 20
flying-sheep marked this conversation as resolved.
Show resolved Hide resolved

def test_subsample_copy():

def test_sample_copy():
adata = AnnData(np.ones((200, 10)))
assert sc.pp.subsample(adata, n_obs=40, copy=True).shape == (40, 10)
assert sc.pp.subsample(adata, fraction=0.1, copy=True).shape == (20, 10)
assert sc.pp.sample(adata, n=40, copy=True).shape == (40, 10)
assert sc.pp.sample(adata, fraction=0.1, copy=True).shape == (20, 10)
assert sc.pp.sample(adata, fraction=0.1, copy=True).shape == (20, 10)
X = sc.pp.sample(adata, fraction=2.0, axis=1, replace=True, copy=True)
assert X.shape == (200, 20)

adata = AnnData(sp.csr_matrix(np.ones((200, 10))))
X = sc.pp.sample(adata, fraction=2.0, axis=1, replace=True, copy=True)
assert X.shape == (200, 20)


def test_scale():
Expand Down