Add array type handling for normalization #835

Merged (14 commits, Jan 6, 2025)
Changes from all commits
1 change: 0 additions & 1 deletion .github/workflows/run_notebooks.yml
@@ -1,7 +1,6 @@
name: Run Notebooks

on:
  - push
  - pull_request

jobs:
9 changes: 7 additions & 2 deletions ehrapy/_compat.py
@@ -1,7 +1,6 @@
# We may need to check whether an object is an instance of dask.array.Array
# without requiring dask to be installed in the environment.
# This would become obsolete should dask become a requirement for ehrapy.

from collections.abc import Callable

try:
    import dask.array as da
@@ -11,6 +10,12 @@
    DASK_AVAILABLE = False


def _raise_array_type_not_implemented(func: Callable, type_: type) -> NotImplementedError:
    raise NotImplementedError(
        f"{func.__name__} does not support array type {type_}. Must be of type {func.registry.keys()}."  # type: ignore
    )


def is_dask_array(array):
    if DASK_AVAILABLE:
        return isinstance(array, da.Array)
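
For context, this helper is meant to be called from the fallback implementation of a functools.singledispatch function, so that every normalization helper reports unsupported array types the same way. A minimal sketch of the pattern, assuming ehrapy is installed (the _demo_norm_function name is hypothetical, not part of the PR):

from functools import singledispatch

import numpy as np

from ehrapy._compat import _raise_array_type_not_implemented


@singledispatch
def _demo_norm_function(arr, **kwargs):
    # Fallback: no implementation is registered for type(arr).
    _raise_array_type_not_implemented(_demo_norm_function, type(arr))


@_demo_norm_function.register
def _(arr: np.ndarray, **kwargs):
    return arr  # a registered implementation takes precedence over the fallback


_demo_norm_function(np.zeros(3))  # dispatches to the ndarray implementation
_demo_norm_function([1.0, 2.0])  # raises NotImplementedError listing the registered types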
130 changes: 105 additions & 25 deletions ehrapy/preprocessing/_normalization.py
@@ -1,16 +1,22 @@
from __future__ import annotations

from functools import singledispatch
from typing import TYPE_CHECKING

import numpy as np
import sklearn.preprocessing as sklearn_pp

from ehrapy._compat import is_dask_array
from ehrapy._compat import _raise_array_type_not_implemented

try:
    import dask.array as da
    import dask_ml.preprocessing as daskml_pp

    DASK_AVAILABLE = True
except ImportError:
    daskml_pp = None
    DASK_AVAILABLE = False


from ehrapy.anndata.anndata_ext import (
    assert_numeric_vars,
@@ -69,6 +75,23 @@ def _scale_func_group(
    return None


@singledispatch
def _scale_norm_function(arr, **kwargs):
    _raise_array_type_not_implemented(_scale_norm_function, type(arr))


@_scale_norm_function.register
def _(arr: np.ndarray, **kwargs):
    return sklearn_pp.StandardScaler(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_scale_norm_function.register
    def _(arr: da.Array, **kwargs):
        return daskml_pp.StandardScaler(**kwargs).fit_transform


def scale_norm(
    adata: AnnData,
    vars: str | Sequence[str] | None = None,
@@ -98,10 +121,7 @@ def scale_norm(
    >>> adata_norm = ep.pp.scale_norm(adata, copy=True)
    """

    if is_dask_array(adata.X):
        scale_func = daskml_pp.StandardScaler(**kwargs).fit_transform
    else:
        scale_func = sklearn_pp.StandardScaler(**kwargs).fit_transform
    scale_func = _scale_norm_function(adata.X, **kwargs)

    return _scale_func_group(
        adata=adata,
@@ -113,6 +133,23 @@
    )


@singledispatch
def _minmax_norm_function(arr, **kwargs):
    _raise_array_type_not_implemented(_minmax_norm_function, type(arr))


@_minmax_norm_function.register
def _(arr: np.ndarray, **kwargs):
    return sklearn_pp.MinMaxScaler(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_minmax_norm_function.register
    def _(arr: da.Array, **kwargs):
        return daskml_pp.MinMaxScaler(**kwargs).fit_transform


def minmax_norm(
    adata: AnnData,
    vars: str | Sequence[str] | None = None,
@@ -143,10 +180,7 @@ def minmax_norm(
    >>> adata_norm = ep.pp.minmax_norm(adata, copy=True)
    """

    if is_dask_array(adata.X):
        scale_func = daskml_pp.MinMaxScaler(**kwargs).fit_transform
    else:
        scale_func = sklearn_pp.MinMaxScaler(**kwargs).fit_transform
    scale_func = _minmax_norm_function(adata.X, **kwargs)

    return _scale_func_group(
        adata=adata,
@@ -158,6 +192,16 @@
    )


@singledispatch
def _maxabs_norm_function(arr):
    _raise_array_type_not_implemented(_maxabs_norm_function, type(arr))


@_maxabs_norm_function.register
def _(arr: np.ndarray):
    return sklearn_pp.MaxAbsScaler().fit_transform
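
dask-ml provides no MaxAbsScaler, so only the NumPy implementation is registered and dask arrays now reach the singledispatch fallback instead of the previous hand-written check. A sketch of the resulting behavior, assuming dask is installed:

import dask.array as da
import numpy as np

_maxabs_norm_function(np.ones((4, 2)))  # returns sklearn_pp.MaxAbsScaler().fit_transform
_maxabs_norm_function(da.ones((4, 2)))  # raises NotImplementedError via the fallback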


def maxabs_norm(
    adata: AnnData,
    vars: str | Sequence[str] | None = None,
@@ -184,10 +228,8 @@
    >>> adata = ep.dt.mimic_2(encoded=True)
    >>> adata_norm = ep.pp.maxabs_norm(adata, copy=True)
    """
    if is_dask_array(adata.X):
        raise NotImplementedError("MaxAbsScaler is not implemented in dask_ml.")
    else:
        scale_func = sklearn_pp.MaxAbsScaler().fit_transform

    scale_func = _maxabs_norm_function(adata.X)

    return _scale_func_group(
        adata=adata,
@@ -199,6 +241,23 @@
    )


@singledispatch
def _robust_scale_norm_function(arr, **kwargs):
    _raise_array_type_not_implemented(_robust_scale_norm_function, type(arr))


@_robust_scale_norm_function.register
def _(arr: np.ndarray, **kwargs):
    return sklearn_pp.RobustScaler(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_robust_scale_norm_function.register
    def _(arr: da.Array, **kwargs):
        return daskml_pp.RobustScaler(**kwargs).fit_transform


def robust_scale_norm(
    adata: AnnData,
    vars: str | Sequence[str] | None = None,
@@ -229,10 +288,8 @@ def robust_scale_norm(
    >>> adata = ep.dt.mimic_2(encoded=True)
    >>> adata_norm = ep.pp.robust_scale_norm(adata, copy=True)
    """
    if is_dask_array(adata.X):
        scale_func = daskml_pp.RobustScaler(**kwargs).fit_transform
    else:
        scale_func = sklearn_pp.RobustScaler(**kwargs).fit_transform

    scale_func = _robust_scale_norm_function(adata.X, **kwargs)

    return _scale_func_group(
        adata=adata,
@@ -244,6 +301,23 @@
    )


@singledispatch
def _quantile_norm_function(arr, **kwargs):
    _raise_array_type_not_implemented(_quantile_norm_function, type(arr))


@_quantile_norm_function.register
def _(arr: np.ndarray, **kwargs):
    return sklearn_pp.QuantileTransformer(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_quantile_norm_function.register
    def _(arr: da.Array, **kwargs):
        return daskml_pp.QuantileTransformer(**kwargs).fit_transform


def quantile_norm(
    adata: AnnData,
    vars: str | Sequence[str] | None = None,
@@ -273,10 +347,8 @@ def quantile_norm(
    >>> adata = ep.dt.mimic_2(encoded=True)
    >>> adata_norm = ep.pp.quantile_norm(adata, copy=True)
    """
    if is_dask_array(adata.X):
        scale_func = daskml_pp.QuantileTransformer(**kwargs).fit_transform
    else:
        scale_func = sklearn_pp.QuantileTransformer(**kwargs).fit_transform

    scale_func = _quantile_norm_function(adata.X, **kwargs)

    return _scale_func_group(
        adata=adata,
@@ -288,6 +360,16 @@
    )


@singledispatch
def _power_norm_function(arr, **kwargs):
    _raise_array_type_not_implemented(_power_norm_function, type(arr))


@_power_norm_function.register
def _(arr: np.ndarray, **kwargs):
    return sklearn_pp.PowerTransformer(**kwargs).fit_transform
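
Likewise, dask-ml has no PowerTransformer, so only the NumPy path is registered and dask arrays raise NotImplementedError through the fallback. If eager computation is acceptable, one possible workaround (a sketch, not part of this PR) is to materialize the array first:

import dask.array as da
import sklearn.preprocessing as sklearn_pp

lazy = da.random.random((100, 4), chunks=(50, 4))
dense = lazy.compute()  # materializes the dask array as an in-memory NumPy array
transformed = sklearn_pp.PowerTransformer().fit_transform(dense)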


def power_norm(
    adata: AnnData,
    vars: str | Sequence[str] | None = None,
@@ -317,10 +399,8 @@ def power_norm(
    >>> adata = ep.dt.mimic_2(encoded=True)
    >>> adata_norm = ep.pp.power_norm(adata, copy=True)
    """
    if is_dask_array(adata.X):
        raise NotImplementedError("dask-ml has no PowerTransformer, this is only available in scikit-learn")
    else:
        scale_func = sklearn_pp.PowerTransformer(**kwargs).fit_transform

    scale_func = _power_norm_function(adata.X, **kwargs)

    return _scale_func_group(
        adata=adata,
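Taken together, the public normalization functions now accept dask-backed AnnData objects wherever dask-ml offers a matching scaler. A hypothetical end-to-end sketch (the exact AnnData setup ehrapy expects, e.g. encoding metadata, may differ):

import anndata as ad
import dask.array as da
import ehrapy as ep
import numpy as np

X = da.from_array(np.random.rand(100, 5), chunks=(50, 5))
adata = ad.AnnData(X=X)
adata_norm = ep.pp.scale_norm(adata, copy=True)  # dispatches to dask-ml's StandardScaler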
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -72,7 +72,7 @@ medcat = [
"medcat",
]
dask = [
"dask",
"anndata[dask]",
"dask-ml",
]
dev = [
@@ -136,7 +136,8 @@ filterwarnings = [
"ignore:`flavor='seurat_v3'` expects raw count data, but non-integers were found:UserWarning",
"ignore:All-NaN slice encountered:RuntimeWarning",
"ignore:Observation names are not unique. To make them unique, call `.obs_names_make_unique`.:UserWarning",
"ignore:Trying to modify attribute .var of view"
"ignore:Trying to modify attribute `.var` of view, initializing view as actual.:anndata.ImplicitModificationWarning",
"ignore:Transforming to str index.:anndata.ImplicitModificationWarning:"
]
minversion = 6.0
norecursedirs = [ '.*', 'build', 'dist', '*.egg', 'data', '__pycache__']
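
With these packaging changes, installing the dask extra (pip install "ehrapy[dask]") pulls in anndata[dask] and dask-ml, which the dispatch code above relies on; the two new filterwarnings entries silence anndata ImplicitModificationWarning messages that would otherwise surface in the test suite.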