Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add array type handling for normalization #835

Merged
merged 14 commits into from
Jan 6, 2025
Merged
128 changes: 104 additions & 24 deletions ehrapy/preprocessing/_normalization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from functools import singledispatch
from typing import TYPE_CHECKING

import numpy as np
Expand All @@ -8,9 +9,14 @@
from ehrapy._compat import is_dask_array

try:
import dask.array as da
import dask_ml.preprocessing as daskml_pp

DASK_AVAILABLE = True
Zethson marked this conversation as resolved.
Show resolved Hide resolved
except ImportError:
daskml_pp = None
DASK_AVAILABLE = False


from ehrapy.anndata.anndata_ext import (
assert_numeric_vars,
Expand Down Expand Up @@ -69,6 +75,23 @@ def _scale_func_group(
return None


@singledispatch
Zethson marked this conversation as resolved.
Show resolved Hide resolved
def _scale_norm_function(arr):
raise NotImplementedError(f"scale_norm does not support data to be of type {type(arr)}")


@_scale_norm_function.register
def _(arr: np.ndarray, **kwargs):
return sklearn_pp.StandardScaler(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_scale_norm_function.register(da.Array)
    def _(arr, **kwargs):
        # Out-of-core arrays delegate to dask-ml's StandardScaler.
        scaler = daskml_pp.StandardScaler(**kwargs)
        return scaler.fit_transform


def scale_norm(
adata: AnnData,
vars: str | Sequence[str] | None = None,
Expand Down Expand Up @@ -98,10 +121,7 @@ def scale_norm(
>>> adata_norm = ep.pp.scale_norm(adata, copy=True)
"""

if is_dask_array(adata.X):
scale_func = daskml_pp.StandardScaler(**kwargs).fit_transform
else:
scale_func = sklearn_pp.StandardScaler(**kwargs).fit_transform
scale_func = _scale_norm_function(adata.X, **kwargs)

return _scale_func_group(
adata=adata,
Expand All @@ -113,6 +133,23 @@ def scale_norm(
)


@singledispatch
def _minmax_norm_function(arr):
raise NotImplementedError(f"minmax_norm does not support data to be of type {type(arr)}")
Zethson marked this conversation as resolved.
Show resolved Hide resolved


@_minmax_norm_function.register
def _(arr: np.ndarray, **kwargs):
return sklearn_pp.MinMaxScaler(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_minmax_norm_function.register(da.Array)
    def _(arr, **kwargs):
        # Out-of-core arrays delegate to dask-ml's MinMaxScaler.
        scaler = daskml_pp.MinMaxScaler(**kwargs)
        return scaler.fit_transform


def minmax_norm(
adata: AnnData,
vars: str | Sequence[str] | None = None,
Expand Down Expand Up @@ -143,10 +180,7 @@ def minmax_norm(
>>> adata_norm = ep.pp.minmax_norm(adata, copy=True)
"""

if is_dask_array(adata.X):
scale_func = daskml_pp.MinMaxScaler(**kwargs).fit_transform
else:
scale_func = sklearn_pp.MinMaxScaler(**kwargs).fit_transform
scale_func = _minmax_norm_function(adata.X, **kwargs)

return _scale_func_group(
adata=adata,
Expand All @@ -158,6 +192,16 @@ def minmax_norm(
)


@singledispatch
def _maxabs_norm_function(arr):
raise NotImplementedError(f"maxabs_norm does not support data to be of type {type(arr)}")


@_maxabs_norm_function.register
def _(arr: np.ndarray):
return sklearn_pp.MaxAbsScaler().fit_transform


def maxabs_norm(
adata: AnnData,
vars: str | Sequence[str] | None = None,
Expand All @@ -184,10 +228,8 @@ def maxabs_norm(
>>> adata = ep.dt.mimic_2(encoded=True)
>>> adata_norm = ep.pp.maxabs_norm(adata, copy=True)
"""
if is_dask_array(adata.X):
raise NotImplementedError("MaxAbsScaler is not implemented in dask_ml.")
else:
scale_func = sklearn_pp.MaxAbsScaler().fit_transform

scale_func = _maxabs_norm_function(adata.X)

return _scale_func_group(
adata=adata,
Expand All @@ -199,6 +241,23 @@ def maxabs_norm(
)


@singledispatch
def _robust_scale_norm_function(arr, **kwargs):
raise NotImplementedError(f"robust_scale_norm does not support data to be of type {type(arr)}")


@_robust_scale_norm_function.register
def _(arr: np.ndarray, **kwargs):
return sklearn_pp.RobustScaler(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_robust_scale_norm_function.register(da.Array)
    def _(arr, **kwargs):
        # Out-of-core arrays delegate to dask-ml's RobustScaler.
        scaler = daskml_pp.RobustScaler(**kwargs)
        return scaler.fit_transform


def robust_scale_norm(
adata: AnnData,
vars: str | Sequence[str] | None = None,
Expand Down Expand Up @@ -229,10 +288,8 @@ def robust_scale_norm(
>>> adata = ep.dt.mimic_2(encoded=True)
>>> adata_norm = ep.pp.robust_scale_norm(adata, copy=True)
"""
if is_dask_array(adata.X):
scale_func = daskml_pp.RobustScaler(**kwargs).fit_transform
else:
scale_func = sklearn_pp.RobustScaler(**kwargs).fit_transform

scale_func = _robust_scale_norm_function(adata.X, **kwargs)

return _scale_func_group(
adata=adata,
Expand All @@ -244,6 +301,23 @@ def robust_scale_norm(
)


@singledispatch
def _quantile_norm_function(arr):
raise NotImplementedError(f"robust_scale_norm does not support data to be of type {type(arr)}")


@_quantile_norm_function.register
def _(arr: np.ndarray, **kwargs):
return sklearn_pp.QuantileTransformer(**kwargs).fit_transform


if DASK_AVAILABLE:

    @_quantile_norm_function.register(da.Array)
    def _(arr, **kwargs):
        # Out-of-core arrays delegate to dask-ml's QuantileTransformer.
        transformer = daskml_pp.QuantileTransformer(**kwargs)
        return transformer.fit_transform


def quantile_norm(
adata: AnnData,
vars: str | Sequence[str] | None = None,
Expand Down Expand Up @@ -273,10 +347,8 @@ def quantile_norm(
>>> adata = ep.dt.mimic_2(encoded=True)
>>> adata_norm = ep.pp.quantile_norm(adata, copy=True)
"""
if is_dask_array(adata.X):
scale_func = daskml_pp.QuantileTransformer(**kwargs).fit_transform
else:
scale_func = sklearn_pp.QuantileTransformer(**kwargs).fit_transform

scale_func = _quantile_norm_function(adata.X, **kwargs)

return _scale_func_group(
adata=adata,
Expand All @@ -288,6 +360,16 @@ def quantile_norm(
)


@singledispatch
def _power_norm_function(arr, **kwargs):
raise NotImplementedError(f"power_norm does not support data to be of type {type(arr)}")


@_power_norm_function.register
def _(arr: np.ndarray, **kwargs):
return sklearn_pp.PowerTransformer(**kwargs).fit_transform


def power_norm(
adata: AnnData,
vars: str | Sequence[str] | None = None,
Expand Down Expand Up @@ -317,10 +399,8 @@ def power_norm(
>>> adata = ep.dt.mimic_2(encoded=True)
>>> adata_norm = ep.pp.power_norm(adata, copy=True)
"""
if is_dask_array(adata.X):
raise NotImplementedError("dask-ml has no PowerTransformer, this is only available in scikit-learn")
else:
scale_func = sklearn_pp.PowerTransformer(**kwargs).fit_transform

scale_func = _power_norm_function(adata.X, **kwargs)

return _scale_func_group(
adata=adata,
Expand Down
Loading
Loading