diff --git a/.github/workflows/run_notebooks.yml b/.github/workflows/run_notebooks.yml
index 9fc77c3d..b76fa9bd 100644
--- a/.github/workflows/run_notebooks.yml
+++ b/.github/workflows/run_notebooks.yml
@@ -19,7 +19,7 @@ jobs:
             # "docs/tutorials/notebooks/medcat.ipynb",
           ]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: "true"
           token: "${{ secrets.CT_SYNC_TOKEN }}"
@@ -28,11 +28,14 @@
         with:
           python-version: "3.11"

+      - name: Install UV
+        run: pip install uv
+
       - name: Install ehrapy
-        run: pip install .
+        run: uv pip install --system .

       - name: Install additional dependencies
-        run: pip install medcat cellrank
+        run: uv pip install --system cellrank

       - name: Install nbconvert ipykernel
         run: pip install nbconvert ipykernel
diff --git a/ehrapy/anndata/anndata_ext.py b/ehrapy/anndata/anndata_ext.py
index 78257424..02e330eb 100644
--- a/ehrapy/anndata/anndata_ext.py
+++ b/ehrapy/anndata/anndata_ext.py
@@ -106,7 +106,7 @@ def df_to_anndata(
     all_num = True if len(numerical_columns) == len(list(dataframes.df.columns)) else False
     X = X.astype(np.number) if all_num else X.astype(object)

-    # cast non numerical obs only columns to category or bool dtype, which is needed for writing to .h5ad files
+    # cast non-numerical obs-only columns to category or bool dtype, which is needed for writing to .h5ad files
     adata = AnnData(
         X=X,
         obs=_cast_obs_columns(dataframes.obs),
@@ -215,13 +215,14 @@ def move_to_obs(adata: AnnData, to_obs: list[str] | str, copy_obs: bool = False)
     if copy_obs:
         cols_to_obs = adata[:, cols_to_obs_indices].to_df()
         adata.obs = adata.obs.join(cols_to_obs)
-        adata.obs[var_num] = adata.obs[var_num].apply(pd.to_numeric, errors="ignore", downcast="float")
+        adata.obs[var_num] = adata.obs[var_num].apply(pd.to_numeric, downcast="float")
+        adata.obs = _cast_obs_columns(adata.obs)
     else:
         df = adata[:, cols_to_obs_indices].to_df()
         adata._inplace_subset_var(~cols_to_obs_indices)
         adata.obs = adata.obs.join(df)
-        adata.obs[var_num] = adata.obs[var_num].apply(pd.to_numeric, errors="ignore", downcast="float")
+        adata.obs[var_num] = adata.obs[var_num].apply(pd.to_numeric, downcast="float")
         adata.obs = _cast_obs_columns(adata.obs)
     return adata
diff --git a/ehrapy/preprocessing/_imputation.py b/ehrapy/preprocessing/_imputation.py
index a98b2beb..7490466e 100644
--- a/ehrapy/preprocessing/_imputation.py
+++ b/ehrapy/preprocessing/_imputation.py
@@ -70,7 +70,6 @@ def explicit_impute(
     # 1: Replace all missing values with the specified value
     if isinstance(replacement, (int, str)):
         _replace_explicit(adata.X, replacement, impute_empty_strings)
-        logg.debug(f"Imputed missing values in the AnnData object by `{replacement}`")

     # 2: Replace all missing values in a subset of columns with a specified value per column or a default value, when the column is not explicitly named
     elif isinstance(replacement, dict):
@@ -81,9 +80,6 @@
                 _replace_explicit(adata.X[:, idx : idx + 1], imputation_value, impute_empty_strings)
             else:
                 print(f"[bold yellow]No replace value passed and found for var [not bold green]{column_name}.")
-        logg.debug(
-            f"Imputed missing values in columns `{replacement.keys()}` by `{replacement.values()}` respectively."
-        )
     else:
         raise ValueError(  # pragma: no cover
             f"Type {type(replacement)} is not a valid datatype for replacement parameter. Either use int, str or a dict!"
@@ -165,16 +161,14 @@ def simple_impute(
     if strategy in {"median", "mean"}:
         try:
             _simple_impute(adata, var_names, strategy)
-            logg.debug(f"Imputed the AnnData object using `{strategy}` Imputation.")
         except ValueError:
             raise ValueError(
                 f"Can only impute numerical data using {strategy} strategy. Try to restrict imputation"
                 "to certain columns using var_names parameter or use a different mode."
             ) from None
-    # most_frequent imputation works with non numerical data as well
+    # most_frequent imputation works with non-numerical data as well
     elif strategy == "most_frequent":
         _simple_impute(adata, var_names, strategy)
-        logg.debug("Imputed the AnnData object using `most_frequent` Imputation.")
     # unknown simple imputation strategy
     else:
         raise ValueError(  # pragma: no cover
@@ -272,21 +266,11 @@ def knn_impute(
     if _check_module_importable("sklearnex"):  # pragma: no cover
         unpatch_sklearn()

-    if var_names:
-        logg.debug(
-            f"Imputed the columns `{var_names}` in the AnnData object using kNN Imputation with {n_neighbours} neighbours considered."
-        )
-    elif not var_names:
-        logg.debug(
-            f"Imputed the data in the AnnData object using kNN Imputation with {n_neighbours} neighbours considered."
-        )
-
     if copy:
         return adata


 def _knn_impute(adata: AnnData, var_names: Iterable[str] | None, n_neighbours: int) -> None:
-    """Utility function to impute data using KNN-Imputer"""
     from sklearn.impute import KNNImputer

     imputer = KNNImputer(n_neighbors=n_neighbours)
@@ -428,13 +412,6 @@ def miss_forest_impute(
     if _check_module_importable("sklearnex"):  # pragma: no cover
         unpatch_sklearn()

-    if var_names:
-        logg.debug(
-            f"Imputed the columns `{var_names}` in the AnnData object with MissForest Imputation using {num_initial_strategy} strategy."
-        )
-    elif not var_names:
-        logg.debug("Imputed the data in the AnnData object using MissForest Imputation.")
-
     if copy:
         return adata
@@ -535,15 +512,6 @@ def soft_impute(
         # decode ordinal encoding to obtain imputed original data
         adata.X[::, column_indices] = enc.inverse_transform(adata.X[::, column_indices])

-    if var_names:
-        logg.debug(
-            f"Imputed the columns `{var_names}` in the AnnData object using Soft Imputation with shrinkage value of `{shrinkage_value}`."
-        )
-    elif not var_names:
-        logg.debug(
-            f"Imputed the data in the AnnData object using Soft Imputation with shrinkage value of `{shrinkage_value}`."
-        )
-
     return adata
@@ -561,7 +529,6 @@ def _soft_impute(
     normalizer,
     verbose,
 ) -> None:
-    """Utility function to impute data using SoftImpute"""
     from fancyimpute import SoftImpute

     imputer = SoftImpute(
@@ -690,11 +657,6 @@ def iterative_svd_impute(
         # decode ordinal encoding to obtain imputed original data
         adata.X[::, column_indices] = enc.inverse_transform(adata.X[::, column_indices])

-    if var_names:
-        logg.debug(f"Imputed the columns `{var_names}` in the AnnData object using IterativeSVD Imputation.")
-    elif not var_names:
-        logg.debug("Imputed the data in the AnnData object using IterativeSVD Imputation.")
-
     return adata
@@ -711,7 +673,6 @@ def _iterative_svd_impute(
     max_value,
     verbose,
 ) -> None:
-    """Utility function to impute data using IterativeSVD"""
     from fancyimpute import IterativeSVD

     imputer = IterativeSVD(
@@ -773,7 +734,7 @@ def matrix_factorization_impute(
             Defaults to None.
         max_value: The maximum value allowed for the imputed data. Any imputed value greater than `max_value` is clipped to `max_value`.
             Defaults to None.
-        verbose: Whether or not to printout training progress. Defaults to False.
+        verbose: Whether to print out training progress. Defaults to False.
         copy: Whether to return a copy or act in place. Defaults to False.

     Returns:
@@ -827,15 +788,6 @@ def matrix_factorization_impute(
         adata.X = adata.X.astype("object")
         adata.X[::, column_indices] = enc.inverse_transform(adata.X[::, column_indices])

-    if var_names:
-        logg.debug(
-            f"Imputed the columns `{var_names}` in the AnnData object using MatrixFactorization Imputation with learning rate `{learning_rate}` and shrinkage value `{shrinkage_value}`."
-        )
-    elif not var_names:
-        logg.debug(
-            f"Imputed the data in the AnnData object using MatrixFactorization Imputation with learning rate `{learning_rate}` and shrinkage value `{shrinkage_value}`."
-        )
-
     return adata
@@ -850,7 +802,6 @@ def _matrix_factorization_impute(
     max_value,
     verbose,
 ) -> None:
-    """Utility function to impute data using MatrixFactorization"""
     from fancyimpute import MatrixFactorization

     imputer = MatrixFactorization(
@@ -949,15 +900,6 @@ def nuclear_norm_minimization_impute(
         # decode ordinal encoding to obtain imputed original data
         adata.X[::, column_indices] = enc.inverse_transform(adata.X[::, column_indices])

-    if var_names:
-        logg.debug(
-            f"Imputed the columns `{var_names}` in the AnnData object using NuclearNormMinimization Imputation with error tolerance of `{error_tolerance}`."
-        )
-    elif not var_names:
-        logg.debug(
-            f"Imputed the data in the AnnData object using NuclearNormMinimization Imputation with error tolerance of `{error_tolerance}`."
-        )
-
     return adata
@@ -971,7 +913,6 @@ def _nuclear_norm_minimization_impute(
     max_iters,
     verbose,
 ) -> None:
-    """Utility function to impute data using NuclearNormMinimization"""
     from fancyimpute import NuclearNormMinimization

     imputer = NuclearNormMinimization(
@@ -1079,22 +1020,12 @@ def mice_forest_impute(
         print("[bold red]Check that your matrix does not contain any NaN only columns!")
         raise

-    if var_names:
-        logg.debug(
-            f"Imputed the columns `{var_names}` in the AnnData object using MiceForest Imputation with `{iterations}` iterations."
-        )
-    elif not var_names:
-        logg.debug(
-            f"Imputed the data in the AnnData object using MiceForest Imputation with `{iterations}` iterations."
-        )
-
     return adata


 def _miceforest_impute(
     adata, var_names, save_all_iterations, random_state, inplace, iterations, variable_parameters, verbose
 ) -> None:
-    """Utility function to impute data using miceforest"""
     import miceforest as mf

     if isinstance(var_names, Iterable):
diff --git a/ehrapy/preprocessing/_outliers.py b/ehrapy/preprocessing/_outliers.py
index 24af0aaf..91a4aba8 100644
--- a/ehrapy/preprocessing/_outliers.py
+++ b/ehrapy/preprocessing/_outliers.py
@@ -7,14 +7,17 @@
 import scipy.stats.mstats

 if TYPE_CHECKING:
+    from collections.abc import Collection
+
     from anndata import AnnData


 def winsorize(
     adata: AnnData,
-    vars: str | list[str] | set[str] = None,
-    obs_cols: str | list[str] | set[str] = None,
-    limits: list[float] = None,
+    vars: Collection[str] = None,
+    obs_cols: Collection[str] = None,
+    *,
+    limits: tuple[float, float] = (0.01, 0.99),
     copy: bool = False,
     **kwargs,
 ) -> AnnData:
@@ -23,12 +26,12 @@

     The implementation is based on https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.winsorize.html

     Args:
-        adata: AnnData object to winsorize
-        vars: The features to winsorize.
-        obs_cols: Columns in obs with features to winsorize.
+        adata: AnnData object to winsorize.
+        vars: The features to winsorize. Defaults to None.
+        obs_cols: Columns in obs with features to winsorize. Defaults to None.
         limits: Tuple of the percentages to cut on each side of the array as floats between 0. and 1. Defaults to (0.01, 0.99)
-        copy: Whether to return a copy or not
+        copy: Whether to return a copy.
         **kwargs: Keywords arguments get passed to scipy.stats.mstats.winsorize

     Returns:
@@ -37,7 +40,7 @@
     Examples:
         >>> import ehrapy as ep
         >>> adata = ep.dt.mimic_2(encoded=True)
-        >>> ep.pp.winsorize(adata, ["bmi"])
+        >>> ep.pp.winsorize(adata, vars=["bmi"])
     """
     if copy:  # pragma: no cover
         adata = adata.copy()
@@ -61,22 +64,21 @@

 def clip_quantile(
     adata: AnnData,
-    limits: list[float],
-    vars: str | list[str] | set[str] = None,
-    obs_cols: str | list[str] | set[str] = None,
+    limits: tuple[float, float],
+    vars: Collection[str] = None,
+    obs_cols: Collection[str] = None,
+    *,
     copy: bool = False,
 ) -> AnnData:
     """Clips (limits) features.

     Given an interval, values outside the interval are clipped to the interval edges.

-    The implementation is based on https://numpy.org/doc/stable/reference/generated/numpy.clip.html
-
     Args:
-        adata: The AnnData object
-        vars: Columns in var with features to clip
+        adata: The AnnData object to clip.
+        limits: Values outside the interval are clipped to the interval edges.
+        vars: Columns in var with features to clip.
         obs_cols: Columns in obs with features to clip
-        limits: Interval, values outside of which are clipped to the interval edges
         copy: Whether to return a copy of AnnData or not

     Returns:
@@ -85,7 +87,7 @@
     Examples:
         >>> import ehrapy as ep
         >>> adata = ep.dt.mimic_2(encoded=True)
-        >>> ep.pp.clip_quantile(adata, ["bmi"])
+        >>> ep.pp.clip_quantile(adata, vars=["bmi"], limits=(10, 50))
     """

     obs_cols, vars = _validate_outlier_input(adata, obs_cols, vars)  # type: ignore
@@ -106,23 +108,10 @@
     return adata


-def _validate_outlier_input(
-    adata, obs_cols: str | list[str] | set[str], vars: str | list[str] | set[str]
-) -> tuple[set[str], set[str]]:
-    """Validates the obs/var columns for outlier preprocessing.
-
-    Args:
-        adata: AnnData object
-        obs_cols: str or list of obs columns
-        vars: str or list of var names
-
-    Returns:
-        A tuple of lists of obs/var columns
-    """
-    if isinstance(vars, str) or isinstance(vars, list):  # pragma: no cover
-        vars = set(vars)
-    if isinstance(obs_cols, str) or isinstance(obs_cols, list):  # pragma: no cover
-        obs_cols = set(obs_cols)
+def _validate_outlier_input(adata, obs_cols: Collection[str], vars: Collection[str]) -> tuple[set[str], set[str]]:
+    """Validates the obs/var columns for outlier preprocessing."""
+    vars = set(vars) if vars else set()
+    obs_cols = set(obs_cols) if obs_cols else set()

     if vars is not None:
         diff = vars - set(adata.var_names)
diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py
index f7681ba1..204c7e30 100644
--- a/ehrapy/preprocessing/_quality_control.py
+++ b/ehrapy/preprocessing/_quality_control.py
@@ -1,7 +1,6 @@
 from __future__ import annotations

 import copy
-from collections.abc import Iterable
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal

@@ -19,7 +18,7 @@


 def qc_metrics(
-    adata: AnnData, qc_vars: Collection[str] = (), layer: str = None, inplace: bool = True
+    adata: AnnData, qc_vars: Collection[str] = (), layer: str = None
 ) -> tuple[pd.DataFrame, pd.DataFrame] | None:
     """Calculates various quality control metrics.

@@ -30,7 +29,6 @@
         adata: Annotated data matrix.
         qc_vars: Optional List of vars to calculate additional metrics for.
         layer: Layer to use to calculate the metrics.
-        inplace: Whether to add the metrics to obs/var or to solely return a Pandas DataFrame.

     Returns:
         Two Pandas DataFrames of all calculated QC metrics for `obs` and `var` respectively.
@@ -59,9 +57,8 @@
     obs_metrics = _obs_qc_metrics(adata, layer, qc_vars)
     var_metrics = _var_qc_metrics(adata, layer)

-    if inplace:
-        adata.obs[obs_metrics.columns] = obs_metrics
-        adata.var[var_metrics.columns] = var_metrics
+    adata.obs[obs_metrics.columns] = obs_metrics
+    adata.var[var_metrics.columns] = var_metrics

     return obs_metrics, var_metrics
@@ -134,18 +131,7 @@
     return obs_metrics


-def _var_qc_metrics(adata: AnnData, layer: str = None) -> pd.DataFrame:
-    """Calculates quality control metrics for features.
-
-    See :func:`~ehrapy.preprocessing._quality_control.calculate_qc_metrics` for a list of calculated metrics.
-
-    Args:
-        adata: Annotated data matrix.
-        layer: Layer containing the matrix to calculate the metrics for.
-
-    Returns:
-        Pandas DataFrame with the calculated metrics.
-    """
+def _var_qc_metrics(adata: AnnData, layer: str | None = None) -> pd.DataFrame:
     var_metrics = pd.DataFrame(index=adata.var_names)
     mtx = adata.X if layer is None else adata.layers[layer]
     categorical_indices = np.ndarray([0], dtype=int)
@@ -189,8 +175,21 @@
             var_metrics.loc[non_categorical_indices, "max"] = np.nanmax(
                 np.array(mtx[:, non_categorical_indices], dtype=np.float64), axis=0
             )
+
+            # Calculate the IQR and flag IQR outliers
+            q1 = np.nanpercentile(mtx[:, non_categorical_indices], 25, axis=0)
+            q3 = np.nanpercentile(mtx[:, non_categorical_indices], 75, axis=0)
+            iqr = q3 - q1
+            lower_bound = q1 - 1.5 * iqr
+            upper_bound = q3 + 1.5 * iqr
+            var_metrics.loc[non_categorical_indices, "iqr_outliers"] = (
+                (mtx[:, non_categorical_indices] < lower_bound) | (mtx[:, non_categorical_indices] > upper_bound)
+            ).any(axis=0)
+            # Fill the categorical vars with False, because otherwise we get an object dtype Series, which h5py cannot save
+            var_metrics["iqr_outliers"] = var_metrics["iqr_outliers"].fillna(False).astype(bool)
         except (TypeError, ValueError):
-            print("[bold yellow]TypeError! Setting quality control metrics to nan. Did you encode your data?")
+            # We assume that the data simply hasn't been encoded yet
+            pass

     return var_metrics
diff --git a/ehrapy/tools/_sa.py b/ehrapy/tools/_sa.py
index 49949232..56420275 100644
--- a/ehrapy/tools/_sa.py
+++ b/ehrapy/tools/_sa.py
@@ -282,7 +282,7 @@ def _regression_model(
     df = df.dropna()

     if not accept_zero_duration:
-        df[duration_col][df[duration_col] == 0] += 1e-5
+        df.loc[df[duration_col] == 0, duration_col] += 1e-5

     model = model_class()
     model.fit(df, duration_col, event_col, entry_col=entry_col)
diff --git a/tests/conftest.py b/tests/conftest.py
index 941b83ce..2a410ff9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -24,6 +24,15 @@ def rng():
     return np.random.default_rng()


+@pytest.fixture
+def mimic_2_10():
+    import ehrapy as ep
+
+    mimic_2_10 = ep.dt.mimic_2()[:10]
+
+    return mimic_2_10
+
+
 @pytest.fixture
 def mar_adata(rng) -> AnnData:
     """Generate MAR data using dependent columns."""
diff --git a/tests/preprocessing/test_outliers.py b/tests/preprocessing/test_outliers.py
index 1b7eb249..6f8a584f 100644
--- a/tests/preprocessing/test_outliers.py
+++ b/tests/preprocessing/test_outliers.py
@@ -1,20 +1,7 @@
-from pathlib import Path
-
 import numpy as np
-import pytest

 import ehrapy as ep

-CURRENT_DIR = Path(__file__).parent
-_TEST_PATH = f"{CURRENT_DIR}/test_preprocessing"
-
-
-@pytest.fixture
-def mimic_2_10():
-    mimic_2_10 = ep.dt.mimic_2()[:10]
-
-    return mimic_2_10
-

 def test_winsorize_var(mimic_2_10):
     winsorized_adata = ep.pp.winsorize(mimic_2_10, vars=["age"], limits=[0.2, 0.2], copy=True)
@@ -36,7 +23,7 @@ def test_winsorized_obs(mimic_2_10):


 def test_clip_var(mimic_2_10):
-    clipped_adata = ep.pp.clip_quantile(mimic_2_10, vars=["age"], limits=[25, 50], copy=True)
+    clipped_adata = ep.pp.clip_quantile(mimic_2_10, vars=["age"], limits=(25, 50), copy=True)
     expected = np.array([50, 50, 36.5, 44.49191, 25, 36.54657, 25, 50, 50, 25.41667]).reshape((10, 1))

     np.testing.assert_allclose(np.array(clipped_adata[:, "age"].X, dtype=np.float32), expected)
@@ -44,7 +31,7 @@

 def test_clip_obs(mimic_2_10):
     to_clip_obs = ep.ad.move_to_obs(mimic_2_10, "age")
-    clipped_adata = ep.pp.clip_quantile(to_clip_obs, obs_cols=["age"], limits=[25, 50], copy=True)
+    clipped_adata = ep.pp.clip_quantile(to_clip_obs, obs_cols=["age"], limits=(25, 50), copy=True)
     expected = np.array([50, 50, 36.5, 44.49191, 25, 36.54657, 25, 50, 50, 25.41667])

     np.testing.assert_allclose(np.array(clipped_adata.obs["age"]), expected)
diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py
index 68c70a92..7331d7ee 100644
--- a/tests/preprocessing/test_quality_control.py
+++ b/tests/preprocessing/test_quality_control.py
@@ -79,6 +79,7 @@ def test_var_qc_metrics(missing_values_adata):
     assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True)
     assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True)
     assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True)
+    assert (~var_metrics["iqr_outliers"]).all()


 def test_obs_nan_qc_metrics():
@@ -102,7 +103,7 @@ def test_var_nan_qc_metrics():


 def test_calculate_qc_metrics(missing_values_adata):
-    obs_metrics, var_metrics = ep.pp.qc_metrics(missing_values_adata, inplace=True)
+    obs_metrics, var_metrics = ep.pp.qc_metrics(missing_values_adata)
     assert obs_metrics is not None
     assert var_metrics is not None