More sa functions #664

Merged
merged 11 commits on Mar 12, 2024
5 changes: 5 additions & 0 deletions docs/usage/usage.md
@@ -241,6 +241,11 @@ In contrast to a preprocessing function, a tool usually adds an easily interpret
tools.test_kmf_logrank
tools.test_nested_f_statistic
tools.cox_ph
tools.weibull_aft
tools.log_rogistic_aft
tools.nelson_alen
tools.weibull

```

### Causal Inference
14 changes: 13 additions & 1 deletion ehrapy/tools/__init__.py
@@ -1,4 +1,16 @@
from ehrapy.tools._sa import anova_glm, cox_ph, glm, kmf, ols, test_kmf_logrank, test_nested_f_statistic
from ehrapy.tools._sa import (
anova_glm,
cox_ph,
glm,
kmf,
log_rogistic_aft,
nelson_alen,
ols,
test_kmf_logrank,
test_nested_f_statistic,
weibull,
weibull_aft,
)
from ehrapy.tools._scanpy_tl_api import * # noqa: F403
from ehrapy.tools.causal._dowhy import causal_inference
from ehrapy.tools.feature_ranking._rank_features_groups import filter_rank_features_groups, rank_features_groups
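The newly exported functions become available under `ep.tl`. A minimal usage sketch, assuming the MIMIC-II demo dataset used in the docstring examples in `_sa.py` below:

```python
import numpy as np
import ehrapy as ep

adata = ep.dt.mimic_2(encoded=False)
# Flip 'censor_flg' because 0 = death and 1 = censored in this dataset
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)

# Regression models
aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg")
llf = ep.tl.log_rogistic_aft(adata, "mort_day_censored", "censor_flg")

# Univariate estimators
naf = ep.tl.nelson_alen(adata, "mort_day_censored", "censor_flg")
wf = ep.tl.weibull(adata, "mort_day_censored", "censor_flg")
```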
188 changes: 169 additions & 19 deletions ehrapy/tools/_sa.py
@@ -6,7 +6,14 @@
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines import (
CoxPHFitter,
KaplanMeierFitter,
LogLogisticAFTFitter,
NelsonAalenFitter,
WeibullAFTFitter,
WeibullFitter,
)
from lifelines.statistics import StatisticalResult, logrank_test
from scipy import stats

@@ -221,13 +228,13 @@ def test_kmf_logrank(


def test_nested_f_statistic(small_model: GLMResultsWrapper, big_model: GLMResultsWrapper) -> float:
"""Given two fitted GLMs, the larger of which contains the parameter space of the smaller, return the P value corresponding to the larger model adding explanatory power.
""" "Calculate the P value indicating if a larger GLM, encompassing a smaller GLM's parameters, adds explanatory power."

See https://stackoverflow.com/questions/27328623/anova-test-for-glm-in-python/60769343#60769343

Args:
small_model (GLMResultsWrapper): fitted generalized linear models.
big_model (GLMResultsWrapper): fitted generalized linear models.
small_model: Fitted generalized linear model.
big_model: Fitted generalized linear model containing the parameter space of the smaller model.

Returns:
float: p_value
@@ -245,10 +252,10 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
"""Anova table for two fitted generalized linear models.

Args:
result_1 (GLMResultsWrapper): fitted generalized linear models.
result_2 (GLMResultsWrapper): fitted generalized linear models.
formula_1 (str): The formula specifying the model.
formula_2 (str): The formula specifying the model.
result_1: First fitted generalized linear model.
result_2: Second fitted generalized linear model.
formula_1: The formula specifying the first model.
formula_2: The formula specifying the second model.

Returns:
pd.DataFrame: Anova table.
@@ -267,6 +274,25 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
return dataframe


def _regression_model(
model_class, adata: AnnData, duration_col: str, event_col: str, entry_col: str = None, accept_zero_duration=True
):
"""Convenience function for regression models."""
df = anndata_to_df(adata)

keys = [duration_col, event_col]
if entry_col:
keys.append(entry_col)
df = df[keys]
if not accept_zero_duration:
df.loc[df[duration_col] == 0, duration_col] += 1e-5

model = model_class()
model.fit(df, duration_col, event_col, entry_col=entry_col)

return model

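As a rough sketch of what this helper amounts to for, say, `WeibullAFTFitter` (the toy frame below is invented for illustration; in the real call it comes from `anndata_to_df(adata)` restricted to the duration/event columns):

```python
import pandas as pd
from lifelines import WeibullAFTFitter

# Toy stand-in for anndata_to_df(adata)[[duration_col, event_col]]
df = pd.DataFrame(
    {
        "mort_day_censored": [0.0, 3.0, 10.0, 25.0, 40.0, 60.0, 75.0, 90.0],
        "censor_flg": [1, 1, 0, 1, 0, 1, 1, 0],
    }
)

# Parametric models need strictly positive durations, hence accept_zero_duration=False
df.loc[df["mort_day_censored"] == 0, "mort_day_censored"] += 1e-5

model = WeibullAFTFitter()
model.fit(df, duration_col="mort_day_censored", event_col="censor_flg")
```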

def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> CoxPHFitter:
"""Fit the Cox’s proportional hazard for the survival function.

@@ -277,12 +303,13 @@ def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = N

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: the name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: the name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored.
entry_col: a column denoting when a subject entered the study, i.e. left-truncation.
duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: The name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.
entry_col: Column denoting when a subject entered the study, i.e. left-truncation.

Returns:
Fitted CoxPHFitter
Fitted CoxPHFitter.

Examples:
>>> import ehrapy as ep
@@ -291,12 +318,135 @@ def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = N
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
"""
return _regression_model(CoxPHFitter, adata, duration_col, event_col, entry_col)

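The wrapper returns a plain lifelines `CoxPHFitter`, so the usual lifelines accessors apply afterwards. A brief sketch, assuming `adata` and `ep` prepared as in the docstring example above:

```python
cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
cph.print_summary()                            # coefficients, p-values, confidence intervals
print(cph.baseline_cumulative_hazard_.head())  # baseline cumulative hazard over time
print(cph.hazard_ratios_)                      # exp(coef); empty here because no covariate columns are passed
```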

def weibull_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> WeibullAFTFitter:
"""Fit the Weibull accelerated failure time regression for the survival function.

The Weibull Accelerated Failure Time (AFT) survival regression model is a statistical method used to analyze time-to-event data,
where the underlying assumption is that the survival time follows a Weibull distribution.
It models the survival time as an exponential function of the predictors, assuming a specific shape parameter
for the distribution and allowing for accelerated or decelerated failure times based on the covariates.
See https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: Name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.
entry_col: Column denoting when a subject entered the study, i.e. left-truncation.

Returns:
Fitted WeibullAFTFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg")
"""
return _regression_model(WeibullAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False)

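To make the "accelerated or decelerated failure times" reading concrete, here is a sketch with one covariate on invented toy data (not part of this PR); `exp(coef)` is the acceleration factor:

```python
import pandas as pd
from lifelines import WeibullAFTFitter

df = pd.DataFrame(
    {
        "duration": [5, 8, 12, 20, 24, 30, 36, 42, 50, 60],
        "event":    [1, 1, 0, 1, 0, 1, 1, 0, 1, 0],
        "age":      [72, 68, 61, 66, 55, 58, 49, 52, 45, 41],
    }
)
aft = WeibullAFTFitter()
aft.fit(df, duration_col="duration", event_col="event")
# exp(coef) > 1 for a covariate means survival times are stretched (decelerated failure)
print(aft.summary[["coef", "exp(coef)"]])
```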

def log_rogistic_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> LogLogisticAFTFitter:
"""Fit the log logistic accelerated failure time regression for the survival function.
The Log-Logistic Accelerated Failure Time (AFT) survival regression model is a powerful statistical tool employed in the analysis of time-to-event data.
This model operates under the assumption that the logarithm of survival time adheres to a log-logistic distribution, offering a flexible framework for understanding the impact of covariates on survival times.
By modeling survival time as a function of predictors, the Log-Logistic AFT model enables researchers to explore
how specific factors influence the acceleration or deceleration of failure times, providing valuable insights into the underlying mechanisms driving event occurrence.
See https://lifelines.readthedocs.io/en/latest/fitters/regression/LogLogisticAFTFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: Name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.
entry_col: Column denoting when a subject entered the study, i.e. left-truncation.

Returns:
Fitted LogLogisticAFTFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> llf = ep.tl.log_rogistic_aft(adata, "mort_day_censored", "censor_flg")
"""
return _regression_model(
LogLogisticAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False
)

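Both AFT fitters expose lifelines' information criteria, which gives a simple way to choose between the two parametric families. A sketch, assuming `adata` and `ep` prepared as in the examples above:

```python
aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg")
llf = ep.tl.log_rogistic_aft(adata, "mort_day_censored", "censor_flg")

# Lower AIC suggests the better-fitting parametric family
print("Weibull AFT:      ", aft.AIC_)
print("Log-logistic AFT: ", llf.AIC_)
```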

def _univariate_model(adata: AnnData, duration_col: str, event_col: str, model_class, accept_zero_duration=True):
"""Convenience function for univariate models."""
df = anndata_to_df(adata)
keys = [duration_col, event_col]
if entry_col:
keys.append(entry_col)
df = df[keys]
cph = CoxPHFitter()
cph.fit(df, duration_col, event_col, entry_col=entry_col)

return cph
if not accept_zero_duration:
df[duration_col][df[duration_col] == 0] += 1e-5
T = df[duration_col]
E = df[event_col]

model = model_class()
model.fit(T, event_observed=E)

return model

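The univariate helper reduces to the standard lifelines pattern of fitting on a duration vector and an event indicator; a minimal sketch with invented arrays:

```python
from lifelines import NelsonAalenFitter

# Toy stand-ins for df[duration_col] and df[event_col]
durations = [6, 7, 10, 15, 19, 25, 30, 34]
events = [1, 0, 1, 1, 0, 1, 0, 1]

naf = NelsonAalenFitter()
naf.fit(durations, event_observed=events)
print(naf.cumulative_hazard_.tail())
```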

def nelson_alen(adata: AnnData, duration_col: str, event_col: str) -> NelsonAalenFitter:
"""Employ the Nelson-Aalen estimator to estimate the cumulative hazard function from censored survival data

The Nelson-Aalen estimator is a non-parametric method used in survival analysis to estimate the cumulative hazard function.
This technique is particularly useful when dealing with censored data, as it accounts for the presence of individuals whose event times are unknown due to censoring.
By estimating the cumulative hazard function, the Nelson-Aalen estimator allows researchers to assess the risk of an event occurring over time, providing valuable insights into the underlying dynamics of the survival process.
See https://lifelines.readthedocs.io/en/latest/fitters/univariate/NelsonAalenFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: The name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.

Returns:
Fitted NelsonAalenFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> naf = ep.tl.nelson_alen(adata, "mort_day_censored", "censor_flg")
"""
return _univariate_model(adata, duration_col, event_col, NelsonAalenFitter)

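A short sketch of reading the estimate off the returned fitter, assuming `adata` and `ep` prepared as in the docstring example:

```python
naf = ep.tl.nelson_alen(adata, "mort_day_censored", "censor_flg")
print(naf.cumulative_hazard_.head())  # step-wise estimate of the cumulative hazard H(t)
naf.plot()                            # lifelines plots the cumulative hazard for this fitter
```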

def weibull(adata: AnnData, duration_col: str, event_col: str) -> WeibullFitter:
"""Employ the Weibull model in univariate survival analysis to understand event occurrence dynamics.

In contrast to the non-parametric Nelson-Aalen estimator, the Weibull model employs a parametric approach with shape and scale parameters,
enabling a more structured analysis of survival data.
This technique is particularly useful when dealing with censored data, as it accounts for the presence of individuals whose event times are unknown due to censoring.
By fitting the Weibull model to censored survival data, researchers can estimate these parameters and gain insights
into the hazard rate over time, facilitating comparisons between different groups or treatments.
This method provides a comprehensive framework for examining survival data and offers valuable insights into the factors influencing event occurrence dynamics.
See https://lifelines.readthedocs.io/en/latest/fitters/univariate/WeibullFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: Name of the column in the AnnData object that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.

Returns:
Fitted WeibullFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> wf = ep.tl.weibull(adata, "mort_day_censored", "censor_flg")
"""
return _univariate_model(adata, duration_col, event_col, WeibullFitter, accept_zero_duration=False)
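The returned `WeibullFitter` exposes the scale and shape parameters directly; a brief sketch, again assuming the MIMIC-II setup from the example:

```python
wf = ep.tl.weibull(adata, "mort_day_censored", "censor_flg")
print(wf.lambda_, wf.rho_)         # scale and shape; rho_ > 1 implies a hazard increasing over time
print(wf.median_survival_time_)
```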
63 changes: 46 additions & 17 deletions tests/tools/test_sa.py
@@ -1,11 +1,27 @@
import numpy as np
import pytest
import statsmodels
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines import (
CoxPHFitter,
KaplanMeierFitter,
LogLogisticAFTFitter,
NelsonAalenFitter,
WeibullAFTFitter,
WeibullFitter,
)

import ehrapy as ep


@pytest.fixture
def mimic_2_sa():
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
duration_col, event_col = "mort_day_censored", "censor_flg"

return adata, duration_col, event_col


class TestSA:
def test_ols(self):
adata = ep.dt.mimic_2(encoded=False)
@@ -30,15 +46,6 @@ def test_glm(self):
assert 5.778006344870297 == pytest.approx(Intercept)
assert -0.06523274132877163 == pytest.approx(age)

def test_kmf(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
kmf = ep.tl.kmf(adata[:, ["mort_day_censored"]].X, adata[:, ["censor_flg"]].X)

assert isinstance(kmf, KaplanMeierFitter)
assert len(kmf.durations) == 1776
assert sum(kmf.event_observed) == 497

@pytest.mark.parametrize("weightings", ["wilcoxon", "tarone-ware", "peto", "fleming-harrington"])
def test_calculate_logrank_pvalue(self, weightings):
durations_A = [1, 2, 3]
@@ -76,11 +83,33 @@ def test_anova_glm(self):
assert dataframe.iloc[1, 4] == 2
assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185

def test_cox_ph(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")

assert isinstance(cph, CoxPHFitter)
assert len(cph.durations) == 1776
assert sum(cph.event_observed) == 497

def _sa_function_assert(self, model, model_class):
assert isinstance(model, model_class)
assert len(model.durations) == 1776
assert sum(model.event_observed) == 497

def _sa_func_test(self, sa_function, sa_class, mimic_2_sa):
adata, duration_col, event_col = mimic_2_sa

sa = sa_function(adata, duration_col, event_col)
self._sa_function_assert(sa, sa_class)

def test_kmf(self, mimic_2_sa):
adata, _, _ = mimic_2_sa
kmf = ep.tl.kmf(adata[:, ["mort_day_censored"]].X, adata[:, ["censor_flg"]].X)
self._sa_function_assert(kmf, KaplanMeierFitter)

def test_cox_ph(self, mimic_2_sa):
self._sa_func_test(ep.tl.cox_ph, CoxPHFitter, mimic_2_sa)

def test_nelson_alen(self, mimic_2_sa):
self._sa_func_test(ep.tl.nelson_alen, NelsonAalenFitter, mimic_2_sa)

def test_weibull(self, mimic_2_sa):
self._sa_func_test(ep.tl.weibull, WeibullFitter, mimic_2_sa)

def test_weibull_aft(self, mimic_2_sa):
self._sa_func_test(ep.tl.weibull_aft, WeibullAFTFitter, mimic_2_sa)

def test_log_logistic(self, mimic_2_sa):
self._sa_func_test(ep.tl.log_rogistic_aft, LogLogisticAFTFitter, mimic_2_sa)