More sa functions #664

Merged
merged 11 commits on Mar 12, 2024
5 changes: 5 additions & 0 deletions docs/usage/usage.md
@@ -241,6 +241,11 @@ In contrast to a preprocessing function, a tool usually adds an easily interpret
tools.test_kmf_logrank
tools.test_nested_f_statistic
tools.cox_ph
tools.weibull_aft
tools.log_rogistic_aft
tools.nelson_alen
tools.weibull

```

### Causal Inference
14 changes: 13 additions & 1 deletion ehrapy/tools/__init__.py
@@ -1,4 +1,16 @@
from ehrapy.tools._sa import anova_glm, cox_ph, glm, kmf, ols, test_kmf_logrank, test_nested_f_statistic
from ehrapy.tools._sa import (
anova_glm,
cox_ph,
glm,
kmf,
log_rogistic_aft,
nelson_alen,
ols,
test_kmf_logrank,
test_nested_f_statistic,
weibull,
weibull_aft,
)
from ehrapy.tools._scanpy_tl_api import * # noqa: F403
from ehrapy.tools.causal._dowhy import causal_inference
from ehrapy.tools.feature_ranking._rank_features_groups import filter_rank_features_groups, rank_features_groups
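The newly exported functions become available under `ep.tl`. A minimal usage sketch, assuming the MIMIC-II demo dataset used in the docstring examples in `_sa.py` below:

```python
import numpy as np
import ehrapy as ep

adata = ep.dt.mimic_2(encoded=False)
# Flip 'censor_flg' because 0 = death and 1 = censored in this dataset
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)

# Regression models
aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg")
llf = ep.tl.log_rogistic_aft(adata, "mort_day_censored", "censor_flg")

# Univariate estimators
naf = ep.tl.nelson_alen(adata, "mort_day_censored", "censor_flg")
wf = ep.tl.weibull(adata, "mort_day_censored", "censor_flg")
```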
188 changes: 169 additions & 19 deletions ehrapy/tools/_sa.py
@@ -6,7 +6,14 @@
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines import (
CoxPHFitter,
KaplanMeierFitter,
LogLogisticAFTFitter,
NelsonAalenFitter,
WeibullAFTFitter,
WeibullFitter,
)
from lifelines.statistics import StatisticalResult, logrank_test
from scipy import stats

@@ -221,13 +228,13 @@ def test_kmf_logrank(


def test_nested_f_statistic(small_model: GLMResultsWrapper, big_model: GLMResultsWrapper) -> float:
"""Given two fitted GLMs, the larger of which contains the parameter space of the smaller, return the P value corresponding to the larger model adding explanatory power.
""" "Calculate the P value indicating if a larger GLM, encompassing a smaller GLM's parameters, adds explanatory power."

See https://stackoverflow.com/questions/27328623/anova-test-for-glm-in-python/60769343#60769343

Args:
small_model (GLMResultsWrapper): fitted generalized linear models.
big_model (GLMResultsWrapper): fitted generalized linear models.
small_model: Fitted generalized linear model.
big_model: Fitted generalized linear model containing the parameter space of the smaller model.

Returns:
float: p_value
@@ -245,10 +252,10 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
"""Anova table for two fitted generalized linear models.

Args:
result_1 (GLMResultsWrapper): fitted generalized linear models.
result_2 (GLMResultsWrapper): fitted generalized linear models.
formula_1 (str): The formula specifying the model.
formula_2 (str): The formula specifying the model.
result_1: First fitted generalized linear model.
result_2: Second fitted generalized linear model.
formula_1: The formula specifying the first model.
formula_2: The formula specifying the second model.

Returns:
pd.DataFrame: Anova table.
@@ -267,6 +274,25 @@ def anova_glm(result_1: GLMResultsWrapper, result_2: GLMResultsWrapper, formula_
return dataframe


def _regression_model(
model_class, adata: AnnData, duration_col: str, event_col: str, entry_col: str = None, accept_zero_duration=True
):
"""Convenience function for regression models."""
df = anndata_to_df(adata)

keys = [duration_col, event_col]
if entry_col:
keys.append(entry_col)
df = df[keys]
if not accept_zero_duration:
df.loc[df[duration_col] == 0, duration_col] += 1e-5

model = model_class()
model.fit(df, duration_col, event_col, entry_col=entry_col)

return model

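As a rough sketch of what this helper amounts to for, say, `WeibullAFTFitter` (the toy frame below is invented for illustration; in the real call it comes from `anndata_to_df(adata)` restricted to the duration/event columns):

```python
import pandas as pd
from lifelines import WeibullAFTFitter

# Toy stand-in for anndata_to_df(adata)[[duration_col, event_col]]
df = pd.DataFrame(
    {
        "mort_day_censored": [0.0, 3.0, 10.0, 25.0, 40.0, 60.0, 75.0, 90.0],
        "censor_flg": [1, 1, 0, 1, 0, 1, 1, 0],
    }
)

# Parametric models need strictly positive durations, hence accept_zero_duration=False
df.loc[df["mort_day_censored"] == 0, "mort_day_censored"] += 1e-5

model = WeibullAFTFitter()
model.fit(df, duration_col="mort_day_censored", event_col="censor_flg")
```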

def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> CoxPHFitter:
"""Fit the Cox’s proportional hazard for the survival function.

@@ -277,12 +303,13 @@ def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = N

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: the name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: the name of the column in anndata that contains the subjects’ death observation. If left as None, assume all individuals are uncensored.
entry_col: a column denoting when a subject entered the study, i.e. left-truncation.
duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: The name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.
entry_col: Column denoting when a subject entered the study, i.e. left-truncation.

Returns:
Fitted CoxPHFitter
Fitted CoxPHFitter.

Examples:
>>> import ehrapy as ep
@@ -291,12 +318,135 @@ def cox_ph(adata: AnnData, duration_col: str, event_col: str, entry_col: str = N
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
"""
return _regression_model(CoxPHFitter, adata, duration_col, event_col, entry_col)

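The wrapper returns a plain lifelines `CoxPHFitter`, so the usual lifelines accessors apply afterwards. A brief sketch, assuming `adata` and `ep` prepared as in the docstring example above:

```python
cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")
cph.print_summary()                            # coefficients, p-values, confidence intervals
print(cph.baseline_cumulative_hazard_.head())  # baseline cumulative hazard over time
print(cph.hazard_ratios_)                      # exp(coef); empty here because no covariate columns are passed
```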

def weibull_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> WeibullAFTFitter:
"""Fit the Weibull accelerated failure time regression for the survival function.

The Weibull Accelerated Failure Time (AFT) survival regression model is a statistical method used to analyze time-to-event data,
where the underlying assumption is that the survival time follows a Weibull distribution.
It models the survival time as an exponential function of the predictors, assuming a specific shape parameter
for the distribution and allowing for accelerated or decelerated failure times based on the covariates.
See https://lifelines.readthedocs.io/en/latest/fitters/regression/WeibullAFTFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: Name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.
entry_col: Column denoting when a subject entered the study, i.e. left-truncation.

Returns:
Fitted WeibullAFTFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg")
"""
return _regression_model(WeibullAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False)

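To make the "accelerated or decelerated failure times" reading concrete, here is a sketch with one covariate on invented toy data (not part of this PR); `exp(coef)` is the acceleration factor:

```python
import pandas as pd
from lifelines import WeibullAFTFitter

df = pd.DataFrame(
    {
        "duration": [5, 8, 12, 20, 24, 30, 36, 42, 50, 60],
        "event":    [1, 1, 0, 1, 0, 1, 1, 0, 1, 0],
        "age":      [72, 68, 61, 66, 55, 58, 49, 52, 45, 41],
    }
)
aft = WeibullAFTFitter()
aft.fit(df, duration_col="duration", event_col="event")
# exp(coef) > 1 for a covariate means survival times are stretched (decelerated failure)
print(aft.summary[["coef", "exp(coef)"]])
```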

def log_rogistic_aft(adata: AnnData, duration_col: str, event_col: str, entry_col: str = None) -> LogLogisticAFTFitter:
"""Fit the log logistic accelerated failure time regression for the survival function.
The Log-Logistic Accelerated Failure Time (AFT) survival regression model is a powerful statistical tool employed in the analysis of time-to-event data.
This model operates under the assumption that the logarithm of survival time adheres to a log-logistic distribution, offering a flexible framework for understanding the impact of covariates on survival times.
By modeling survival time as a function of predictors, the Log-Logistic AFT model enables researchers to explore
how specific factors influence the acceleration or deceleration of failure times, providing valuable insights into the underlying mechanisms driving event occurrence.
See https://lifelines.readthedocs.io/en/latest/fitters/regression/LogLogisticAFTFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: Name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.
entry_col: Column denoting when a subject entered the study, i.e. left-truncation.

Returns:
Fitted LogLogisticAFTFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> llf = ep.tl.log_rogistic_aft(adata, "mort_day_censored", "censor_flg")
"""
return _regression_model(
LogLogisticAFTFitter, adata, duration_col, event_col, entry_col, accept_zero_duration=False
)

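Both AFT fitters expose lifelines' information criteria, which gives a simple way to choose between the two parametric families. A sketch, assuming `adata` and `ep` prepared as in the examples above:

```python
aft = ep.tl.weibull_aft(adata, "mort_day_censored", "censor_flg")
llf = ep.tl.log_rogistic_aft(adata, "mort_day_censored", "censor_flg")

# Lower AIC suggests the better-fitting parametric family
print("Weibull AFT:      ", aft.AIC_)
print("Log-logistic AFT: ", llf.AIC_)
```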

def _univariate_model(adata: AnnData, duration_col: str, event_col: str, model_class, accept_zero_duration=True):
"""Convenience function for univariate models."""
df = anndata_to_df(adata)
keys = [duration_col, event_col]
if entry_col:
keys.append(entry_col)
df = df[keys]
cph = CoxPHFitter()
cph.fit(df, duration_col, event_col, entry_col=entry_col)

return cph
if not accept_zero_duration:
df[duration_col][df[duration_col] == 0] += 1e-5
T = df[duration_col]
E = df[event_col]

model = model_class()
model.fit(T, event_observed=E)

return model

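The univariate helper reduces to the standard lifelines pattern of fitting on a duration vector and an event indicator; a minimal sketch with invented arrays:

```python
from lifelines import NelsonAalenFitter

# Toy stand-ins for df[duration_col] and df[event_col]
durations = [6, 7, 10, 15, 19, 25, 30, 34]
events = [1, 0, 1, 1, 0, 1, 0, 1]

naf = NelsonAalenFitter()
naf.fit(durations, event_observed=events)
print(naf.cumulative_hazard_.tail())
```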

def nelson_alen(adata: AnnData, duration_col: str, event_col: str) -> NelsonAalenFitter:
"""Employ the Nelson-Aalen estimator to estimate the cumulative hazard function from censored survival data

The Nelson-Aalen estimator is a non-parametric method used in survival analysis to estimate the cumulative hazard function.
This technique is particularly useful when dealing with censored data, as it accounts for the presence of individuals whose event times are unknown due to censoring.
By estimating the cumulative hazard function, the Nelson-Aalen estimator allows researchers to assess the risk of an event occurring over time, providing valuable insights into the underlying dynamics of the survival process.
See https://lifelines.readthedocs.io/en/latest/fitters/univariate/NelsonAalenFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: The name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: The name of the column in anndata that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.

Returns:
Fitted NelsonAalenFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> naf = ep.tl.nelson_alen(adata, "mort_day_censored", "censor_flg")
"""
return _univariate_model(adata, duration_col, event_col, NelsonAalenFitter)

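A short sketch of reading the estimate off the returned fitter, assuming `adata` and `ep` prepared as in the docstring example:

```python
naf = ep.tl.nelson_alen(adata, "mort_day_censored", "censor_flg")
print(naf.cumulative_hazard_.head())  # step-wise estimate of the cumulative hazard H(t)
naf.plot()                            # lifelines plots the cumulative hazard for this fitter
```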

def weibull(adata: AnnData, duration_col: str, event_col: str) -> WeibullFitter:
"""Employ the Weibull model in univariate survival analysis to understand event occurrence dynamics.

In contrast to the non-parametric Nelson-Aalen estimator, the Weibull model employs a parametric approach with shape and scale parameters,
enabling a more structured analysis of survival data.
This technique is particularly useful when dealing with censored data, as it accounts for the presence of individuals whose event times are unknown due to censoring.
By fitting the Weibull model to censored survival data, researchers can estimate these parameters and gain insights
into the hazard rate over time, facilitating comparisons between different groups or treatments.
This method provides a comprehensive framework for examining survival data and offers valuable insights into the factors influencing event occurrence dynamics.
See https://lifelines.readthedocs.io/en/latest/fitters/univariate/WeibullFitter.html

Args:
adata: AnnData object with necessary columns `duration_col` and `event_col`.
duration_col: Name of the column in the AnnData objects that contains the subjects’ lifetimes.
event_col: Name of the column in the AnnData object that contains the subjects’ death observation.
If left as None, assume all individuals are uncensored.

Returns:
Fitted WeibullFitter.

Examples:
>>> import ehrapy as ep
>>> adata = ep.dt.mimic_2(encoded=False)
>>> # Flip 'censor_flg' because 0 = death and 1 = censored
>>> adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
>>> wf = ep.tl.weibull(adata, "mort_day_censored", "censor_flg")
"""
return _univariate_model(adata, duration_col, event_col, WeibullFitter, accept_zero_duration=False)
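The returned `WeibullFitter` exposes the scale and shape parameters directly; a brief sketch, again assuming the MIMIC-II setup from the example:

```python
wf = ep.tl.weibull(adata, "mort_day_censored", "censor_flg")
print(wf.lambda_, wf.rho_)         # scale and shape; rho_ > 1 implies a hazard increasing over time
print(wf.median_survival_time_)
```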
63 changes: 46 additions & 17 deletions tests/tools/test_sa.py
@@ -1,11 +1,27 @@
import numpy as np
import pytest
import statsmodels
from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines import (
CoxPHFitter,
KaplanMeierFitter,
LogLogisticAFTFitter,
NelsonAalenFitter,
WeibullAFTFitter,
WeibullFitter,
)

import ehrapy as ep


@pytest.fixture
def mimic_2_sa():
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
duration_col, event_col = "mort_day_censored", "censor_flg"

return adata, duration_col, event_col


class TestSA:
def test_ols(self):
adata = ep.dt.mimic_2(encoded=False)
@@ -30,15 +46,6 @@ def test_glm(self):
assert 5.778006344870297 == pytest.approx(Intercept)
assert -0.06523274132877163 == pytest.approx(age)

def test_kmf(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
kmf = ep.tl.kmf(adata[:, ["mort_day_censored"]].X, adata[:, ["censor_flg"]].X)

assert isinstance(kmf, KaplanMeierFitter)
assert len(kmf.durations) == 1776
assert sum(kmf.event_observed) == 497

@pytest.mark.parametrize("weightings", ["wilcoxon", "tarone-ware", "peto", "fleming-harrington"])
def test_calculate_logrank_pvalue(self, weightings):
durations_A = [1, 2, 3]
@@ -76,11 +83,33 @@ def test_anova_glm(self):
assert dataframe.iloc[1, 4] == 2
assert pytest.approx(dataframe.iloc[1, 5], 0.1) == 0.103185

def test_cox_ph(self):
adata = ep.dt.mimic_2(encoded=False)
adata[:, ["censor_flg"]].X = np.where(adata[:, ["censor_flg"]].X == 0, 1, 0)
cph = ep.tl.cox_ph(adata, "mort_day_censored", "censor_flg")

assert isinstance(cph, CoxPHFitter)
assert len(cph.durations) == 1776
assert sum(cph.event_observed) == 497

def _sa_function_assert(self, model, model_class):
assert isinstance(model, model_class)
assert len(model.durations) == 1776
assert sum(model.event_observed) == 497

def _sa_func_test(self, sa_function, sa_class, mimic_2_sa):
adata, duration_col, event_col = mimic_2_sa

sa = sa_function(adata, duration_col, event_col)
self._sa_function_assert(sa, sa_class)

def test_kmf(self, mimic_2_sa):
adata, _, _ = mimic_2_sa
kmf = ep.tl.kmf(adata[:, ["mort_day_censored"]].X, adata[:, ["censor_flg"]].X)
self._sa_function_assert(kmf, KaplanMeierFitter)

def test_cox_ph(self, mimic_2_sa):
self._sa_func_test(ep.tl.cox_ph, CoxPHFitter, mimic_2_sa)

def test_nelson_alen(self, mimic_2_sa):
self._sa_func_test(ep.tl.nelson_alen, NelsonAalenFitter, mimic_2_sa)

def test_weibull(self, mimic_2_sa):
self._sa_func_test(ep.tl.weibull, WeibullFitter, mimic_2_sa)

def test_weibull_aft(self, mimic_2_sa):
self._sa_func_test(ep.tl.weibull_aft, WeibullAFTFitter, mimic_2_sa)

def test_log_logistic(self, mimic_2_sa):
self._sa_func_test(ep.tl.log_rogistic_aft, LogLogisticAFTFitter, mimic_2_sa)