
Commit d4100b1

Move calibrate() inside the PropensityModel class (#839)
* move calibrate() inside the PropensityModel class
* move output clipping out of compute_propensity_score()
* temporarily pin scipy to be < 1.16.0
* replace sklearn.base.BaseEstimator._validate_data with utils.validation.validate_data and pin sklearn to be >= 1.6.0
* update _check_sample_weight() input argument to reflect scikit-learn/scikit-learn#30908
1 parent 8216447 commit d4100b1
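
In short, calibration moves from a standalone helper into the propensity model itself. A minimal before/after sketch of the intended usage; the arrays X (features) and w (binary treatment) are illustrative synthetic data, not part of the commit:

import numpy as np
from causalml.propensity import ElasticNetPropensityModel

X = np.random.rand(1000, 5)           # illustrative feature matrix
w = np.random.binomial(1, 0.5, 1000)  # illustrative binary treatment vector

# Before this commit: calibrate scores with the module-level helper.
# from causalml.propensity import calibrate
# p_raw = ElasticNetPropensityModel().fit_predict(X, w)
# p = calibrate(p_raw, w)

# After this commit: calibration is a model option (on by default); fit()
# trains an IsotonicRegression calibrator inside the PropensityModel.
pm = ElasticNetPropensityModel(calibrate=True)
p = pm.fit_predict(X, w)  # calibrated and clipped to clip_bounds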

File tree

9 files changed: +332 −357 lines changed


causalml/inference/meta/tmle.py

Lines changed: 0 additions & 7 deletions
@@ -11,7 +11,6 @@
     check_p_conditions,
     convert_pd_to_np,
 )
-from causalml.propensity import calibrate


 logger = logging.getLogger("causalml")
@@ -105,7 +104,6 @@ def __init__(
         ate_alpha=0.05,
         control_name=0,
         cv=None,
-        calibrate_propensity=True,
     ):
         """Initialize a TMLE learner.

@@ -119,7 +117,6 @@ def __init__(
         self.ate_alpha = ate_alpha
         self.control_name = control_name
         self.cv = cv
-        self.calibrate_propensity = calibrate_propensity

     def __repr__(self):
         return "{}(model={}, cv={})".format(
@@ -165,10 +162,6 @@ def estimate_ate(self, X, treatment, y, p, segment=None, return_ci=False):
             w_group = (treatment == group).astype(int)
             p_group = p[group]

-            if self.calibrate_propensity:
-                logger.info("Calibrating propensity scores.")
-                p_group = calibrate(p_group, w_group)
-
             yhat_c = np.zeros_like(y, dtype=float)
             yhat_t = np.zeros_like(y, dtype=float)
             if self.cv:
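
With calibrate_propensity removed from TMLELearner, propensity scores are expected to arrive already calibrated, e.g. from a PropensityModel fitted with calibrate=True. A hedged sketch of that workflow; the outcome learner and synthetic data below are illustrative, not part of this commit:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from causalml.inference.meta import TMLELearner
from causalml.propensity import ElasticNetPropensityModel

X = np.random.rand(500, 4)           # illustrative features
w = np.random.binomial(1, 0.5, 500)  # illustrative binary treatment
y = np.random.rand(500)              # illustrative outcome

# Calibrate the propensity score upstream via the model's calibrate flag...
p = ElasticNetPropensityModel(calibrate=True).fit_predict(X, w)

# ...then pass the already-calibrated p to TMLELearner, which no longer
# takes a calibrate_propensity argument.
tmle = TMLELearner(GradientBoostingRegressor(), cv=None)
ate, ate_lb, ate_ub = tmle.estimate_ate(X=X, treatment=w, y=y, p=p)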

causalml/inference/tree/_tree/_classes.py

Lines changed: 5 additions & 3 deletions
@@ -40,6 +40,7 @@
     _check_sample_weight,
     assert_all_finite,
     check_is_fitted,
+    validate_data,
 )
 from . import _criterion, _splitter, _tree
 from ._criterion import Criterion
@@ -242,8 +243,8 @@ def _fit(
                 dtype=DTYPE, accept_sparse="csc", force_all_finite=False
             )
             check_y_params = dict(ensure_2d=False, dtype=None)
-            X, y = self._validate_data(
-                X, y, validate_separately=(check_X_params, check_y_params)
+            X, y = validate_data(
+                self, X, y, validate_separately=(check_X_params, check_y_params)
             )

         missing_values_in_feature_mask = (
@@ -479,7 +480,8 @@ def _validate_X_predict(self, X, check_input):
                 force_all_finite = "allow-nan"
             else:
                 force_all_finite = True
-            X = self._validate_data(
+            X = validate_data(
+                self,
                 X,
                 dtype=DTYPE,
                 accept_sparse="csr",
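
The pattern above is the scikit-learn >= 1.6 replacement for the estimator method BaseEstimator._validate_data, which newer releases deprecate: the module-level sklearn.utils.validation.validate_data takes the estimator as its first argument and otherwise accepts the same validation options. A minimal standalone sketch; MyEstimator is a hypothetical example, not part of causalml:

from sklearn.base import BaseEstimator
from sklearn.utils.validation import validate_data


class MyEstimator(BaseEstimator):
    def fit(self, X, y):
        # Old (deprecated) form:  X, y = self._validate_data(X, y, dtype="numeric")
        # New form: pass the estimator explicitly as the first argument.
        # validate_data also records n_features_in_ on the estimator,
        # as _validate_data did.
        X, y = validate_data(self, X, y, dtype="numeric")
        return self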

causalml/inference/tree/causal/_tree.py

Lines changed: 4 additions & 4 deletions
@@ -6,7 +6,7 @@
 import numpy as np
 from scipy.sparse import issparse
 from sklearn.utils import check_random_state
-from sklearn.utils.validation import _check_sample_weight
+from sklearn.utils.validation import _check_sample_weight, validate_data

 from .._tree._classes import DTYPE, DOUBLE, INT
 from .._tree._classes import SPARSE_SPLITTERS, DENSE_SPLITTERS
@@ -61,8 +61,8 @@ def fit(
             # csr.
             check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
             check_y_params = dict(ensure_2d=False, dtype=None)
-            X, y = self._validate_data(
-                X, y, validate_separately=(check_X_params, check_y_params)
+            X, y = validate_data(
+                self, X, y, validate_separately=(check_X_params, check_y_params)
             )
             if issparse(X):
                 X.sort_indices()
@@ -184,7 +184,7 @@ def fit(
             )

         if sample_weight is not None:
-            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=DOUBLE)

         if expanded_class_weight is not None:
             if sample_weight is not None:
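
The _check_sample_weight change simply passes dtype by keyword instead of positionally, matching the signature referenced in scikit-learn/scikit-learn#30908. A small sketch; note that _check_sample_weight is a private scikit-learn helper, so its signature may change between releases:

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.ones((4, 2))
sw = [1.0, 2.0, 0.5, 1.5]

# Old call:  _check_sample_weight(sw, X, np.float64)  # dtype passed positionally
sw = _check_sample_weight(sw, X, dtype=np.float64)    # dtype passed by keyword
print(sw.dtype)  # float64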

causalml/inference/tree/causal/causalforest.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,11 @@
 from warnings import catch_warnings, simplefilter, warn

 from sklearn.exceptions import DataConversionWarning
-from sklearn.utils.validation import check_random_state, _check_sample_weight
+from sklearn.utils.validation import (
+    check_random_state,
+    _check_sample_weight,
+    validate_data,
+)
 from sklearn.utils.multiclass import type_of_target
 from sklearn import __version__ as sklearn_version
 from sklearn.ensemble._forest import DOUBLE, DTYPE, MAX_INT
@@ -246,8 +250,8 @@ def _fit(
         # Validate or convert input data
         if issparse(y):
             raise ValueError("sparse multilabel-indicator for y is not supported.")
-        X, y = self._validate_data(
-            X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
+        X, y = validate_data(
+            self, X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
         )
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)

causalml/metrics/visualize.py

Lines changed: 2 additions & 12 deletions
@@ -342,7 +342,6 @@ def get_tmlegain(
     p_col="p",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
 ):
     """Get TMLE based average uplifts of model estimates of segments.
@@ -356,7 +355,6 @@ def get_tmlegain(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     Returns:
         (pandas.DataFrame): cumulative gains of model estimates based of TMLE
@@ -374,7 +372,7 @@ def get_tmlegain(
     inference_col = [x for x in inference_col if x in df.columns]

     # Initialize TMLE
-    tmle = TMLELearner(learner, cv=cv, calibrate_propensity=calibrate_propensity)
+    tmle = TMLELearner(learner, cv=cv)
     ate_all, ate_all_lb, ate_all_ub = tmle.estimate_ate(
         X=df[inference_col], p=df[p_col], treatment=df[treatment_col], y=df[outcome_col]
     )
@@ -454,7 +452,6 @@ def get_tmleqini(
     p_col="p",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
     normalize=False,
 ):
@@ -469,7 +466,6 @@ def get_tmleqini(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     Returns:
         (pandas.DataFrame): cumulative gains of model estimates based of TMLE
@@ -487,7 +483,7 @@ def get_tmleqini(
     inference_col = [x for x in inference_col if x in df.columns]

     # Initialize TMLE
-    tmle = TMLELearner(learner, cv=cv, calibrate_propensity=calibrate_propensity)
+    tmle = TMLELearner(learner, cv=cv)
     ate_all, ate_all_lb, ate_all_ub = tmle.estimate_ate(
         X=df[inference_col], p=df[p_col], treatment=df[treatment_col], y=df[outcome_col]
     )
@@ -696,7 +692,6 @@ def plot_tmlegain(
     p_col="tau",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
     figsize=(8, 8),
 ):
@@ -711,7 +706,6 @@ def plot_tmlegain(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     """

@@ -728,7 +722,6 @@ def plot_tmlegain(
         p_col=p_col,
         n_segment=n_segment,
         cv=cv,
-        calibrate_propensity=calibrate_propensity,
     )


@@ -741,7 +734,6 @@ def plot_tmleqini(
     p_col="tau",
     n_segment=5,
     cv=None,
-    calibrate_propensity=True,
     ci=False,
     figsize=(8, 8),
 ):
@@ -756,7 +748,6 @@ def plot_tmleqini(
         p_col (str, optional): the column name for propensity score
         n_segment (int, optional): number of segment that TMLE will estimated for each
         cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object
-        calibrate_propensity (bool, optional): whether calibrate propensity score or not
         ci (bool, optional): whether return confidence intervals for ATE or not
     """

@@ -773,7 +764,6 @@ def plot_tmleqini(
         p_col=p_col,
         n_segment=n_segment,
         cv=cv,
-        calibrate_propensity=calibrate_propensity,
     )
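
Callers of these helpers are now expected to supply an already-calibrated propensity column rather than asking the helper to calibrate it. A hedged sketch with get_tmlegain; the synthetic data and column names ("y", "w", "p", "model_tau") are illustrative, not part of the commit:

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from causalml.metrics import get_tmlegain
from causalml.propensity import ElasticNetPropensityModel

n = 1000
X = np.random.rand(n, 3)
df = pd.DataFrame(X, columns=["x1", "x2", "x3"])
df["w"] = np.random.binomial(1, 0.5, n)  # treatment
df["y"] = np.random.rand(n)              # outcome
df["model_tau"] = np.random.rand(n)      # stand-in for a model's uplift estimates
# Calibrated propensity scores, produced upstream by the propensity model.
df["p"] = ElasticNetPropensityModel(calibrate=True).fit_predict(X, df["w"].values)

gain = get_tmlegain(
    df,
    inference_col=["x1", "x2", "x3"],
    learner=GradientBoostingRegressor(),
    outcome_col="y",
    treatment_col="w",
    p_col="p",
    n_segment=5,
)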

causalml/propensity.py

Lines changed: 47 additions & 60 deletions
@@ -12,16 +12,19 @@


 class PropensityModel(metaclass=ABCMeta):
-    def __init__(self, clip_bounds=(1e-3, 1 - 1e-3), **model_kwargs):
+    def __init__(self, clip_bounds=(1e-3, 1 - 1e-3), calibrate=True, **model_kwargs):
         """
         Args:
             clip_bounds (tuple): lower and upper bounds for clipping propensity scores. Bounds should be implemented
                 such that: 0 < lower < upper < 1, to avoid division by zero in BaseRLearner.fit_predict() step.
+            calibrate (bool): whether calibrate the propensity score
             model_kwargs: Keyword arguments to be passed to the underlying classification model.
         """
         self.clip_bounds = clip_bounds
+        self.calibrate = calibrate
         self.model_kwargs = model_kwargs
         self.model = self._model
+        self.calibrator = None

     @property
     @abstractmethod
@@ -40,6 +43,15 @@ def fit(self, X, y):
             y (numpy.ndarray): a binary target vector
         """
         self.model.fit(X, y)
+        if self.calibrate:
+            # Fit a calibrator to the propensity scores with IsotonicRegression.
+            # Ref: https://scikit-learn.org/stable/modules/isotonic.html
+            self.calibrator = IsotonicRegression(
+                out_of_bounds="clip",
+                y_min=self.clip_bounds[0],
+                y_max=self.clip_bounds[1],
+            )
+            self.calibrator.fit(self.model.predict_proba(X)[:, 1], y)

     def predict(self, X):
         """
@@ -51,7 +63,11 @@
         Returns:
             (numpy.ndarray): Propensity scores between 0 and 1.
         """
-        return np.clip(self.model.predict_proba(X)[:, 1], *self.clip_bounds)
+        p = self.model.predict_proba(X)[:, 1]
+        if self.calibrate:
+            p = self.calibrator.transform(p)
+
+        return np.clip(p, *self.clip_bounds)

     def fit_predict(self, X, y):
         """
@@ -66,7 +82,6 @@ def fit_predict(self, X, y):
         """
         self.fit(X, y)
         propensity_scores = self.predict(X)
-        logger.info("AUC score: {:.6f}".format(auc(y, propensity_scores)))
         return propensity_scores


@@ -112,12 +127,15 @@ class GradientBoostedPropensityModel(PropensityModel):
     https://xgboost.readthedocs.io/en/latest/python/python_api.html
     """

-    def __init__(self, early_stop=False, clip_bounds=(1e-3, 1 - 1e-3), **model_kwargs):
+    def __init__(
+        self,
+        early_stop=False,
+        clip_bounds=(1e-3, 1 - 1e-3),
+        calibrate=True,
+        **model_kwargs,
+    ):
         self.early_stop = early_stop
-
-        super(GradientBoostedPropensityModel, self).__init__(
-            clip_bounds, **model_kwargs
-        )
+        super().__init__(clip_bounds, calibrate, **model_kwargs)

     @property
     def _model(self):
@@ -156,50 +174,25 @@ def fit(self, X, y, stop_val_size=0.2):
                 y_train,
                 eval_set=[(X_val, y_val)],
             )
+            if self.calibrate:
+                self.calibrator = IsotonicRegression(
+                    out_of_bounds="clip",
+                    y_min=self.clip_bounds[0],
+                    y_max=self.clip_bounds[1],
+                )
+                self.calibrator.fit(self.model.predict_proba(X)[:, 1], y)
         else:
-            super(GradientBoostedPropensityModel, self).fit(X, y)
-
-    def predict(self, X):
-        """
-        Predict propensity scores.
-
-        Args:
-            X (numpy.ndarray): a feature matrix
-
-        Returns:
-            (numpy.ndarray): Propensity scores between 0 and 1.
-        """
-        if self.early_stop:
-            return np.clip(
-                self.model.predict_proba(X)[:, 1],
-                *self.clip_bounds,
-            )
-        else:
-            return super(GradientBoostedPropensityModel, self).predict(X)
-
-
-def calibrate(ps, treatment):
-    """Calibrate propensity scores with IsotonicRegression.
-
-    Ref: https://scikit-learn.org/stable/modules/isotonic.html
-
-    Args:
-        ps (numpy.array): a propensity score vector
-        treatment (numpy.array): a binary treatment vector (0: control, 1: treated)
-
-    Returns:
-        (numpy.array): a calibrated propensity score vector
-    """
-
-    two_eps = 2.0 * np.finfo(float).eps
-    pm_ir = IsotonicRegression(out_of_bounds="clip", y_min=two_eps, y_max=1.0 - two_eps)
-    ps_ir = pm_ir.fit_transform(ps, treatment)
-
-    return ps_ir
+            super().fit(X, y)


 def compute_propensity_score(
-    X, treatment, p_model=None, X_pred=None, treatment_pred=None, calibrate_p=True
+    X,
+    treatment,
+    p_model=None,
+    X_pred=None,
+    treatment_pred=None,
+    calibrate_p=True,
+    clip_bounds=(1e-3, 1 - 1e-3),
 ):
     """Generate propensity score if user didn't provide and optionally calibrate.

@@ -210,16 +203,20 @@ def compute_propensity_score(
         X_pred (np.matrix, optional): features for prediction
         treatment_pred (np.array or pd.Series, optional): a treatment vector for prediciton
         calibrate_p (bool, optional): whether calibrate the propensity score
+        clip_bounds (tuple, optional): lower and upper bounds for clipping propensity scores. Bounds should be implemented
+            such that: 0 < lower < upper < 1, to avoid division by zero in BaseRLearner.fit_predict() step.

     Returns:
         (tuple)
             - p (numpy.ndarray): propensity score
-            - p_model (PropensityModel): either the original p_model, a trained ElasticNetPropensityModel, or None if calibrate_p=True
+            - p_model (PropensityModel): either the original p_model or a trained ElasticNetPropensityModel
     """
     if treatment_pred is None:
         treatment_pred = treatment.copy()
     if p_model is None:
-        p_model = ElasticNetPropensityModel()
+        p_model = ElasticNetPropensityModel(
+            clip_bounds=clip_bounds, calibrate=calibrate_p
+        )

     p_model.fit(X, treatment)

@@ -231,14 +228,4 @@ def compute_propensity_score(
         logger.info("predict_proba not available, using predict instead")
         p = p_model.predict(X_pred)

-    if calibrate_p:
-        logger.info("Calibrating propensity scores. Returning p_model=None.")
-        p = calibrate(p, treatment_pred)
-        p_model = None
-
-    # force the p values within the range
-    eps = np.finfo(float).eps
-    p = np.where(p < 0 + eps, 0 + eps * 1.001, p)
-    p = np.where(p > 1 - eps, 1 - eps * 1.001, p)
-
     return p, p_model
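
Putting the refactor together: fit() now trains an IsotonicRegression calibrator alongside the classifier, predict() applies it before clipping, and compute_propensity_score() forwards calibrate_p and clip_bounds to the model instead of post-processing its output. A hedged usage sketch with synthetic data:

import numpy as np
from sklearn.isotonic import IsotonicRegression
from causalml.propensity import ElasticNetPropensityModel, compute_propensity_score

X = np.random.rand(2000, 6)           # illustrative features
w = np.random.binomial(1, 0.4, 2000)  # illustrative binary treatment

# The calibrator lives on the fitted model and is applied in predict().
pm = ElasticNetPropensityModel(clip_bounds=(1e-3, 1 - 1e-3), calibrate=True)
pm.fit(X, w)
assert isinstance(pm.calibrator, IsotonicRegression)
p = pm.predict(X)

# compute_propensity_score() now delegates calibration and clipping to the
# model and always returns the fitted model rather than None.
p2, p_model = compute_propensity_score(
    X, w, calibrate_p=True, clip_bounds=(1e-3, 1 - 1e-3)
)
assert p2.min() >= 1e-3 and p2.max() <= 1 - 1e-3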
