diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py index b9f36c7..c9bf5d3 100644 --- a/foundry/evaluation/marginal_effects.py +++ b/foundry/evaluation/marginal_effects.py @@ -15,7 +15,7 @@ class Binned: - def __init__(self, col: str, bins: Union[int, Sequence] = 20, **kwargs): + def __init__(self, col: str, bins: Union[None, int, Sequence] = 20, **kwargs): """ This class creates an object which can bin a pandas.Series. ``` @@ -504,26 +504,52 @@ def _get_binned_feature_map(X: pd.DataFrame, aggfun: Union[str, Callable]) -> pd.DataFrame: """ Get a dataframe that maps the binned version of a feature to the aggregates of its original values. + + :param X: A dataframe which contains the columns binned_fname and fname + :param binned_fname: The column name of the binned data + :param fname: The column name of the unbinned data + :param aggfun: the aggregation of X[fname] based on grouping by binned_fname. The special case of 'mid' will use + the midpoint of the bins in X[binned_fname]. In the case that there are no actual values in a bin to aggregate, the midpoint + of the bin will be used. + + :returns: a pd.DataFrame with columns [binned_fname, fname]. The returned[fname] will contain the aggregated values. + :raises ValueError: if fname and binned_fname are the same + :raises ValueError: if there are inf or na in the resulting aggregated values. """ - assert binned_fname != fname + if binned_fname == fname: + raise ValueError("binned_fname and fname cannot be the same column.") if aggfun == 'mid': - # creates a df with unique values of `binned_fname` and `nans` for `fname`. - # this will then get filled with the midpoint below: - # todo: less hacky way to do this - df_mapping = X.groupby(binned_fname, observed=False)[fname].agg('count').reset_index() - df_mapping[fname] = float('nan') - else: - df_mapping = X.groupby(binned_fname, observed=False)[fname].agg(aggfun).reset_index() - - # for any bins that aren't actually observed, use the midpoint: - midpoints = pd.Series([x.mid for x in df_mapping[binned_fname]]) - if np.isinf(midpoints).any() and df_mapping[fname].isnull().any(): - raise ValueError( - f"[{fname}] `inf` bin cuts cannot be used when no data present in the bin:" - f"{df_mapping[binned_fname][np.isinf(midpoints)]}" + aggfun = lambda series: series.name.mid + + df_mapping = ( + X + .groupby( + binned_fname, + group_keys=True, + observed=False, + as_index=False, ) - df_mapping[fname].fillna(midpoints, inplace=True) + [fname] + .apply(aggfun) + .assign(**{ + fname: lambda df: ( + df + [fname] + .fillna( + df + [binned_fname] + .map(lambda interval: interval.mid) + .astype(float) + ) + ) + }) + ) + + with pd.option_context("mode.use_inf_as_na", True): + if df_mapping[fname].isna().any(): + raise ValueError(f"aggfun resulted in invalid values: \n {df_mapping}") + return df_mapping def _get_df_novary(self, diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py index d80fc93..9cd1da1 100644 --- a/tests/evaluation/test_marginal_effects.py +++ b/tests/evaluation/test_marginal_effects.py @@ -1,13 +1,15 @@ from typing import Callable -import pandas as pd +from unittest.mock import create_autospec + import numpy as np +import pandas as pd import pytest -from unittest.mock import create_autospec -from pandas.testing import assert_series_equal +from foundry.evaluation.marginal_effects import (Binned, MarginalEffects, + binned, raw) +from pandas.testing import assert_frame_equal, assert_series_equal from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline -from foundry.evaluation.marginal_effects import Binned, MarginalEffects, binned, raw class TestBinned(): @pytest.mark.parametrize( @@ -72,7 +74,7 @@ def test_binned_init(self, bins): ) ), ( - False, + None, pd.Series(list(range(20)), name="my_feature") ) ], @@ -133,3 +135,89 @@ def test_feature_names_in(self, col_transformer__columns, expected): assert isinstance(me.feature_names_in, list) assert list(sorted(expected)) == list(sorted(me.feature_names_in)) + + binned_col_A = pd.Series( + [ + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0), + ], + dtype=pd.CategoricalDtype( + categories=[ + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0) + ], + ordered=True + ), + name="binnedA" + ) + + @pytest.mark.parametrize( + argnames=["aggfun", "expected"], + argvalues=[ + ( + "mid", + pd.DataFrame({"binnedA": binned_col_A, "colA": [1.4995, 2.5]}) + ), + ( + "min", + pd.DataFrame({"binnedA": binned_col_A, "colA": [1, 3]}) + ), + ( + np.median, + pd.DataFrame({"binnedA": binned_col_A, "colA": [1.5, 3.0]}) + ), + ] + ) + def test__get_binned_feature_map(self, aggfun, expected): + df = ( + self.x_data + .assign( + **{ + "binnedA": [ + pd.Interval(0.999, 2.0), + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0), + ], + }, + ) + .astype({"binnedA": self.binned_col_A.dtype}) + ) + + test = MarginalEffects._get_binned_feature_map( + df, + "binnedA", + "colA", + aggfun=aggfun, + ) + + print(test.dtypes, expected.dtypes) + assert_frame_equal(test, expected) + + def test__get_binned_feature_map_empty_bins(self): + df = ( + self.x_data + .assign( + **{ + "binnedA": pd.Categorical( + [ + pd.Interval(0.999, 2.0), + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0), + ], + categories=[ + pd.Interval(-np.inf, 0.999), + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0) + ], + ) + }, + ) + ) + + with pytest.raises(ValueError): + MarginalEffects._get_binned_feature_map( + df, + "binnedA", + "colA", + "median", + ) diff --git a/tests/preprocessing/sklearn/test_dataframe_transformer.py b/tests/preprocessing/sklearn/test_dataframe_transformer.py index f285379..25b1e7a 100644 --- a/tests/preprocessing/sklearn/test_dataframe_transformer.py +++ b/tests/preprocessing/sklearn/test_dataframe_transformer.py @@ -19,7 +19,7 @@ class TestDataFrameTransformer: (np.zeros((3, 2)), pd.DataFrame(np.zeros((3, 2)))), # convert sparse: ( - OneHotEncoder(sparse=True).fit_transform([['a'], ['b'], ['c'], ['d']]), + OneHotEncoder(sparse_output=True).fit_transform([['a'], ['b'], ['c'], ['d']]), pd.DataFrame(np.eye(4)) ) ]