diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml index 794814cf..2c56ff71 100644 --- a/.github/workflows/CI.yaml +++ b/.github/workflows/CI.yaml @@ -44,7 +44,6 @@ jobs: - name: Upgrade pip and install doc requirements run: | python -m pip install --upgrade pip - python -m pip install pip-tools python -m pip install -e ".[extras,docs]" - name: build documentation run: | diff --git a/README.rst b/README.rst index 8e1bb219..a00e6e9d 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,7 @@ anesthetic: nested sampling post-processing =========================================== :Authors: Will Handley and Lukas Hergt -:Version: 2.0.0-beta.25 +:Version: 2.0.0-beta.26 :Homepage: https://github.com/williamjameshandley/anesthetic :Documentation: http://anesthetic.readthedocs.io/ @@ -191,8 +191,8 @@ Why create another one? In general, any dedicated user of software will find tha .. code:: python - from anesthetic import MCMCSamples - samples = MCMCSamples(root=file_root) # Load the samples + from anesthetic import read_chains + samples = read_chains(file_root) # Load the samples samples['omegab'] = samples.omegabh2/(samples.H0/100)**2 # Define omegab samples.tex['omegab'] = '$\Omega_b$' # Label omegab samples.plot_1d('omegab') # Simple 1D plot diff --git a/anesthetic/_version.py b/anesthetic/_version.py index c4c4e20b..019ed87d 100644 --- a/anesthetic/_version.py +++ b/anesthetic/_version.py @@ -1 +1 @@ -__version__ = '2.0.0b25' +__version__ = '2.0.0b26' diff --git a/anesthetic/samples.py b/anesthetic/samples.py index 038faea0..9a910185 100644 --- a/anesthetic/samples.py +++ b/anesthetic/samples.py @@ -515,7 +515,7 @@ def remove_burn_in(self, burn_in, reset_index=False, inplace=False): Indicates whether to modify the existing array or return a copy. """ - chains = self.groupby(('chain', '$n_\\mathrm{chain}$'), + chains = self.groupby(('chain', '$n_\\mathrm{chain}$'), sort=False, group_keys=False) nchains = chains.ngroups if isinstance(burn_in, (int, float)): @@ -574,25 +574,32 @@ def Gelman_Rubin(self, params=None): and 'logL' not in key and 'chain' not in key] chains = self[params+['chain']].groupby( - ('chain', '$n_\\mathrm{chain}$') + ('chain', '$n_\\mathrm{chain}$'), sort=False, ) + nchains = chains.ngroups # Within chain variance ``W`` # (average variance within each chain): - W = chains.cov().groupby(level=['params', 'labels']).mean().to_numpy() - # TODO: the above line should be a weighted mean - # --> need to fix groupby for WeightedDataFrames! - + W = chains.cov().groupby(level=('params', 'labels'), sort=False).mean() # Between-chain variance ``B`` - # (variance of the chain means compared to the full mean): - means_diff = (chains.mean() - self[params].mean()).to_numpy() - B = (means_diff.T @ means_diff) / (chains.ngroups - 1) - # B = chains.mean().cov().to_numpy() - # TODO: fix once groupby is fixed - - L = np.linalg.cholesky(W) - invL = np.linalg.inv(L) - D = np.linalg.eigvalsh(invL @ B @ invL.T) + # (variance of the chain means): + B = np.atleast_2d(np.cov(chains.mean().T, ddof=1)) + # We don't weight `B` with the effective number of samples (sum of the + # weights), here, because we want to notice outliers from shorter + # chains. + # In order to be conservative, we generally want to underestimate `W` + # and overestimate `B`, since `W` goes in the denominator and `B` in + # the numerator of the Gelman--Rubin statistic `Rminus1`. + + try: + invL = np.linalg.inv(np.linalg.cholesky(W)) + except np.linalg.LinAlgError as e: + raise np.linalg.LinAlgError( + "Make sure you do not have linearly dependent parameters, " + "e.g. having both `As` and `A=1e9*As` causes trouble.") from e + D = np.linalg.eigvalsh(invL @ ((nchains+1)/nchains * B) @ invL.T) + # The factor of `(nchains+1)/nchains` accounts for the additional + # uncertainty from using a finite number of chains. Rminus1 = np.max(np.abs(D)) return Rminus1 diff --git a/anesthetic/utils.py b/anesthetic/utils.py index 0208a855..541d2132 100644 --- a/anesthetic/utils.py +++ b/anesthetic/utils.py @@ -530,8 +530,9 @@ class to adjust """ for key, val in cls.__dict__.items(): doc = inspect.getdoc(val) - newdoc = re.sub(pattern, repl, doc, *args, **kwargs) - try: - cls.__dict__[key].__doc__ = newdoc - except AttributeError: - pass + if doc is not None: + newdoc = re.sub(pattern, repl, doc, *args, **kwargs) + try: + cls.__dict__[key].__doc__ = newdoc + except AttributeError: + pass diff --git a/anesthetic/weighted_pandas.py b/anesthetic/weighted_pandas.py index fb5ab1bc..cfe5a1d9 100644 --- a/anesthetic/weighted_pandas.py +++ b/anesthetic/weighted_pandas.py @@ -1,12 +1,131 @@ """Pandas DataFrame and Series with weighted samples.""" +import warnings from inspect import signature import numpy as np from pandas import Series, DataFrame, concat, MultiIndex +from pandas.core.groupby import GroupBy, SeriesGroupBy, DataFrameGroupBy, ops +from pandas._libs import lib +from pandas._libs.lib import no_default +from pandas.util._exceptions import find_stack_level from pandas.util import hash_pandas_object from numpy.ma import masked_array from anesthetic.utils import (compress_weights, channel_capacity, quantile, temporary_seed, adjust_docstrings) +from pandas.core.dtypes.missing import notna + + +class WeightedGroupBy(GroupBy): + """Weighted version of ``pandas.core.groupby.GroupBy``.""" + + grouper: ops.BaseGrouper + """:meta private:""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _add_weights(self, name, *args, **kwargs): + result = self.agg(lambda df: getattr(self.obj._constructor(df), name) + (*args, **kwargs)).set_weights(self.get_weights()) + return result.__finalize__(self.obj, method="groupby") + + def mean(self, *args, **kwargs): # noqa: D102 + return self._add_weights("mean", *args, **kwargs) + + def std(self, *args, **kwargs): # noqa: D102 + return self._add_weights("std", *args, **kwargs) + + def median(self, *args, **kwargs): # noqa: D102 + return self._add_weights("median", *args, **kwargs) + + def var(self, *args, **kwargs): # noqa: D102 + return self._add_weights("var", *args, **kwargs) + + def kurt(self, *args, **kwargs): # noqa: D102 + return self._add_weights("kurt", *args, **kwargs) + + def kurtosis(self, *args, **kwargs): # noqa: D102 + return self._add_weights("kurtosis", *args, **kwargs) + + def sem(self, *args, **kwargs): # noqa: D102 + return self._add_weights("sem", *args, **kwargs) + + def quantile(self, *args, **kwargs): # noqa: D102 + return self._add_weights("quantile", *args, **kwargs) + + def get_weights(self): + """Return the weights of the grouped samples.""" + return self.agg(lambda df: df.get_weights().sum()) + + def _make_wrapper(self, name): + _wrapper = super()._make_wrapper(name) + + def wrapper(*args, **kwargs): + result = _wrapper(*args, **kwargs) + try: + index = result.index.get_level_values(self.keys) + weights = self.get_weights()[index] + except KeyError: + weights = self.get_weights() + return result.set_weights(weights, level=1) + + wrapper.__name__ = name + return wrapper + + +class WeightedSeriesGroupBy(WeightedGroupBy, SeriesGroupBy): + """Weighted version of ``pandas.core.groupby.SeriesGroupBy``.""" + + def sample(self, *args, **kwargs): # noqa: D102 + return super().sample(weights=self.obj.get_weights(), *args, **kwargs) + + +class WeightedDataFrameGroupBy(WeightedGroupBy, DataFrameGroupBy): + """Weighted version of ``pandas.core.groupby.DataFrameGroupBy``.""" + + def get_weights(self): + """Return the weights of the grouped samples.""" + return super().get_weights().min(axis=1-self.axis) + + def _gotitem(self, key, ndim: int, subset=None): # pragma: no cover + if ndim == 2: + if subset is None: + subset = self.obj + return WeightedDataFrameGroupBy( + subset, + self.grouper, + axis=self.axis, + level=self.level, + grouper=self.grouper, + exclusions=self.exclusions, + selection=key, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + mutated=self.mutated, + dropna=self.dropna, + ) + elif ndim == 1: + if subset is None: + subset = self.obj[key] + return WeightedSeriesGroupBy( + subset, + level=self.level, + grouper=self.grouper, + selection=key, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + dropna=self.dropna, + ) + + raise AssertionError("invalid ndim for _gotitem") + + def sample(self, *args, **kwargs): # noqa: D102 + return super().sample(weights=self.obj.get_weights(), *args, **kwargs) class _WeightedObject(object): @@ -114,6 +233,8 @@ class WeightedSeries(_WeightedObject, Series): """Weighted version of :class:`pandas.Series`.""" def mean(self, skipna=True): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan null = self.isnull() & skipna return np.average(masked_array(self, null), weights=self.get_weights()) @@ -124,9 +245,13 @@ def kurtosis(self, *args, **kwargs): # noqa: D102 return self.kurt(*args, **kwargs) def median(self, *args, **kwargs): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan return self.quantile(*args, **kwargs) def var(self, skipna=True): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan null = self.isnull() & skipna mean = self.mean(skipna=skipna) if np.isnan(mean): @@ -134,20 +259,30 @@ def var(self, skipna=True): # noqa: D102 return np.average(masked_array((self-mean)**2, null), weights=self.get_weights()) - def cov(self, other, skipna=True): # noqa: D102 - null = (self.isnull() | other.isnull()) & skipna - x = self.mean(skipna=skipna) - y = other.mean(skipna=skipna) - if np.isnan(x) or np.isnan(y): + def cov(self, other, *args, **kwargs): # noqa: D102 + + this, other = self.align(other, join="inner", copy=False) + if len(this) == 0: return np.nan - return np.average(masked_array((self-x)*(other-y), null), - weights=self.get_weights()) - def corr(self, other, method="pearson", skipna=True): # noqa: D102 - norm = self.std(skipna=skipna)*other.std(skipna=skipna) - return self.cov(other, skipna=skipna)/norm + weights = self.index.to_frame()['weights'] + weights, _ = weights.align(other, join="inner", copy=False) + + valid = notna(this) & notna(other) + if not valid.all(): + this = this[valid] + other = other[valid] + weights = weights[valid] + + return np.cov(this, other, aweights=weights)[0, 1] + + def corr(self, other, *args, **kwargs): # noqa: D102 + norm = self.std(skipna=True)*other.std(skipna=True) + return self.cov(other, *args, **kwargs)/norm def kurt(self, skipna=True): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan null = self.isnull() & skipna mean = self.mean(skipna=skipna) std = self.std(skipna=skipna) @@ -157,6 +292,8 @@ def kurt(self, skipna=True): # noqa: D102 weights=self.get_weights()) def skew(self, skipna=True): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan null = self.isnull() & skipna mean = self.mean(skipna=skipna) std = self.std(skipna=skipna) @@ -166,6 +303,8 @@ def skew(self, skipna=True): # noqa: D102 weights=self.get_weights()) def mad(self, skipna=True): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan null = self.isnull() & skipna mean = self.mean(skipna=skipna) if np.isnan(mean): @@ -177,6 +316,8 @@ def sem(self, skipna=True): # noqa: D102 return np.sqrt(self.var(skipna=skipna)/self.neff()) def quantile(self, q=0.5, interpolation='linear'): # noqa: D102 + if self.get_weights().sum() == 0: + return np.nan return quantile(self.to_numpy(), q, self.get_weights(), interpolation) def compress(self, ncompress=True): @@ -204,12 +345,44 @@ def _constructor(self): def _constructor_expanddim(self): return WeightedDataFrame + def groupby( + self, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=True, + observed=False, + dropna=True, + ): # pragma: no cover # noqa: D102 + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + if not as_index: + raise TypeError("as_index=False only valid with DataFrame") + axis = self._get_axis_number(axis) + + return WeightedSeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + observed=observed, + dropna=dropna, + ) + class WeightedDataFrame(_WeightedObject, DataFrame): """Weighted version of :class:`pandas.DataFrame`.""" def mean(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 if self.isweighted(axis): + if self.get_weights(axis).sum() == 0: + return self._constructor_sliced(np.nan, + index=self._get_axis(1-axis)) null = self.isnull() & skipna mean = np.average(masked_array(self, null), weights=self.get_weights(axis), axis=axis) @@ -228,6 +401,9 @@ def median(self, *args, **kwargs): # noqa: D102 def var(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 if self.isweighted(axis): + if self.get_weights(axis).sum() == 0: + return self._constructor_sliced(np.nan, + index=self._get_axis(1-axis)) null = self.isnull() & skipna mean = self.mean(axis=axis, skipna=skipna) var = np.average(masked_array((self-mean)**2, null), @@ -236,10 +412,10 @@ def var(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 else: return super().var(axis=axis, skipna=skipna, *args, **kwargs) - def cov(self, skipna=True, *args, **kwargs): # noqa: D102 + def cov(self, *args, **kwargs): # noqa: D102 if self.isweighted(): - null = self.isnull() & skipna - mean = self.mean(skipna=skipna) + null = self.isnull() + mean = self.mean(skipna=True) x = masked_array(self - mean, null) cov = np.ma.dot(self.get_weights()*x.T, x) \ / self.get_weights().sum().T @@ -264,19 +440,25 @@ def corr(self, method="pearson", skipna=True, def corrwith(self, other, axis=0, drop=False, method="pearson", *args, **kwargs): # noqa: D102 - if self.isweighted(axis): + axis = self._get_axis_number(axis) + if not self.isweighted(axis): + return super().corrwith(other, drop=drop, axis=axis, method=method, + *args, **kwargs) + else: if isinstance(other, Series): answer = self.apply(lambda x: other.corr(x, method=method), axis=axis) return self._constructor_sliced(answer) left, right = self.align(other, join="inner", copy=False) - weights = self.get_weights(axis) if axis == 1: left = left.T right = right.T + weights = left.index.to_frame()['weights'] + weights, _ = weights.align(right, join="inner", copy=False) + # mask missing values left = left + right * 0 right = right + left * 0 @@ -285,7 +467,7 @@ def corrwith(self, other, axis=0, drop=False, method="pearson", ldem = left - left.mean() rdem = right - right.mean() - num = (ldem * rdem * weights[:, None]).sum() + num = (ldem * rdem * weights.to_numpy()[:, None]).sum() dom = weights.sum() * left.std() * right.std() correl = num / dom @@ -301,12 +483,12 @@ def corrwith(self, other, axis=0, drop=False, method="pearson", index=idx_diff)]) return self._constructor_sliced(correl) - else: - return super().corrwith(other, drop=drop, axis=axis, method=method, - *args, **kwargs) def kurt(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 if self.isweighted(axis): + if self.get_weights(axis).sum() == 0: + return self._constructor_sliced(np.nan, + index=self._get_axis(1-axis)) null = self.isnull() & skipna mean = self.mean(axis=axis, skipna=skipna) std = self.std(axis=axis, skipna=skipna) @@ -318,6 +500,9 @@ def kurt(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 def skew(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 if self.isweighted(axis): + if self.get_weights(axis).sum() == 0: + return self._constructor_sliced(np.nan, + index=self._get_axis(1-axis)) null = self.isnull() & skipna mean = self.mean(axis=axis, skipna=skipna) std = self.std(axis=axis, skipna=skipna) @@ -329,6 +514,9 @@ def skew(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 def mad(self, axis=0, skipna=True, *args, **kwargs): # noqa: D102 if self.isweighted(axis): + if self.get_weights(axis).sum() == 0: + return self._constructor_sliced(np.nan, + index=self._get_axis(1-axis)) null = self.isnull() & skipna mean = self.mean(axis=axis, skipna=skipna) mad = np.average(masked_array(abs(self-mean), null), @@ -405,15 +593,61 @@ def _constructor_sliced(self): def _constructor(self): return WeightedDataFrame - -for cls in [WeightedDataFrame, WeightedSeries]: + def groupby( + self, + by=None, + axis=no_default, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + observed: bool = False, + dropna: bool = True, + ): # pragma: no cover # noqa: D102 + if axis is not lib.no_default: + axis = self._get_axis_number(axis) + if axis == 1: + warnings.warn( + "DataFrame.groupby with axis=1 is deprecated. Do " + "`frame.T.groupby(...)` without axis instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + "The 'axis' keyword in DataFrame.groupby is deprecated " + "and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + + return WeightedDataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + observed=observed, + dropna=dropna, + ) + + +for cls in [WeightedDataFrame, WeightedSeries, WeightedGroupBy, + WeightedDataFrameGroupBy, WeightedSeriesGroupBy]: adjust_docstrings(cls, r'\bDataFrame\b', 'WeightedDataFrame') adjust_docstrings(cls, r'\bDataFrames\b', 'WeightedDataFrames') adjust_docstrings(cls, r'\bSeries\b', 'WeightedSeries') adjust_docstrings(cls, 'core', 'pandas.core') - adjust_docstrings(cls, 'DataFrameGroupBy', - 'pandas.core.groupby.DataFrameGroupBy') - adjust_docstrings(cls, 'SeriesGroupBy', - 'pandas.core.groupby.SeriesGroupBy') adjust_docstrings(cls, 'pandas.core.window.Rolling.quantile', 'pandas.core.window.rolling.Rolling.quantile') + adjust_docstrings(cls, r'\bDataFrameGroupBy\b', 'WeightedDataFrameGroupBy') + adjust_docstrings(cls, r'\bSeriesGroupBy\b', 'WeightedSeriesGroupBy') +adjust_docstrings(WeightedDataFrame, 'resample', 'pandas.DataFrame.resample') +adjust_docstrings(WeightedSeries, 'resample', 'pandas.Series.resample') diff --git a/docs/source/anesthetic.rst b/docs/source/anesthetic.rst index 9e9b8284..97fa9731 100644 --- a/docs/source/anesthetic.rst +++ b/docs/source/anesthetic.rst @@ -99,6 +99,5 @@ anesthetic.weighted\_pandas module .. automodule:: anesthetic.weighted_pandas :members: :undoc-members: - :show-inheritance: diff --git a/tests/test_samples.py b/tests/test_samples.py index d5b31ac2..c8a00445 100644 --- a/tests/test_samples.py +++ b/tests/test_samples.py @@ -436,6 +436,11 @@ def test_mcmc_stats(): assert mcmc_half.Gelman_Rubin() < 0.01 assert mcmc_half.Gelman_Rubin(['x0']) < 0.01 assert mcmc_half.Gelman_Rubin(['x1']) < 0.01 + with pytest.raises(np.linalg.LinAlgError): + mcmc['y1'] = mcmc.x1 + mcmc['y2'] = mcmc.x1 + mcmc['y3'] = mcmc.x1 + mcmc.Gelman_Rubin(['x0', 'x1', 'y1', 'y2', 'y3']) # more burn-in checks mcmc_new = mcmc.remove_burn_in(burn_in=200.9) @@ -1339,3 +1344,253 @@ def test_old_gui(): make_2d_axes(['x0', 'y0'], tex={'x0': '$x_0$', 'y0': '$y_0$'}) with pytest.raises(NotImplementedError): make_1d_axes(['x0', 'y0'], tex={'x0': '$x_0$', 'y0': '$y_0$'}) + + +def test_groupby_stats(): + mcmc = read_chains('./tests/example_data/cb') + params = ['x0', 'x1'] + chains = mcmc[params + ['chain']].groupby(('chain', '$n_\\mathrm{chain}$')) + + assert chains.mean().isweighted() is True + assert chains.std().isweighted() is True + assert chains.median().isweighted() is True + assert chains.var().isweighted() is True + assert chains.kurt().isweighted() is True + assert chains.kurtosis().isweighted() is True + assert chains.skew().isweighted() is True + assert chains.sem().isweighted() is True + assert chains.corr().isweighted() is True + assert chains.cov().isweighted() is True + assert chains.hist().isweighted() is True + assert chains.corrwith(mcmc).isweighted() is True + + w1 = mcmc.loc[mcmc.chain == 1].get_weights().sum() + w2 = mcmc.loc[mcmc.chain == 2].get_weights().sum() + assert np.all(chains.mean().get_weights() == [w1, w2]) + assert np.all(chains.std().get_weights() == [w1, w2]) + assert np.all(chains.median().get_weights() == [w1, w2]) + assert np.all(chains.var().get_weights() == [w1, w2]) + assert np.all(chains.kurt().get_weights() == [w1, w2]) + assert np.all(chains.kurtosis().get_weights() == [w1, w2]) + assert np.all(chains.skew().get_weights() == [w1, w2]) + assert np.all(chains.sem().get_weights() == [w1, w2]) + w = [w1 for _ in range(len(params))] + [w2 for _ in range(len(params))] + assert np.all(chains.corr().get_weights() == w) + assert np.all(chains.cov().get_weights() == w) + assert np.all(chains.corrwith(mcmc).get_weights() == [w1, w2]) + + for chain in [1, 2]: + mask = mcmc.chain == chain + assert_allclose(mcmc.loc[mask, params].mean(), + chains.mean().loc[chain]) + assert_allclose(mcmc.loc[mask, params].std(), + chains.std().loc[chain]) + assert_allclose(mcmc.loc[mask, params].median(), + chains.median().loc[chain]) + assert_allclose(mcmc.loc[mask, params].var(), + chains.var().loc[chain]) + assert_allclose(mcmc.loc[mask, params].kurt(), + chains.kurt().loc[chain]) + assert_allclose(mcmc.loc[mask, params].kurtosis(), + chains.kurtosis().loc[chain]) + assert_allclose(mcmc.loc[mask, params].skew(), + chains.skew().loc[chain]) + assert_allclose(mcmc.loc[mask, params].mad(), + chains.mad().loc[chain]) + assert_allclose(mcmc.loc[mask, params].sem(), + chains.sem().loc[chain]) + assert_allclose(mcmc.loc[mask, params].cov(), + chains.cov().loc[chain]) + assert_allclose(mcmc.loc[mask, params].corr(), + chains.corr().loc[chain]) + assert_allclose([1, 1], chains.corrwith(mcmc.loc[mask, params] + ).loc[chain]) + + group = chains.get_group(chain).drop( + columns=('chain', '$n_\\mathrm{chain}$')) + assert_allclose(mcmc.loc[mask, params].mean(), group.mean()) + assert_allclose(mcmc.loc[mask, params].std(), group.std()) + assert_allclose(mcmc.loc[mask, params].median(), group.median()) + assert_allclose(mcmc.loc[mask, params].var(), group.var()) + assert_allclose(mcmc.loc[mask, params].kurt(), group.kurt()) + assert_allclose(mcmc.loc[mask, params].kurtosis(), group.kurtosis()) + assert_allclose(mcmc.loc[mask, params].skew(), group.skew()) + assert_allclose(mcmc.loc[mask, params].mad(), group.mad()) + assert_allclose(mcmc.loc[mask, params].sem(), group.sem()) + assert_allclose(mcmc.loc[mask, params].cov(), group.cov()) + assert_allclose(mcmc.loc[mask, params].corr(), group.corr()) + + assert_allclose(mcmc[params].mean(), chains.mean().mean()) + + for col in params: + if 'chain' not in col: + for chain in [1, 2]: + mask = mcmc.chain == chain + assert_allclose(mcmc.loc[mask, col].mean(), + chains[col].mean().loc[chain]) + assert_allclose(mcmc.loc[mask, col].std(), + chains[col].std().loc[chain]) + assert_allclose(mcmc.loc[mask, col].median(), + chains[col].median().loc[chain]) + assert_allclose(mcmc.loc[mask, col].var(), + chains[col].var().loc[chain]) + assert_allclose(mcmc.loc[mask, col].kurt(), + chains[col].kurt().loc[chain]) + assert_allclose(mcmc.loc[mask, col].kurtosis(), + chains[col].kurtosis().loc[chain]) + assert_allclose(mcmc.loc[mask, col].skew(), + chains[col].skew().loc[chain]) + assert_allclose(mcmc.loc[mask, col].mad(), + chains[col].mad().loc[chain]) + assert_allclose(mcmc.loc[mask, col].sem(), + chains[col].sem().loc[chain]) + assert_allclose(mcmc.loc[mask, col].cov(mcmc.loc[mask, col]), + chains[col].cov(mcmc.loc[mask, col]) + .loc[chain]) + assert_allclose(mcmc.loc[mask, col].corr(mcmc.loc[mask, col]), + chains[col].corr(mcmc.loc[mask, col]) + .loc[chain]) + q = np.random.rand() + assert_allclose(mcmc.loc[mask, col].quantile(q), + chains[col].quantile(q).loc[chain]) + + group = chains[col].get_group(chain) + assert_allclose(mcmc.loc[mask, col].mean(), group.mean()) + assert_allclose(mcmc.loc[mask, col].std(), group.std()) + assert_allclose(mcmc.loc[mask, col].median(), group.median()) + assert_allclose(mcmc.loc[mask, col].var(), group.var()) + assert_allclose(mcmc.loc[mask, col].kurt(), group.kurt()) + assert_allclose(mcmc.loc[mask, col].kurtosis(), + group.kurtosis()) + assert_allclose(mcmc.loc[mask, col].skew(), group.skew()) + assert_allclose(mcmc.loc[mask, col].mad(), group.mad()) + assert_allclose(mcmc.loc[mask, col].sem(), group.sem()) + + assert_allclose(mcmc.loc[mask, col].cov(mcmc.loc[mask, col]), + group.cov(mcmc.loc[mask, col])) + assert_allclose(mcmc.loc[mask, col].corr(mcmc.loc[mask, col]), + group.corr(mcmc.loc[mask, col])) + + sample = chains.sample(5) + assert len(sample) == 10 + assert sample.value_counts('chain')[1] == 5 + assert sample.value_counts('chain')[2] == 5 + + chains = mcmc.chain.groupby(mcmc.chain) + sample = chains.sample(5) + assert len(sample) == 10 + assert sample.value_counts()[1] == 5 + assert sample.value_counts()[2] == 5 + + +def test_groupby_plots(): + mcmc = read_chains('./tests/example_data/cb') + params = ['x0', 'x1'] + chains = mcmc[params + ['chain']].groupby(('chain', '$n_\\mathrm{chain}$')) + for param in params: + gb_plot = chains.hist(param) + for chain in [1, 2]: + mcmc_axes = mcmc.loc[mcmc.chain == chain].hist(param).flatten() + gb_axes = gb_plot[chain].values[0].flatten() + + mcmc_widths = [p.get_width() for ax in mcmc_axes + for p in ax.patches] + gb_widths = [p.get_width() for ax in gb_axes for p in ax.patches] + assert_allclose(mcmc_widths, gb_widths) + + mcmc_heights = [p.get_height() for ax in mcmc_axes + for p in ax.patches] + gb_heights = [p.get_height() for ax in gb_axes for p in ax.patches] + assert_allclose(mcmc_heights, gb_heights) + plt.close('all') + + for param in params: + _, gb_ax = plt.subplots() + gb_plots = chains[param].plot.hist(ax=gb_ax) + _, mcmc_ax = plt.subplots() + for chain, gb_ax in zip([1, 2], gb_plots): + mcmc_ax = mcmc.loc[mcmc.chain == chain][param].plot.hist( + ax=mcmc_ax) + mcmc_widths = [p.get_width() for p in mcmc_ax.patches] + gb_widths = [p.get_width() for p in gb_ax.patches] + assert_allclose(mcmc_widths, gb_widths) + plt.close('all') + + for param in params: + _, gb_ax = plt.subplots() + gb_plots = chains[param].plot.hist_1d(ax=gb_ax) + _, mcmc_ax = plt.subplots() + for chain, gb_ax in zip([1, 2], gb_plots): + mcmc_ax = mcmc.loc[mcmc.chain == chain][param].plot.hist_1d( + ax=mcmc_ax) + mcmc_widths = [p.get_width() for p in mcmc_ax.patches] + gb_widths = [p.get_width() for p in gb_ax.patches] + assert_allclose(mcmc_widths, gb_widths) + plt.close('all') + + for param in params: + _, gb_ax = plt.subplots() + gb_plots = chains[param].plot.kde(ax=gb_ax) + _, mcmc_ax = plt.subplots() + for chain, gb_ax in zip([1, 2], gb_plots): + mcmc_ax = mcmc.loc[mcmc.chain == chain][param].plot.kde( + ax=mcmc_ax) + [assert_allclose(m.get_data(), g.get_data()) + for m, g in zip(mcmc_ax.get_lines(), gb_ax.get_lines())] + plt.close('all') + + for param in params: + _, gb_ax = plt.subplots() + gb_plots = chains[param].plot.kde_1d(ax=gb_ax) + _, mcmc_ax = plt.subplots() + for chain, gb_ax in zip([1, 2], gb_plots): + mcmc_ax = mcmc.loc[mcmc.chain == chain][param].plot.kde_1d( + ax=mcmc_ax) + [assert_allclose(m.get_data(), g.get_data()) + for m, g in zip(mcmc_ax.get_lines(), gb_ax.get_lines())] + plt.close('all') + + for chain, gb_ax in zip([1, 2], chains.plot.hist_2d(*params)): + mcmc_ax = mcmc.loc[mcmc.chain == chain].plot.hist_2d(*params) + mcmc_widths = [p.get_width() for p in mcmc_ax.patches] + gb_widths = [p.get_width() for p in gb_ax.patches] + assert_allclose(mcmc_widths, gb_widths) + mcmc_heights = [p.get_height() for p in mcmc_ax.patches] + gb_heights = [p.get_height() for p in gb_ax.patches] + assert_allclose(mcmc_heights, gb_heights) + mcmc_colors = [p.get_facecolor() for p in mcmc_ax.patches] + gb_colors = [p.get_facecolor() for p in gb_ax.patches] + assert_allclose(mcmc_colors, gb_colors) + plt.close('all') + + for chain, gb_ax in zip([1, 2], chains.plot.kde_2d(*params)): + mcmc_ax = mcmc.loc[mcmc.chain == chain].plot.kde_2d(*params) + mcmc_verts = [p.get_verts() for p in mcmc_ax.patches] + gb_verts = [p.get_verts() for p in gb_ax.patches] + assert_allclose(mcmc_verts, gb_verts) + mcmc_colors = [p.get_facecolor() for p in mcmc_ax.patches] + gb_colors = [p.get_facecolor() for p in gb_ax.patches] + assert_allclose(mcmc_colors, gb_colors) + plt.close('all') + + if 'fastkde' in sys.modules: + for param in params: + _, gb_ax = plt.subplots() + gb_plots = chains[param].plot.fastkde_1d(ax=gb_ax) + _, mcmc_ax = plt.subplots() + for chain, gb_ax in zip([1, 2], gb_plots): + mcmc_ax = mcmc.loc[mcmc.chain == chain][param].plot.fastkde_1d( + ax=mcmc_ax) + [assert_allclose(m.get_data(), g.get_data()) + for m, g in zip(mcmc_ax.get_lines(), gb_ax.get_lines())] + plt.close('all') + + for chain, gb_ax in zip([1, 2], chains.plot.fastkde_2d(*params)): + mcmc_ax = mcmc.loc[mcmc.chain == chain].plot.fastkde_2d(*params) + mcmc_verts = [p.get_verts() for p in mcmc_ax.patches] + gb_verts = [p.get_verts() for p in gb_ax.patches] + assert_allclose(mcmc_verts, gb_verts) + mcmc_colors = [p.get_facecolor() for p in mcmc_ax.patches] + gb_colors = [p.get_facecolor() for p in gb_ax.patches] + assert_allclose(mcmc_colors, gb_colors) + plt.close('all') diff --git a/tests/test_weighted_pandas.py b/tests/test_weighted_pandas.py index 71128e08..271f98e8 100644 --- a/tests/test_weighted_pandas.py +++ b/tests/test_weighted_pandas.py @@ -176,7 +176,7 @@ def test_WeightedDataFrame_corrwith(frame): assert isinstance(correl, WeightedSeries) assert not correl.isweighted() assert_array_equal(correl.index, frame.columns) - assert_allclose(correl, frame.corr()['A']) + assert_allclose(correl, frame.corr()['A'], atol=1e-2) correl = frame.corrwith(frame[['A', 'B']]) assert isinstance(correl, WeightedSeries) @@ -417,12 +417,6 @@ def test_WeightedDataFrame_nan(frame): assert_array_equal(frame.std(axis=1, skipna=False).isna()[0:6], [True, False, False, False, False, False]) - assert ~frame.cov().isna().any().any() - ans = np.zeros((6, 6), dtype=bool) - ans[0] = True - ans[:, 0] = True - assert_array_equal(frame.cov(skipna=False).isna(), ans) - frame['B'][2] = np.nan assert ~frame.mean().isna().any() assert_array_equal(frame.mean(skipna=False).isna(), @@ -436,11 +430,6 @@ def test_WeightedDataFrame_nan(frame): assert_array_equal(frame.std(axis=1, skipna=False).isna()[0:6], [True, False, True, False, False, False]) - assert ~frame.cov().isna().any().any() - ans[1] = True - ans[:, 1] = True - assert_array_equal(frame.cov(skipna=False).isna(), ans) - frame['C'][4] = np.nan frame['D'][5] = np.nan frame['E'][6] = np.nan @@ -455,9 +444,6 @@ def test_WeightedDataFrame_nan(frame): assert_array_equal(frame.std(axis=1, skipna=False).isna()[0:6], [True, False, True, False, True, True]) - assert ~frame.cov().isna().any().any() - assert frame.cov(skipna=False).isna().all().all() - assert_allclose(frame.mean(), 0.5, atol=1e-2) assert_allclose(frame.std(), (1./12)**0.5, atol=1e-2) assert_allclose(frame.cov(), (1./12)*np.identity(6), atol=1e-2) @@ -467,6 +453,18 @@ def test_WeightedDataFrame_nan(frame): assert isinstance(frame.mean(axis=1), WeightedSeries) assert frame.mean(axis=1).isweighted() + assert frame[:0].mean().isna().all() + assert frame[:0].std().isna().all() + assert frame[:0].median().isna().all() + assert frame[:0].var().isna().all() + assert frame[:0].cov().isna().all().all() + assert frame[:0].corr().isna().all().all() + assert frame[:0].kurt().isna().all() + assert frame[:0].skew().isna().all() + assert frame[:0].mad().isna().all() + assert frame[:0].sem().isna().all() + assert frame[:0].quantile().isna().all() + def test_WeightedSeries_mean(series): series[0] = np.nan @@ -490,11 +488,9 @@ def test_WeightedSeries_cov(frame): assert_allclose(frame.A.cov(frame.A), 1./12, atol=1e-2) assert_allclose(frame.A.cov(frame.B), 0, atol=1e-2) - frame.loc[0, 'B'] = np.nan - assert ~np.isnan(frame.A.cov(frame.B)) - assert np.isnan(frame.A.cov(frame.B, skipna=False)) - assert ~np.isnan(frame.B.cov(frame.A)) - assert np.isnan(frame.B.cov(frame.A, skipna=False)) + frame['A'][0] = np.nan + assert_allclose(frame.A.cov(frame.A), 1./12, atol=1e-2) + assert_allclose(frame.A.cov(frame.B), 0, atol=1e-2) def test_WeightedSeries_corr(frame): @@ -608,6 +604,18 @@ def test_WeightedSeries_nan(series): assert_allclose(series.var(), 1./12, atol=1e-2) assert_allclose(series.std(), (1./12)**0.5, atol=1e-2) + assert np.isnan(series[:0].mean()) + assert np.isnan(series[:0].std()) + assert np.isnan(series[:0].median()) + assert np.isnan(series[:0].var()) + assert np.isnan(series[:0].cov(series)) + assert np.isnan(series[:0].corr(series)) + assert np.isnan(series[:0].kurt()) + assert np.isnan(series[:0].skew()) + assert np.isnan(series[:0].mad()) + assert np.isnan(series[:0].sem()) + assert np.isnan(series[:0].quantile()) + @pytest.fixture def mcmc_df():