diff --git a/README.rst b/README.rst index 4ecb81c2..a5fa35ed 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,7 @@ anesthetic: nested sampling post-processing =========================================== :Authors: Will Handley and Lukas Hergt -:Version: 2.9.0 +:Version: 2.10.0 :Homepage: https://github.com/handley-lab/anesthetic :Documentation: http://anesthetic.readthedocs.io/ diff --git a/anesthetic/_version.py b/anesthetic/_version.py index 387cfacc..e2e6e4b5 100644 --- a/anesthetic/_version.py +++ b/anesthetic/_version.py @@ -1 +1 @@ -__version__ = '2.9.0' +__version__ = '2.10.0' diff --git a/anesthetic/read/csv.py b/anesthetic/read/csv.py index 3dd2713e..3a3ef0ea 100644 --- a/anesthetic/read/csv.py +++ b/anesthetic/read/csv.py @@ -5,7 +5,7 @@ def read_csv(filename, *args, **kwargs): - """Read a CSV file into a :class:`Samples` object.""" + """Read a CSV file into a :class:`anesthetic.samples.Samples` object.""" filename = Path(filename) kwargs['label'] = kwargs.get('label', filename.stem) wldf = wl_read_csv(filename.with_suffix('.csv')) diff --git a/anesthetic/tension.py b/anesthetic/tension.py new file mode 100644 index 00000000..fb9c8590 --- /dev/null +++ b/anesthetic/tension.py @@ -0,0 +1,104 @@ +"""Tension statistics between two datasets.""" +from anesthetic.samples import Samples +from scipy.stats import chi2 + + +def stats(A, B, AB, nsamples=None, beta=None): # noqa: D301 + r"""Compute tension statistics between two samples. + + Using nested sampling we can compute: + + - ``logR``: R statistic for dataset consistency + + .. math:: + \log R = \log Z_{AB} - \log Z_{A} - \log Z_{B} + + - ``logI``: information ratio + + .. math:: + \log I = D_{KL}^{A} + D_{KL}^{B} - D_{KL}^{AB} + + - ``logS``: suspiciousness + + .. math:: + \log S = \log L_{AB} - \log L_{A} - \log L_{B} + + - ``d_G``: Gaussian model dimensionality of shared constrained parameters + + .. math:: + d = d_{A} + d_{B} - d_{AB} + + - ``p``: p-value for the tension between two samples + + .. math:: + p = \int_{d-2\log{S}}^{\infty} \chi^2_d(x) dx + + Parameters + ---------- + A : :class:`anesthetic.samples.Samples` + :class:`anesthetic.samples.NestedSamples` object from a sampling run + using only dataset A. + Alternatively, you can pass a precomputed stats object returned from + :meth:`anesthetic.samples.NestedSamples.stats`. + + B : :class:`anesthetic.samples.Samples` + :class:`anesthetic.samples.NestedSamples` object from a sampling run + using only dataset B. + Alternatively, you can pass the precomputed stats object returned from + :meth:`anesthetic.samples.NestedSamples.stats`. + + AB : :class:`anesthetic.samples.Samples` + :class:`anesthetic.samples.NestedSamples` object from a sampling run + using both datasets A and B jointly. + Alternatively, you can pass the precomputed stats object returned from + :meth:`anesthetic.samples.NestedSamples.stats`. + + nsamples : int, optional + - If nsamples is not supplied, calculate mean value. + - If nsamples is integer, draw nsamples from the distribution of + values inferred by nested sampling. + + beta : float, array-like, default=1 + Inverse temperature(s) `beta=1/kT`. + + Returns + ------- + samples : :class:`anesthetic.samples.Samples` + DataFrame containing the following tension statistics in columns: + ['logR', 'logI', 'logS', 'd_G', 'p'] + """ + columns = ['logZ', 'D_KL', 'logL_P', 'd_G'] + if set(columns).issubset(A.drop_labels().columns): + statsA = A + else: + statsA = A.stats(nsamples=nsamples, beta=beta) + if set(columns).issubset(B.drop_labels().columns): + statsB = B + else: + statsB = B.stats(nsamples=nsamples, beta=beta) + if set(columns).issubset(AB.drop_labels().columns): + statsAB = AB + else: + statsAB = AB.stats(nsamples=nsamples, beta=beta) + if statsA.shape != statsAB.shape or statsB.shape != statsAB.shape: + raise ValueError("Shapes of stats_A, stats_B, and stats_AB do not " + "match. Make sure to pass consistent `nsamples`.") + + samples = Samples(index=statsA.index) + + samples['logR'] = statsAB['logZ'] - statsA['logZ'] - statsB['logZ'] + samples.set_label('logR', r'$\ln\mathcal{R}$') + + samples['logI'] = statsA['D_KL'] + statsB['D_KL'] - statsAB['D_KL'] + samples.set_label('logI', r'$\ln\mathcal{I}$') + + samples['logS'] = statsAB['logL_P'] - statsA['logL_P'] - statsB['logL_P'] + samples.set_label('logS', r'$\ln\mathcal{S}$') + + samples['d_G'] = statsA['d_G'] + statsB['d_G'] - statsAB['d_G'] + samples.set_label('d_G', r'$d_\mathrm{G}$') + + p = chi2.sf(samples['d_G'] - 2 * samples['logS'], df=samples['d_G']) + samples['p'] = p + samples.set_label('p', '$p$') + return samples diff --git a/docs/source/anesthetic.read.rst b/docs/source/anesthetic.read.rst index 518686da..cf488135 100644 --- a/docs/source/anesthetic.read.rst +++ b/docs/source/anesthetic.read.rst @@ -25,6 +25,15 @@ anesthetic.read.cobaya module :show-inheritance: +anesthetic.read.csv module +-------------------------- + +.. automodule:: anesthetic.read.csv + :members: + :undoc-members: + :show-inheritance: + + anesthetic.read.getdist module ------------------------------ diff --git a/docs/source/anesthetic.rst b/docs/source/anesthetic.rst index b256f3bb..266ccbe1 100644 --- a/docs/source/anesthetic.rst +++ b/docs/source/anesthetic.rst @@ -72,6 +72,7 @@ anesthetic.samples module .. automodule:: anesthetic.samples :members: :undoc-members: + :show-inheritance: anesthetic.scripts module @@ -83,6 +84,15 @@ anesthetic.scripts module :show-inheritance: +anesthetic.tension module +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: anesthetic.tension + :members: + :undoc-members: + :show-inheritance: + + anesthetic.testing module ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tests/test_tension.py b/tests/test_tension.py new file mode 100644 index 00000000..751fa28e --- /dev/null +++ b/tests/test_tension.py @@ -0,0 +1,126 @@ +from pytest import approx, raises +from anesthetic.examples.perfect_ns import correlated_gaussian +import numpy as np +from numpy.linalg import inv, slogdet +from pandas.testing import assert_series_equal +from anesthetic.tension import stats + + +def test_tension_stats_compatible_gaussian(): + np.random.seed(42) + d = 3 + nlive = 10 * d + bounds = [[-1, 1], [0, 3], [0, 1]] + logV = np.log(np.diff(bounds).ravel().prod()) + + muA = np.array([0.1, 0.3, 0.5]) + covA = 0.01 * np.array([[0.010, 0.009, 0.0], + [0.009, 0.010, 0.0], + [0.000, 0.000, 0.1]]) + invcovA = inv(covA) + logLmaxA = 0 + samplesA = correlated_gaussian(nlive, muA, covA, bounds, logLmaxA) + + muB = np.array([0.1, 0.3, 0.5]) + covB = 0.01 * np.array([[+0.010, -0.009, +0.010], + [-0.009, +0.010, -0.001], + [+0.010, -0.001, +0.100]]) + invcovB = inv(covB) + logLmaxB = 0 + samplesB = correlated_gaussian(nlive, muB, covB, bounds, logLmaxB) + + covAB = inv(invcovA + invcovB) + muAB = covAB @ (invcovA @ muA + invcovB @ muB) + dmuAB = muA - muB + dmu_cov_dmu_AB = dmuAB @ invcovA @ covAB @ invcovB @ dmuAB + logLmaxAB = logLmaxA + logLmaxB - dmu_cov_dmu_AB / 2 + samplesAB = correlated_gaussian(nlive, muAB, covAB, bounds, logLmaxAB) + + nsamples = 10 + beta = 1 + s = stats(samplesA, samplesB, samplesAB, nsamples, beta) + + logR_exact = logV - dmu_cov_dmu_AB/2 - slogdet(2*np.pi*(covA+covB))[1]/2 + assert s.logR.mean() == approx(logR_exact, abs=3*s.logR.std()) + + logS_exact = d / 2 - dmu_cov_dmu_AB / 2 + assert s.logS.mean() == approx(logS_exact, abs=3*s.logS.std()) + + logI_exact = logV - d / 2 - slogdet(2*np.pi*(covA+covB))[1] / 2 + assert s.logI.mean() == approx(logI_exact, abs=3*s.logI.std()) + + assert s.logS.mean() == approx(s.logR.mean() - s.logI.mean(), + abs=3*s.logS.std()) + + assert s.get_labels().tolist() == ([r'$\ln\mathcal{R}$', + r'$\ln\mathcal{I}$', + r'$\ln\mathcal{S}$', + r'$d_\mathrm{G}$', + r'$p$']) + + with raises(ValueError): + stats(samplesA.stats(nsamples=5), samplesB, samplesAB, nsamples) + s2 = stats(samplesA.stats(nsamples=nsamples), + samplesB.stats(nsamples=nsamples), + samplesAB.stats(nsamples=nsamples)) + assert_series_equal(s2.mean(), s.mean(), atol=s2.std().max()) + + +def test_tension_stats_incompatible_gaussian(): + np.random.seed(42) + d = 3 + nlive = 10 * d + bounds = [[-1, 1], [0, 3], [0, 1]] + logV = np.log(np.diff(bounds).ravel().prod()) + + muA = np.array([0.1, 0.3, 0.5]) + covA = 0.01 * np.array([[0.010, 0.009, 0.0], + [0.009, 0.010, 0.0], + [0.000, 0.000, 0.1]]) + invcovA = inv(covA) + logLmaxA = 0 + samplesA = correlated_gaussian(nlive, muA, covA, bounds, logLmaxA) + + muB = np.array([0.15, 0.25, 0.45]) + covB = 0.01 * np.array([[+0.010, -0.009, +0.010], + [-0.009, +0.010, -0.001], + [+0.010, -0.001, +0.100]]) + invcovB = inv(covB) + logLmaxB = 0 + samplesB = correlated_gaussian(nlive, muB, covB, bounds, logLmaxB) + + covAB = inv(invcovA + invcovB) + muAB = covAB @ (invcovA @ muA + invcovB @ muB) + dmuAB = muA - muB + dmu_cov_dmu_AB = dmuAB @ invcovA @ covAB @ invcovB @ dmuAB + logLmaxAB = logLmaxA + logLmaxB - dmu_cov_dmu_AB / 2 + samplesAB = correlated_gaussian(nlive, muAB, covAB, bounds, logLmaxAB) + + nsamples = 10 + beta = 1 + s = stats(samplesA, samplesB, samplesAB, nsamples, beta) + + logR_exact = logV - dmu_cov_dmu_AB/2 - slogdet(2*np.pi*(covA+covB))[1]/2 + assert s.logR.mean() == approx(logR_exact, abs=3*s.logR.std()) + + logS_exact = d / 2 - dmu_cov_dmu_AB / 2 + assert s.logS.mean() == approx(logS_exact, abs=3*s.logS.std()) + + logI_exact = logV - d / 2 - slogdet(2*np.pi*(covA+covB))[1] / 2 + assert s.logI.mean() == approx(logI_exact, abs=3*s.logI.std()) + + assert s.logS.mean() == approx(s.logR.mean() - s.logI.mean(), + abs=3*s.logS.std()) + + assert s.get_labels().tolist() == ([r'$\ln\mathcal{R}$', + r'$\ln\mathcal{I}$', + r'$\ln\mathcal{S}$', + r'$d_\mathrm{G}$', + r'$p$']) + + with raises(ValueError): + stats(samplesA.stats(nsamples=5), samplesB, samplesAB, nsamples) + s2 = stats(samplesA.stats(nsamples=nsamples), + samplesB.stats(nsamples=nsamples), + samplesAB.stats(nsamples=nsamples)) + assert_series_equal(s2.mean(), s.mean(), atol=s2.std().max())