statistics_utils.py
import numpy
from scipy.stats import gaussian_kde


def ks_statistic(data1, data2):
    """Calculate the Kolmogorov-Smirnov statistic to compare two sets of data.

    The empirical cumulative distribution function for each set of
    1-dimensional data points is calculated. The K-S statistic is the
    maximum absolute distance between the two cumulative distribution
    functions.

    Parameters:
        data1, data2: 1-dimensional lists or arrays of data points to compare.
    """
    x1 = numpy.sort(data1)
    x2 = numpy.sort(data2)
    x = numpy.sort(numpy.concatenate([x1, x2]))
    # Empirical CDF for each dataset: a step function that rises by
    # 1/len(data) at each observed data point.
    y1 = numpy.linspace(0, 1, len(x1) + 1)[1:]
    y2 = numpy.linspace(0, 1, len(x2) + 1)[1:]
    # Linearly interpolate both CDFs onto a common set of x-values.
    cdf1 = numpy.interp(x, x1, y1, left=0)
    cdf2 = numpy.interp(x, x2, y2, left=0)
    return abs(cdf1 - cdf2).max()
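
# Illustrative sanity check (added; not part of the original module):
# identical samples give a K-S statistic of 0, while completely separated
# samples give 1.
#
#     ks_statistic([1, 2, 3], [1, 2, 3])  # -> 0.0
#     ks_statistic([1, 2, 3], [4, 5, 6])  # -> 1.0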


def ks_statistic_kde(data1, data2, num_points=None):
    """Calculate the Kolmogorov-Smirnov statistic to compare two sets of data.

    Kernel Density Estimation is used to estimate the distribution of each
    set of 1-dimensional data points. From this, the K-S statistic is
    calculated: the maximum absolute distance between the two cumulative
    distribution functions.

    Parameters:
        data1, data2: 1-dimensional lists or arrays of data points to compare.
        num_points: number of points to evaluate the density along.
    """
    xs, kd1, kd2 = _get_estimators_and_xs(data1, data2, num_points)
    with numpy.errstate(under='ignore'):
        cdf1 = numpy.array([kd1.integrate_box_1d(-numpy.inf, x) for x in xs])
        cdf2 = numpy.array([kd2.integrate_box_1d(-numpy.inf, x) for x in xs])
    return abs(cdf1 - cdf2).max()
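
# Illustrative usage (added; not part of the original module): the
# KDE-smoothed statistic approaches the empirical one for large samples,
# but the exact value depends on the Gaussian bandwidth that scipy selects.
#
#     ks_statistic_kde(numpy.random.normal(0, 1, 500),
#                      numpy.random.normal(1, 1, 500))  # roughly 0.3-0.4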


def js_metric(data1, data2, num_points=None):
    """Calculate the Jensen-Shannon metric to compare two sets of data.

    Kernel Density Estimation is used to estimate the distribution of each
    set of 1-dimensional data points. From this, the J-S metric (square root
    of the J-S divergence) is calculated.

    Note: KDE will often underestimate the probability at the far tails of
    the distribution (outside of where supported by the data), which can
    lead to overestimates of K-L divergence (and hence J-S divergence) for
    highly non-overlapping datasets.

    Parameters:
        data1, data2: 1-dimensional lists or arrays of data points to compare.
        num_points: number of points to evaluate the density along.
    """
    xs, p1, p2 = _get_point_estimates(data1, data2, num_points)
    m = (p1 + p2) / 2
    return ((_kl_divergence(xs, p1, m) + _kl_divergence(xs, p2, m)) / 2) ** 0.5
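
# Added note: because _kl_divergence works in nats, the J-S divergence is at
# most ln(2), so js_metric is bounded above by sqrt(ln 2) ~= 0.83; identical
# samples give a value near 0.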


def kl_divergence(data1, data2, num_points=None):
    """Calculate the Kullback-Leibler divergence between two sets of data.

    Kernel Density Estimation is used to estimate the distribution of each
    set of 1-dimensional data points. From this, the K-L divergence is
    calculated.

    Note: KDE will often underestimate the probability at the far tails of
    the distribution (outside of where supported by the data), which can
    lead to overestimates of K-L divergence for highly non-overlapping
    datasets.

    Parameters:
        data1, data2: 1-dimensional lists or arrays of data points to compare.
        num_points: number of points to evaluate the density along.
    """
    xs, p1, p2 = _get_point_estimates(data1, data2, num_points)
    return _kl_divergence(xs, p1, p2)
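
# Added note: K-L divergence is not symmetric, so kl_divergence(data1, data2)
# and kl_divergence(data2, data1) generally differ; js_metric above is the
# symmetric alternative.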


def _get_kd_estimator_and_xs(data, num_points):
    """Get a KDE estimator for a given dataset, and generate a good set of
    points to sample the density at."""
    data = numpy.asarray(data, dtype=float)
    kd_estimator = gaussian_kde(data)
    # Sample points from the estimated distribution itself, so that the
    # evaluation points concentrate where the density is highest.
    data_samples = kd_estimator.resample(num_points // 2)[0]
    xs = numpy.sort(data_samples)
    return kd_estimator, xs


def _get_estimators_and_xs(data1, data2, num_points):
    """Get KDE estimators for two different datasets and a set of points
    to evaluate both distributions on."""
    if num_points is None:
        num_points = min(5000, (len(data1) + len(data2)) // 2)
    kd1, xs1 = _get_kd_estimator_and_xs(data1, num_points // 2)
    kd2, xs2 = _get_kd_estimator_and_xs(data2, num_points // 2)
    xs = numpy.sort(numpy.concatenate([xs1, xs2]))
    return xs, kd1, kd2


def _get_point_estimates(data1, data2, num_points):
    """Get point estimates for KDE distributions for two different datasets."""
    xs, kd1, kd2 = _get_estimators_and_xs(data1, data2, num_points)
    with numpy.errstate(under='ignore'):
        p1 = kd1(xs)
        p2 = kd2(xs)
    return xs, p1, p2


def _kl_divergence(xs, p1, p2):
    """Calculate the Kullback-Leibler divergence of p1 and p2, which are
    assumed to be values of two different density functions at the given
    positions xs. Return divergence in nats."""
    with numpy.errstate(divide='ignore', invalid='ignore'):
        kl = p1 * (numpy.log(p1) - numpy.log(p2))
        kl[~numpy.isfinite(kl)] = 0  # small numbers in p1 or p2 can cause NaN/-inf, etc.
    # numpy.trapz was renamed to numpy.trapezoid in NumPy 2.0; support both.
    trapezoid = getattr(numpy, 'trapezoid', None)
    if trapezoid is None:
        trapezoid = numpy.trapz
    return trapezoid(kl, x=xs)  # integrate the curve
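

if __name__ == '__main__':
    # Minimal usage sketch (added; not part of the original module). The
    # sample sizes, seed, and distribution parameters are arbitrary choices
    # for demonstration, and the printed values will vary accordingly.
    rng = numpy.random.default_rng(0)
    sample_a = rng.normal(0.0, 1.0, 500)
    sample_b = rng.normal(0.5, 1.2, 500)
    print('K-S (empirical):', ks_statistic(sample_a, sample_b))
    print('K-S (KDE):', ks_statistic_kde(sample_a, sample_b))
    print('J-S metric:', js_metric(sample_a, sample_b))
    print('K-L divergence:', kl_divergence(sample_a, sample_b))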