Skip to content

Commit c7f73e5

Browse files
authored
add data-science-ipython-notebooks
1 parent a9bc9de commit c7f73e5

18 files changed

+10052
-0
lines changed

analyses/churn.ipynb

Lines changed: 1171 additions & 0 deletions
Large diffs are not rendered by default.

analyses/churn_measurements.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
from __future__ import division
2+
import numpy as np
3+
4+
__author__ = "Eric Chiang"
5+
__email__ = "eric[at]yhathq.com"
6+
7+
"""
8+
9+
Measurements inspired by Philip Tetlock's "Expert Political Judgment"
10+
11+
Equations taken from Yaniv, Yates, & Smith (1991):
12+
"Measures of Discrimination Skill in Probabilistic Judgment"
13+
14+
"""
15+
16+
17+
def calibration(prob, outcome, n_bins=10):
    """Calibration measurement for a set of predictions.

    When predicting events at a given probability, how far is the
    frequency of positive outcomes from that probability?
    NOTE: Lower scores are better.

    Parameters
    ----------
    prob : array_like, float
        Probability estimates for a set of events.
    outcome : array_like, bool
        Whether each predicted event occurred.
    n_bins : int
        Number of judgement categories to perform the calculation over.
        Predictions are binned based on probability, since "discrete"
        probabilities aren't required.

    Returns
    -------
    float
        Weighted mean squared distance between each bin's average
        predicted probability and its observed outcome frequency.

    Raises
    ------
    ValueError
        If `prob` is empty (the original code raised ZeroDivisionError).
    """
    prob = np.asarray(prob)
    outcome = np.asarray(outcome)
    if prob.size == 0:
        raise ValueError("prob must contain at least one prediction")

    c = 0.0
    # Bin edges: n_bins equal-width judgement categories on [0, 1].
    judgement_bins = np.arange(n_bins + 1) / n_bins
    # Which bin is each prediction in?
    bin_num = np.digitize(prob, judgement_bins)
    for j_bin in np.unique(bin_num):
        # Boolean mask of the predictions that fall in this bin.
        in_bin = bin_num == j_bin
        # Predicted probability taken as the average of preds in the bin.
        predicted_prob = np.mean(prob[in_bin])
        # How often did events in this bin actually happen?
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between predicted and true, weighted by bin size.
        c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2)
    return c / len(prob)
54+
55+
def discrimination(prob,outcome,n_bins=10):
56+
"""Discrimination measurement for a set of predictions.
57+
58+
For each judgement category, how far from the base probability
59+
is the true frequency of that bin?
60+
NOTE: High scores are better
61+
62+
prob: array_like, float
63+
Probability estimates for a set of events
64+
65+
outcome: array_like, bool
66+
If event predicted occurred
67+
68+
n_bins: int
69+
Number of judgement categories to prefrom calculation over.
70+
Prediction are binned based on probability, since "descrete"
71+
probabilities aren't required.
72+
73+
"""
74+
prob = np.array(prob)
75+
outcome = np.array(outcome)
76+
77+
d = 0.0
78+
# Base frequency of outcomes
79+
base_prob = np.mean(outcome)
80+
# Construct bins
81+
judgement_bins = np.arange(n_bins + 1) / n_bins
82+
# Which bin is each prediction in?
83+
bin_num = np.digitize(prob,judgement_bins)
84+
for j_bin in np.unique(bin_num):
85+
in_bin = bin_num == j_bin
86+
true_bin_prob = np.mean(outcome[in_bin])
87+
# Squared distance between true and base times num of obs
88+
d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2)
89+
return d / len(prob)

0 commit comments

Comments
 (0)