from __future__ import division
import numpy as np

__author__ = "Eric Chiang"
__email__ = "eric[at]yhathq.com"

| 7 | +""" |
| 8 | +
|
| 9 | +Measurements inspired by Philip Tetlock's "Expert Political Judgment" |
| 10 | +
|
| 11 | +Equations take from Yaniv, Yates, & Smith (1991): |
| 12 | + "Measures of Descrimination Skill in Probabilistic Judgement" |
| 13 | +
|
| 14 | +""" |
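
# In the notation of that literature (sketched here from the code below, not
# quoted from the paper): with N predictions grouped into bins j = 1..J, where
# bin j holds n_j predictions with mean judged probability p_j and observed
# outcome frequency f_j, and f is the overall base rate,
#
#     calibration     C = (1/N) * sum_j n_j * (p_j - f_j)**2
#     discrimination  D = (1/N) * sum_j n_j * (f_j - f)**2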


def calibration(prob, outcome, n_bins=10):
    """Calibration measurement for a set of predictions.

    When predicting events at a given probability, how far is the
    frequency of positive outcomes from that probability?
    NOTE: Lower scores are better

    prob: array_like, float
        Probability estimates for a set of events

    outcome: array_like, bool
        Whether the predicted event occurred

    n_bins: int
        Number of judgement categories to perform the calculation over.
        Predictions are binned based on probability, since "discrete"
        probabilities aren't required.

    """
    prob = np.array(prob)
    outcome = np.array(outcome)

    c = 0.0
    # Construct equally spaced bin edges over [0, 1]
    judgement_bins = np.arange(n_bins + 1) / n_bins
    # Which bin is each prediction in?
    bin_num = np.digitize(prob, judgement_bins)
    for j_bin in np.unique(bin_num):
        # Mask of the predictions falling in this bin
        in_bin = bin_num == j_bin
        # Predicted probability taken as average of preds in bin
        predicted_prob = np.mean(prob[in_bin])
        # How often did events in this bin actually happen?
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between predicted and true, weighted by num of obs
        c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2)
    return c / len(prob)
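
# Sanity check (an invented example, not from the original commit): ten events
# forecast at 0.1, of which exactly one occurs, are perfectly calibrated, so
# calibration([0.1] * 10, [1] + [0] * 9) returns 0.0.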


def discrimination(prob, outcome, n_bins=10):
    """Discrimination measurement for a set of predictions.

    For each judgement category, how far from the base probability
    is the true frequency of that bin?
    NOTE: Higher scores are better

    prob: array_like, float
        Probability estimates for a set of events

    outcome: array_like, bool
        Whether the predicted event occurred

    n_bins: int
        Number of judgement categories to perform the calculation over.
        Predictions are binned based on probability, since "discrete"
        probabilities aren't required.

    """
    prob = np.array(prob)
    outcome = np.array(outcome)

    d = 0.0
    # Base frequency of outcomes
    base_prob = np.mean(outcome)
    # Construct equally spaced bin edges over [0, 1]
    judgement_bins = np.arange(n_bins + 1) / n_bins
    # Which bin is each prediction in?
    bin_num = np.digitize(prob, judgement_bins)
    for j_bin in np.unique(bin_num):
        # Mask of the predictions falling in this bin
        in_bin = bin_num == j_bin
        # How often did events in this bin actually happen?
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between true and base, weighted by num of obs
        d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2)
    return d / len(prob)
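

if __name__ == "__main__":
    # Minimal usage sketch with made-up forecasts (these numbers are
    # illustrative, not part of the original commit). Well-ordered forecasts
    # should show a low calibration score and non-zero discrimination.
    probs = [0.1, 0.2, 0.3, 0.7, 0.8, 0.9]
    outcomes = [False, False, True, True, True, True]
    print("calibration:    %.4f" % calibration(probs, outcomes))
    print("discrimination: %.4f" % discrimination(probs, outcomes))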