diff --git a/knitr/pool-binary-trials/hier-logit-centered.stan b/knitr/pool-binary-trials/hier-logit-centered.stan index 94df55a1..848174ac 100644 --- a/knitr/pool-binary-trials/hier-logit-centered.stan +++ b/knitr/pool-binary-trials/hier-logit-centered.stan @@ -15,9 +15,5 @@ model { y ~ binomial_logit(K, alpha); // likelihood } generated quantities { - vector[N] theta; // chance of success - - for (n in 1 : N) { - theta[n] = inv_logit(alpha[n]); - } + vector[N] theta = inv_logit(alpha); } diff --git a/knitr/pool-binary-trials/hier-logit.stan b/knitr/pool-binary-trials/hier-logit.stan index f03a1621..78a85c14 100644 --- a/knitr/pool-binary-trials/hier-logit.stan +++ b/knitr/pool-binary-trials/hier-logit.stan @@ -1,106 +1,22 @@ -data { - int N; // items - array[N] int K; // initial trials - array[N] int y; // initial successes - - array[N] int K_new; // new trials - array[N] int y_new; // new successes -} -transformed data { - real min_y; // minimum successes - real max_y; // maximum successes - real mean_y; // sample mean successes - real sd_y; // sample std dev successes - - min_y = min(y); - max_y = max(y); - mean_y = mean(to_vector(y)); - sd_y = sd(to_vector(y)); -} +#include data-blocks.stan parameters { real mu; // population mean of success log-odds real sigma; // population sd of success log-odds - vector[N] alpha_std; // success log-odds (standardized) + vector[N] alpha_std; // success log-odds (standardized) } model { mu ~ normal(-1, 1); // hyperprior sigma ~ normal(0, 1); // hyperprior - alpha_std ~ normal(0, 1); // prior (hierarchical) - y ~ binomial_logit(K, mu + sigma * alpha_std); // likelihood + alpha_std ~ normal(mu, sigma); // prior (hierarchical) + y ~ binomial_logit(K, alpha_std); // likelihood } generated quantities { - vector[N] theta; // chance of success - - real log_p_new; // posterior predictive log density remaining trials - - array[N] int z; // posterior prediction remaining trials - - int some_ability_gt_350; // Pr[some theta > 0.35] - array[N] int avg_gt_400; // Pr[season avg of n] >= 0.400 - array[N] int ability_gt_400; // Pr[chance-of-success of n] >= 0.400 - - array[N] int rnk; // rank of player n - array[N] int is_best; // Pr[player n highest chance of success] - - array[N] int y_rep; // replications for existing items + vector[N] theta = inv_logit(alpha_std); + #include gq-postpred.stan + #include gq-ranking.stan + array[N] int y_pop_rep; // replications for simulated items - - real min_y_rep; // posterior predictive min replicated successes - real max_y_rep; // posterior predictive max replicated successes - real mean_y_rep; // posterior predictive sample mean replicated successes - real sd_y_rep; // posterior predictive sample std dev replicated successes - - int p_min; // posterior predictive p-values - int p_max; - int p_mean; - int p_sd; - - for (n in 1 : N) { - theta[n] = inv_logit(mu + sigma * alpha_std[n]); - } - - log_p_new = 0; - for (n in 1 : N) { - log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]); - } - - for (n in 1 : N) { - z[n] = binomial_rng(K_new[n], theta[n]); - } - - some_ability_gt_350 = max(theta) > 0.35; - for (n in 1 : N) { - avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400; - } - for (n in 1 : N) { - ability_gt_400[n] = theta[n] > 0.400; - } - - { - array[N] int dsc; - dsc = sort_indices_desc(theta); - for (n in 1 : N) { - rnk[dsc[n]] = n; - } - } - for (n in 1 : N) { - is_best[n] = rnk[n] == 1; - } - - for (n in 1 : N) { - y_rep[n] = binomial_rng(K[n], theta[n]); - } for (n in 1 : N) { 
y_pop_rep[n] = binomial_rng(K[n], inv_logit(normal_rng(mu, sigma))); } - - min_y_rep = min(y_rep); - max_y_rep = max(y_rep); - mean_y_rep = mean(to_vector(y_rep)); - sd_y_rep = sd(to_vector(y_rep)); - - p_min = min_y_rep >= min_y; - p_max = max_y_rep >= max_y; - p_mean = mean_y_rep >= mean_y; - p_sd = sd_y_rep >= sd_y; } diff --git a/knitr/pool-binary-trials/hier.stan b/knitr/pool-binary-trials/hier.stan index 6a3f75f0..71b88c89 100644 --- a/knitr/pool-binary-trials/hier.stan +++ b/knitr/pool-binary-trials/hier.stan @@ -1,22 +1,4 @@ -data { - int N; // items - array[N] int K; // initial trials - array[N] int y; // initial successes - - array[N] int K_new; // new trials - array[N] int y_new; // new successes -} -transformed data { - real min_y; // minimum successes - real max_y; // maximum successes - real mean_y; // sample mean successes - real sd_y; // sample std dev successes - - min_y = min(y); - max_y = max(y); - mean_y = mean(to_vector(y)); - sd_y = sd(to_vector(y)); -} +#include data-blocks.stan parameters { real phi; // population chance of success real kappa; // population concentration @@ -28,73 +10,13 @@ model { y ~ binomial(K, theta); // likelihood } generated quantities { - real log_p_new; // posterior predictive log density remaining trials - - array[N] int z; // posterior prediction remaining trials - - int some_ability_gt_350; // Pr[some theta > 0.35] - array[N] int avg_gt_400; // Pr[season avg of n] >= 0.400 - array[N] int ability_gt_400; // Pr[chance-of-success of n] >= 0.400 - - array[N] int rnk; // rank of player n - array[N] int is_best; // Pr[player n highest chance of success] - - array[N] int y_rep; // replications for existing items - array[N] int y_pop_rep; // replications for simulated items - - real min_y_rep; // posterior predictive min replicated successes - real max_y_rep; // posterior predictive max replicated successes - real mean_y_rep; // posterior predictive sample mean replicated successes - real sd_y_rep; // posterior predictive sample std dev replicated successes - - int p_min; // posterior predictive p-values - int p_max; - int p_mean; - int p_sd; - - log_p_new = 0; - for (n in 1 : N) { - log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]); - } - - for (n in 1 : N) { - z[n] = binomial_rng(K_new[n], theta[n]); - } - - some_ability_gt_350 = max(theta) > 0.35; - for (n in 1 : N) { - avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400; - } - for (n in 1 : N) { - ability_gt_400[n] = theta[n] > 0.400; - } - - { - array[N] int dsc; - dsc = sort_indices_desc(theta); - for (n in 1 : N) { - rnk[dsc[n]] = n; - } - } - for (n in 1 : N) { - is_best[n] = rnk[n] == 1; - } - - for (n in 1 : N) { - y_rep[n] = binomial_rng(K[n], theta[n]); - } + #include gq-postpred.stan + #include gq-ranking.stan + + // replications for simulated items + array[N] int y_pop_rep; for (n in 1 : N) { y_pop_rep[n] = binomial_rng(K[n], beta_rng(phi * kappa, (1 - phi) * kappa)); } - - min_y_rep = min(y_rep); - max_y_rep = max(y_rep); - mean_y_rep = mean(to_vector(y_rep)); - sd_y_rep = sd(to_vector(y_rep)); - - p_min = min_y_rep >= min_y; - p_max = max_y_rep >= max_y; - p_mean = mean_y_rep >= mean_y; - p_sd = sd_y_rep >= sd_y; } diff --git a/knitr/pool-binary-trials/include/data-blocks.stan b/knitr/pool-binary-trials/include/data-blocks.stan new file mode 100644 index 00000000..865296e3 --- /dev/null +++ b/knitr/pool-binary-trials/include/data-blocks.stan @@ -0,0 +1,16 @@ +/** observed outcome y for N items in K, K_new binary trials **/ +data { + int N; // items 
+ array[N] int K; // initial trials + array[N] int y; // initial successes + + array[N] int K_new; // new trials + array[N] int y_new; // new successes +} +transformed data { + // summary stats for observed data + real min_y = min(y); + real max_y = max(y); + real mean_y = mean(to_vector(y)); + real sd_y = sd(to_vector(y)); +} diff --git a/knitr/pool-binary-trials/include/gq-postpred.stan b/knitr/pool-binary-trials/include/gq-postpred.stan new file mode 100644 index 00000000..c245c6af --- /dev/null +++ b/knitr/pool-binary-trials/include/gq-postpred.stan @@ -0,0 +1,46 @@ +/** include in generated quantities block of model with parameter + vector[N] theta; // chance of success + and data: N, K, y, K_new, y_new, min_y, max_y, mean_y, sd_y +*/ + + // posterior predictive log density remaining trials + real log_p_new = 0; + for (n in 1 : N) { + log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]); + } + + // posterior predictions on new data + array[N] int z = binomial_rng(K_new, theta); + + // Pr[some theta > 0.35] + int some_ability_gt_350 = max(theta) > 0.35; + // Pr[some player ability > 400] + array[N] int ability_gt_400; + for (n in 1 : N) { + ability_gt_400[n] = theta[n] > 0.400; + } + // Pr[some player season average > 400] + array[N] int avg_gt_400; + for (n in 1 : N) { + // y[n] - observed, z[n] - expected hits in rest of season + avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400; + } + + // replications for existing items + array[N] int y_rep = binomial_rng(K, theta); + + // posterior predictive min replicated successes, test stat p_val + real min_y_rep = min(y_rep); + int p_min = min_y_rep >= min_y; + + // posterior predictive max replicated successes, test stat p_val + real max_y_rep = max(y_rep); + int p_max = max_y_rep >= max_y; + + // posterior predictive sample mean replicated successes, test stat p_val + real mean_y_rep = mean(to_vector(y_rep)); + int p_mean = mean_y_rep >= mean_y; + + // posterior predictive sample std dev replicated successes, test stat p_val + real sd_y_rep = sd(to_vector(y_rep)); + int p_sd = sd_y_rep >= sd_y; diff --git a/knitr/pool-binary-trials/include/gq-ranking.stan b/knitr/pool-binary-trials/include/gq-ranking.stan new file mode 100644 index 00000000..6e09a810 --- /dev/null +++ b/knitr/pool-binary-trials/include/gq-ranking.stan @@ -0,0 +1,16 @@ +/** include in generated quantities block of model with parameter + vector[N] theta; // chance of success + and data N (number of observations) +*/ + array[N] int rnk; // rank of player n + { + array[N] int dsc; + dsc = sort_indices_desc(theta); + for (n in 1 : N) { + rnk[dsc[n]] = n; + } + } + array[N] int is_best; // Pr[player n highest chance of success] + for (n in 1 : N) { + is_best[n] = rnk[n] == 1; + } diff --git a/knitr/pool-binary-trials/no-pool.stan b/knitr/pool-binary-trials/no-pool.stan index d49e5a78..8ea04cb2 100644 --- a/knitr/pool-binary-trials/no-pool.stan +++ b/knitr/pool-binary-trials/no-pool.stan @@ -1,22 +1,4 @@ -data { - int N; // items - array[N] int K; // initial trials - array[N] int y; // initial successes - - array[N] int K_new; // new trials - array[N] int y_new; // new successes -} -transformed data { - real min_y; // minimum successes - real max_y; // maximum successes - real mean_y; // sample mean successes - real sd_y; // sample std dev successes - - min_y = min(y); - max_y = max(y); - mean_y = mean(to_vector(y)); - sd_y = sd(to_vector(y)); -} +#include data-blocks.stan parameters { vector[N] theta; // chance of success } @@ -24,68 +6,6 @@ model 
{ y ~ binomial(K, theta); // likelihood } generated quantities { - real log_p_new; // posterior predictive log density remaining trials - - array[N] int z; // posterior prediction remaining trials - - int some_ability_gt_350; // Pr[some theta > 0.35] - array[N] int avg_gt_400; // Pr[season avg of n] >= 0.400 - array[N] int ability_gt_400; // Pr[chance-of-success of n] >= 0.400 - - array[N] int rnk; // rank of player n - array[N] int is_best; // Pr[player n highest chance of success] - - array[N] int y_rep; // replications for existing items - - real min_y_rep; // posterior predictive min replicated successes - real max_y_rep; // posterior predictive max replicated successes - real mean_y_rep; // posterior predictive sample mean replicated successes - real sd_y_rep; // posterior predictive sample std dev replicated successes - - int p_min; // posterior p-val for min test stat - int p_max; // posterior p-val for max test stat - int p_mean; // posterior p-val for sample mean test stat - int p_sd; // posterior p-val for smaple std dev test stat - - log_p_new = 0; - for (n in 1 : N) { - log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]); - } - - for (n in 1 : N) { - z[n] = binomial_rng(K_new[n], theta[n]); - } - - some_ability_gt_350 = max(theta) > 0.35; - for (n in 1 : N) { - avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400; - } - for (n in 1 : N) { - ability_gt_400[n] = theta[n] > 0.400; - } - - { - array[N] int dsc; - dsc = sort_indices_desc(theta); - for (n in 1 : N) { - rnk[dsc[n]] = n; - } - } - for (n in 1 : N) { - is_best[n] = rnk[n] == 1; - } - - for (n in 1 : N) { - y_rep[n] = binomial_rng(K[n], theta[n]); - } - - min_y_rep = min(y_rep); - max_y_rep = max(y_rep); - mean_y_rep = mean(to_vector(y_rep)); - sd_y_rep = sd(to_vector(y_rep)); - - p_min = min_y_rep >= min_y; - p_max = max_y_rep >= max_y; - p_mean = mean_y_rep >= mean_y; - p_sd = sd_y_rep >= sd_y; + #include gq-postpred.stan + #include gq-ranking.stan } diff --git a/knitr/pool-binary-trials/pool-no-pool.Rmd b/knitr/pool-binary-trials/pool-no-pool.Rmd index eca5897b..211b62bd 100644 --- a/knitr/pool-binary-trials/pool-no-pool.Rmd +++ b/knitr/pool-binary-trials/pool-no-pool.Rmd @@ -2,29 +2,81 @@ title: "Hierarchical Partial Pooling for Repeated Binary Trials" author: "Bob Carpenter" date: "29 January 2016" -output: - html_document: +output: + html_document: theme: readable + pdf_document: default --- +```{css style settings, echo = FALSE} +blockquote { + padding: 10px 20px; + margin: 0 0 20px; + font-size: 0.9em; + border-left: 5px solid #eee; +} +``` + +*updated for CmdStanR by Mitzi Morris, 19 January 2022* + + ### Abstract This note illustrates the effects on posterior inference of pooling data (aka sharing strength) across items for repeated binary trial data. It provides Stan models and R code to fit and check predictive -models for three situations: (a) complete pooling, which assumes each -item is the same, (b) no pooling, which assumes the items are -unrelated, and (c) partial pooling, where the similarity among the -items is estimated. We consider two hierarchical models to estimate -the partial pooling, one with a beta prior on chance of success and -another with a normal prior on the log odds of success. 
The note -explains with working examples how to (i) fit models in RStan and plot -the results in R using ggplot2, (ii) estimate event probabilities, -(iii) evaluate posterior predictive densities to evaluate model -predictions on held-out data, (iv) rank items by chance of success, -(v) perform multiple comparisons in several settings, (vi) replicate -new data for posterior p-values, and (vii) perform graphical posterior -predictive checks. +models for three situations: + +1. complete pooling, which assumes each item is the same +2. no pooling, which assumes the items are unrelated +3. partial pooling, where the similarity among the items is estimated. + + a hierarchical model with with a *beta prior* on the *chance of success* + + a hierarchical model with a *normal prior* on the *log odds of success* + +We explain and give working examples of how to: + +- fit models in [CmdStanR](https://mc-stan.org/cmdstanr) and plot the results in R using ggplot2, +- estimate event probabilities, +- evaluate posterior predictive densities to evaluate model predictions on held-out data +- rank items by chance of success, +- perform multiple comparisons in several settings, +- replicate new data for posterior p-values +- perform graphical posterior predictive checks. + + +### Packages used in this notebook + +In this notebook, we will use the [CmdStanR interface](https://mc-stan.org/cmdstanr/). + +```{r comment=NA} +# uncomment as needed to install CmdStanR, CmdStan +#library(devtools); +#if(!require(cmdstanr)){ +# devtools::install_github("stan-dev/cmdstanr", dependencies=c("Depends", "Imports")) +#} +options(warn=-1) +options(repr.plot.width=8, repr.plot.height=8) +options(output.width='90%') +library(tidyverse) +library(ggplot2) +library(posterior) +library(cmdstanr) +``` +```{r echo=FALSE} +theme_jupyter <- function() { + theme_grey() + + theme(text = element_text(size=12), + plot.title = element_text(size=rel(1.1)), + plot.subtitle = element_text(size=rel(1)), + axis.title.x = element_text(size=rel(1)), + axis.title.y = element_text(size=rel(1)), + strip.text.x = element_text(size=rel(.95)), + strip.text.y = element_text(size=rel(.95)), + axis.text.x = element_text(size=rel(0.9)), + axis.text.y = element_text(size=rel(0.9)) + ) +} +``` ## Repeated Binary Trials @@ -58,25 +110,16 @@ downloaded 24 Dec 2015 from It is drawn from the 1970 Major League Baseball season from both leagues. +We will only need a few columns of the data; we will be using +the remaining hits and at bats to evaluate the predictive inferences for the various models. + ```{r comment=NA} -df <- read.csv("efron-morris-75-data.tsv", sep="\t"); +df <- read.csv("efron-morris-75-data.tsv", sep="\t") df <- with(df, data.frame(FirstName, LastName, Hits, At.Bats, RemainingAt.Bats, - RemainingHits = SeasonHits - Hits)); -print(df); -``` - -We will only need a few columns of the data; we will be using the -remaining hits and at bats to evaluate the predictive inferences for -the various models. - -```{r comment=NA} -N <- dim(df)[1] -K <- df$At.Bats -y <- df$Hits -K_new <- df$RemainingAt.Bats; -y_new <- df$RemainingHits; + RemainingHits = SeasonHits - Hits)) +print(df) ``` The data separates the outcome from the initial 45 at-bats from the @@ -107,10 +150,23 @@ data { } ``` -As usual, we follow the convention of naming our program +We follow the convention of naming our program variables after the variables we use when we write the model out -mathematically in a paper. 
We also choose capital letters for integer -constants and y for the main observed variable(s). +mathematically in a paper. We choose capital letters for integer +constants, here N and K, +and we use a lowercase y for the main observed variable(s). +We try to use these abstract names for data and parameters +instead of dataset-specific names such as "hits" and "at-bats" +because these basic models are useful in many domains, +as mentioned above. + +Because we shall develop four models all of which use the same +data and transformed data, we create a Stan program which +contains just the data and transformed data blocks "data-blocks.stan" +and use Stan's [include directive](https://mc-stan.org/docs/reference-manual/includes.html) +to use this across all four models. +Because the models contain #include directives, we need to +specify the `include_paths` when instantiating or compiling the model. ## Pooling @@ -160,33 +216,27 @@ due to numerical arithmetic issues. Assuming each player's at-bats are independent Bernoulli trials, the sampling distribution for each player's number of hits $y_n$ is modeled as - -\[ +$$ p(y_n \, | \, \phi) \ = \ \mathsf{Binomial}(y_n \, | \, K_n, \phi). -\] - +$$ When viewed as a function of $\phi$ for fixed $y_n$, this is called the likelihood function. Assuming each player is independent leads to the complete data likelihood - -\[ +$$ p(y \, | \, \phi) = \prod_{n=1}^N \mathsf{Binomial}(y_n \, | \, K_n, \phi). -\] - +$$ We will assume a uniform prior on $\phi$, - -\[ +$$ p(\phi) \ = \ \mathsf{Uniform}(\phi \, | \, 0, 1) \ = \ 1. -\] - +$$ Whether a prior is uniform or not depends on the scale with which the parameter is expressed. Here, the variable $\phi$ is a chance of success in $[0, 1]$. If we were to consider the log-odds of success, @@ -207,8 +257,7 @@ as ``` model { - ... - y ~ binomial(K, phi); + y ~ binomial(K, phi); // likelihood } ``` @@ -238,62 +287,77 @@ vectorized form can be up to an order of magnitude or more faster in some cases, depending on how many repeated calculations can be avoided. -The actual Stan program in pool.stan has many more +The actual Stan program in pool.stan includes many derived quantities that will be used in the rest of this note; see the appendix for the full code of all of the models discussed. -We start by loading the RStan package. +We must first instantiate a CmdStanModel object then call its sample method. +To do this we first call the `cmdstan_model()` function +which instantiates a CmdStanModel object from a file containing a Stan program. +Because these models contain a number of [include directives](https://mc-stan.org/docs/reference-manual/includes-section.html), +we use the `include_paths` argument to pass in the path to the current directory. -```{r comment=NA} -library(rstan); +```{r} +pool_model <- cmdstan_model(stan_file='pool.stan', include_paths=c('include')) ``` -The model can be fit as follows, with `M` being the total number of -draws in the complete posterior sample (each chain is by default split -into half warmup and half sampling iterations and 4 chains are being -run). - +We create the R data structures corresponding to the model inputs. +In addition, we define the desired size of the MCMC sample, +here we set it to $10,000$ draws. 
```{r comment=NA} -M <- 10000; -fit_pool <- stan("pool.stan", data=c("N", "K", "y", "K_new", "y_new"), - iter=(M / 2), chains=4); -ss_pool <- extract(fit_pool); -``` +N <- dim(df)[1] +K <- df$At.Bats +y <- df$Hits +K_new <- df$RemainingAt.Bats +y_new <- df$RemainingHits -Here, we read the data out of the environment by name; normally we -would prefer to encapsulate the data in a list to avoid naming -conflicts in the top-level namespace. We showed the default output -for the stan() function call here, but will suppress it -in subsequent calls. +M <- 10000 # total draws in the sample +``` +We pass the data into the `sample` method as a list of name, value pairs. +By default, CmdStanR runs 4 chains with 1000 warmup and sampling iterations per chain. +To generate a sample with a total of $M = 10000$ draws, +we specify the per-chain sampling iterations to be $M / 4$. +```{r comment=NA} +baseball_data = list(N=N, K=K, y=y, K_new=K_new, y_new=y_new) +fit_pool <- pool_model$sample(data=baseball_data, iter_sampling=M/4, refresh=0) +``` The posterior sample for phi can be summarized as follows. ```{r comment=NA} -print(fit_pool, c("phi"), probs=c(0.1, 0.5, 0.9)); +fit_pool$print('phi') ``` The summary statistics begin with the posterior mean, the MCMC standard error on the posterior mean, and the posterior standard -deviation. Then there are 0.1, 0.5, and 0.9 quantiles, which provide -the posterior median and boundaries of the central 80% interval. The -last two columns are for the effective sample size (MCMC standard -error is the posterior standard deviation divided by the square root -of the effective sample size) and the $\hat{R}$ convergence diagnostic +deviation (sd) and mean average deviation (mad). +Then there are the 0.5, and 0.95 quantiles, which provide +the posterior median and boundaries of the central 90% interval. +The next two columns are for the effective sample size for the +central 90% interval and the tail. +The final column is the $\hat{R}$ convergence diagnostic (its value will be 1 if the chains have all converged to the same -posterior mean and variance; see the Stan Manual (Stan Development -Team 2015) or (Gelman et al. 2013). The $\hat{R}$ value here is -consistent with convergence (i.e., near 1) and the effective sample -size is good (roughly half the number of posterior draws; by default -Stan uses as many iterations to warmup as it does for drawing the -sample). - -The result is a posterior mean for $\theta$ of $0.27$ with an 80% -central posterior interval of $(0.25, 0.29)$. With more data, such as +posterior mean and variance; see the +[Stan Reference Manual](https://mc-stan.org/docs/reference-manual/analysis.html), +[Vehtari et al, 2021](https://projecteuclid.org/journals/bayesian-analysis/volume-16/issue-2/Rank-Normalization-Folding-and-Localization--An-Improved-Rˆ-for/10.1214/20-BA1221.full) +and [its appendix](https://avehtari.github.io/rhat_ess/ess_comparison.html) +for further discussion of these diagnostics. +The $\hat{R}$ value here is +consistent with convergence (i.e., near 1) and the effective sample size is good (well above 10%). + +The result is a posterior mean for $\theta$ of $0.27$ with an 90% +central posterior interval of $(0.24, 0.29)$. 
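The same numbers can be pulled straight from the draws; this is a sketch of that check, not part of the original analysis, and it assumes `fit_pool` and the posterior package from above.

```{r}
# central 90% interval and median for the pooled chance of success,
# computed directly from the posterior draws of phi
phi_draws <- as_draws_df(fit_pool$draws("phi"))$phi
quantile(phi_draws, probs = c(0.05, 0.5, 0.95))
```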
+ +```{r} +fit_pool$print('theta') +``` + +With more data, such as from more players or from the rest of the season, the posterior approaches a delta function around the maximum likelihood estimate and -the posterior interval around the centeral posterior intervals will +the posterior interval around the central posterior intervals will shrink. Nevertheless, even if we know a player's chance of success exactly, there is a large amount of uncertainty in running $K$ binary trials with that chance of success; using a binomial model @@ -312,29 +376,29 @@ parameter $\theta_n \in [0,1]$ for each item $n$. The prior on each $\theta_n$ is uniform, -\[ +$$ p(\theta_n) = \mathsf{Uniform}(\theta_n \, | \, 0,1), -\] +$$ and the $\theta_n$ are assumed to be independent, -\[ +$$ p(\theta) = \prod_{n=1}^N \mathsf{Uniform}(\theta_n \, | \, 0,1). -\] +$$ The likelihood then uses the chance of success $\theta_n$ for item $n$ in modeling the number of successes $y_n$ as -\[ +$$ p(y_n \, | \, \theta_n) = \mathsf{Binomial}(y_n \, | \, K_n, \theta_n). -\] +$$ Assuming the $y_n$ are independent (conditional on $\theta$), this leads to the total data likelihood -\[ +$$ p(y \, | \, \theta) = \prod_{n=1}^N \mathsf{Binomial}(y_n \, | \, K_n, \theta_n). -\] +$$ The Stan program for no pooling only differs in declaring the ability parameters as an $N$-vector rather than a scalar. @@ -369,16 +433,15 @@ in no-pool.stan, which is shown in the appendix. This model can be fit the same way as the last model. -```{r comment=NA, results='hide'} -fit_no_pool <- stan("no-pool.stan", data=c("N", "K", "y", "K_new", "y_new"), - iter=(M / 2), chains=4); -ss_no_pool <- extract(fit_no_pool); +```{r results='hide', comment=NA} +no_pool_model <- cmdstan_model(stan_file='no-pool.stan', include_paths=c('include')) +fit_no_pool <- no_pool_model$sample(data=baseball_data, iter_sampling=M/4, refresh=0) ``` -Results are displayed the same way. +Now we examine the per-player chance of success `theta` ```{r comment=NA} -print(fit_no_pool, c("theta"), probs=c(0.1, 0.5, 0.9)); +fit_no_pool$print('theta', max_rows=18) ``` Now there is a separate line for each item's estimated $\theta_n$. @@ -388,14 +451,14 @@ median will be reasonably close to the posterior mode despite the skewness (the posterior can be shown analytically to be a Beta distribution). -Each 80% interval is much wider than the estimated interval for the +Each 90% interval is much wider than the estimated interval for the population in the complete pooling model; this is to be expected---there are only 45 data items for each parameter here as opposed to 810 in the complete pooling case. If the items each had different numbers of trials, the intervals would also vary based on size. -As the estimated chance of success goes up toward 0.5, the 80% +As the estimated chance of success goes up toward 0.5, the 90% intervals gets wider. This is to be expected for chance of success parameters, because the standard deviation of a random variable distributed as $\mathsf{Binomial}(K, \theta)$ is $\sqrt{\theta (1 - \theta) K}$. @@ -436,10 +499,10 @@ assume a beta distribution as the prior as it is scaled to values in $[0, 1]$, -\[ +$$ p(\theta_n \, | \, \alpha, \beta) \ = \ \mathsf{Beta}(\theta_n \, | \, \alpha, \beta), -\] +$$ where $\alpha, \beta > 0$ are the parameters of the prior. The beta distribution is the conjugate prior for the binomial, meaning that the @@ -451,25 +514,25 @@ observations and thus a uniform distribution. 
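To make the conjugacy concrete, here is a small standalone sketch (an illustration added here, using Roberto Clemente's 18 hits in 45 initial at-bats from the table above): a $\mathsf{Beta}(\alpha, \beta)$ prior updated by $y$ successes in $K$ trials yields a $\mathsf{Beta}(\alpha + y, \beta + K - y)$ posterior, so its quantiles are available in closed form.

```{r}
# conjugate beta-binomial update: Beta(a, b) prior, y successes in K trials
a <- 1; b <- 1          # uniform prior, Beta(1, 1)
y1 <- 18; K1 <- 45      # player 1's initial trials
qbeta(c(0.05, 0.5, 0.95), a + y1, b + K1 - y1)  # analytic median and 90% interval
```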
Each $\theta_n$ will be modeled as conditionally independent given the prior parameters, so that the complete prior is -\[ +$$ p(\theta \, | \, \alpha, \beta) = \prod_{n=1}^N \mathsf{Beta}(\theta_n \, | \, \alpha, \beta). -\] +$$ The parameters $\alpha$ and $\beta$ are themselves given priors (sometimes called hyperpriors). Rather than parameterize $\alpha$ and $\beta$ directly, we will instead put priors on $\phi \in [0, 1]$ and $\kappa > 0$, and then define -\[ +$$ \alpha = \kappa \, \phi -\] +$$ and -\[ +$$ \beta = \kappa \, (1 - \phi). -\] +$$ This reparameterization is convenient, because @@ -482,15 +545,15 @@ inversely related to the variance). We will follow Gelman et al. (2013, Chapter 5) in providing a prior that factors into a uniform prior on $\phi$, -\[ +$$ p(\phi) = \mathsf{Uniform}(\phi \, | \, 0, 1), -\] +$$ and a Pareto prior on $\kappa$, -\[ +$$ p(\kappa) = \mathsf{Pareto}(\kappa \, | \, 1, 1.5) \propto \kappa^{-2.5}. -\] +$$ with the restriction $\kappa > 1$. In general, for functions $f$ and $g$, we write $f(x) \propto g(x)$ if there is some constant $c$ such @@ -534,19 +597,24 @@ explicitly constrained to lie in $[0, 1]$. The prior on $\kappa$ is coded following the model definition. The full model with all generated quantities can be coded in Stan as in -the file hier.stan; it is displayed in the appendix. It is run as usual. +the file hier.stan; it is displayed in the appendix. It is run as usual, +except that this time we set the pseudorandom number generator seed here to 1234 +so that we can consistently generate the exact set of draws. -```{r comment=NA, results='hide'} -fit_hier <- stan("hier.stan", data=c("N", "K", "y", "K_new", "y_new"), - iter=(M / 2), chains=4, - seed=1234); -ss_hier <- extract(fit_hier); +```{r results='hide', comment=NA} +hier_model <- cmdstan_model(stan_file='hier.stan', include_paths=c('include')) +fit_hier <- hier_model$sample(data=baseball_data, + iter_sampling=M/4, + seed=1234, + refresh=0, + show_messages=FALSE) ``` +The sampler prints a warning about divergent transitions. +CmdStan's [diagnose](https://mc-stan.org/docs/cmdstan-guide/diagnose.html) utility provides more checks on the MCMC sample. -Even though we set results='hide' in the knitr R call -here, the error messages are still printed. We set the pseudorandom -number generator seed here to 1234 so that we could discuss what the -output looks like. +```{r} +fit_hier$cmdstan_diagnose() +``` #### Divergent Transitions and Mitigation Strategies @@ -573,34 +641,49 @@ in the gradient times the step size provides a poor approximation to the posterior curvature. The problem with a hierarchical model is that the curvature in the posterior varies based on position; when the hierarchical variance is low, there is high curvature in the -lower-level parameters around the mean. During warmup, Stan globally +lower-level parameters around the mean. +During warmup, Stan globally adapts its step size to a target acceptance rate, which can lead to too large step sizes for highly curved regions of the posterior. To -mitigate this problem, we need to either reduce the step size or -reparameterize. We firt consider reducing step size, and then in the -next secton consider the superior alternative of reparameterization. - +mitigate this problem, we need to either: + - increase the number of warmup iterations + - reduce the step size and increase the target acceptance rate + - reparameterize. 
We first consider the combination of the first two options
and then in the next section consider the superior alternative of reparameterization.

The default number of warmup iterations is 1000, but for models with
highly varying posterior geometries, running more warmup iterations
is often sufficient to find the correct step size.
To reduce the step size of the algorithm, we want to lower the
initial step size and increase the target acceptance rate.  The
former keeps the step size low to start; the latter makes sure the
adapted step size is lower.  So we'll run this again with the same seed, this time
running 2500 warmup iterations (iter_warmup, set to M/4),
lowering the step size (step_size), and increasing the
target acceptance rate (adapt_delta).

```{r results='hide', comment=NA}
fit_hier <- hier_model$sample(data=baseball_data,
                              iter_sampling=M/4,
                              iter_warmup=M/4,
                              seed=1234,
                              step_size=0.01,
                              adapt_delta=0.95,
                              refresh=0)
```

Now it runs without divergent transitions.

```{r}
fit_hier$cmdstan_diagnose()
```

Summary statistics for the posterior are printed as before.

```{r comment=NA}
fit_hier$print(c("theta", "kappa", "phi"), max_rows=25)
```

Because the Beta prior is conjugate to the binomial likelihood, the
observations (specifically $\phi \, \kappa - 1$ prior successes and
$(1 - \phi) \, \kappa - 1$ prior failures).

The parameter $\kappa$ is not well determined by the combination of
data and Pareto prior, with a posterior 90% interval of roughly $(20,
364)$.  By the informal discussion above, $\kappa \in (20, 364)$
ranges from weighting the data 2:1 relative to the prior to weighting
it 1:8.  The wide posterior interval for $\kappa$ arises because the
exact variance in the population is not well constrained by only 18
trials of size 45.  If there were more items (higher $N$) or even more
trials per item (higher $K$), the posterior for $\kappa$ would be more
$\kappa \in (0, \infty)$ is transformed to $\log \kappa$.  We
reproduce that figure here for our running example.

```{r}
# extract the model fit as an R data frame / tibble for ggplot2
hier_df = fit_hier$draws(format="df");

bda_plot <- function(df, x_lab, y_lab) {
  ggplot(df, aes(x=x, y=y)) +
    geom_point(shape=19, alpha=0.15) +
    xlab(x_lab) +
    ylab(y_lab) +
    theme_jupyter();
}

df_bda3_fig_5_3 <- with(hier_df,
                        data.frame(x = log(phi / (1 - phi)),
                                   y = log(kappa)));
plot_bda3_fig_5_3 <-
the relation between $\kappa$ and the $\theta$.  For example, consider
the following plot of $\theta_1$ vs. $\kappa$. 
```{r} +# note: must quote column names that look like indexed expr inv_logit <- function(u) { 1 / (1 + exp(-u)); } -df_funnel <- with(ss_hier, - data.frame(x = inv_logit(theta[,1]), +df_funnel <- with(hier_df, + data.frame(x = inv_logit(hier_df$'theta[1]'), y = log(kappa))) plot_funnel <- bda_plot(df_funnel, - x_lab = expression(paste(logit(theta[1]))), + x_lab = expression(paste(logit('theta[1]'))), y_lab = expression(paste(log(kappa)))); plot_funnel; ``` @@ -704,11 +791,11 @@ chance-of-success $\theta_n \in [0,1]$. In this section, we consider an alternative parameterization in terms of the log-odds $\alpha_n$, which are defined by the logit transform as -\[ +$$ \alpha_n = \mathrm{logit}(\theta_n) = \log \, \frac{\theta_n}{1 - \theta_n}. -\] +$$ For example, $\theta_n = 0.25$ corresponds to odds of $.25$ to $.75$ (equivalently, $1$ to $3$), or log-odds of $\log .25 / .75 = -1.1$. @@ -716,17 +803,17 @@ For example, $\theta_n = 0.25$ corresponds to odds of $.25$ to $.75$ We will still use a binomial likelihood, only now we have logit as a so-called "link" function. -\[ +$$ p(y_n \, | \, K_n, \alpha_n) \ = \ \mathsf{Binomial}(y_n \, | \, K_n, \ \mathrm{logit}^{-1}(\alpha_n)) -\] +$$ The inverse logit function is the logistic sigmoid from which logistic regression gets its name, -\[ +$$ \mathrm{logit}^{-1}(\alpha_n) = \frac{1}{1 + \exp(-\alpha_n)} = \theta_n. -\] +$$ By construction, for any $\alpha_n \in (-\infty, \infty)$, $\mathrm{logit}^{-1}(\alpha_n) \in (0, 1)$; the sigmoid converts @@ -737,24 +824,24 @@ practice, we only know the result will be in $[0, 1]$. Stan has a binomial probability function with a built-in logit link function, with which we can define the likelihood directly as -\[ +$$ p(y_n \, | \, K_n, \alpha_n) \ = \ \mathsf{BinomialLogit}(y_n \, | \, K_n, \alpha_n) \ = \ \mathsf{Binomial}(y_n \, | \, K_n, \ \mathrm{logit}^{-1}(\alpha_n)). -\] +$$ We use a simple normal hierarchical prior, -\[ +$$ p(\alpha_n \, | \, \mu, \sigma) = \mathsf{Normal}(\alpha_n \, | \, \mu, \sigma). -\] +$$ Then one level up, we use a weakly informative hyperprior for $\mu$, -\[ +$$ p(\mu) = \mathsf{Normal}(\mu \, | \, -1, 1), -\] +$$ which places 95% of the prior probability for $\mu$ in the interval $(-3, 1)$, which inverse-logit transforms to the interval $(0.05, @@ -765,11 +852,11 @@ value should obviously be changed for other applications. The prior scale $\sigma > 0$ can be taken to be a truncated normal (half normal, here): -\[ +$$ p(\sigma) \ = \ 2 \, \mathsf{Normal}(\sigma \, | \, 0, 1) \ \propto \ \mathsf{Normal}(\sigma \, | \, 0, 1). -\] +$$ This is a fairly broad prior in this case, being on the log-odds scale. @@ -782,7 +869,7 @@ examples, all of which have been translated to Stan). #### Centered Parameterization -We will begin with the natural centered parameterization, but we must point out ahead of time that this is not the optimal way to code this model in Stan. The centered parameterization directly follows the mathematical presentation. +We will begin with the natural centered parameterization, but we must point out ahead of time that this is not the optimal way to code this model in Stan. The centered parameterization directly follows the above ``` parameters { @@ -803,27 +890,32 @@ included in this document directly (code for the other models is in an appendix at the end of this document). 
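Before fitting, the geometry the sampler will face can be previewed with a standalone prior simulation (an illustration added here, not part of the original analysis): draw $\mu$, $\sigma$, and $\alpha_1$ from the priors above and plot the centered deviation against $\log \sigma$.

```{r}
# simulate from the hierarchical prior; the funnel appears with no data at all
set.seed(1234)
mu_sim <- rnorm(5000, -1, 1)          # hyperprior on mu
sigma_sim <- abs(rnorm(5000, 0, 1))   # half-normal hyperprior on sigma
alpha_sim <- rnorm(5000, mu_sim, sigma_sim)
ggplot(data.frame(x = alpha_sim - mu_sim, y = log(sigma_sim)),
       aes(x = x, y = y)) +
  geom_point(shape = 19, alpha = 0.15) +
  xlab("centered deviation alpha - mu") +
  ylab("log sigma")
```

When $\log \sigma$ is very negative, the deviations collapse toward zero; that narrow neck is exactly what the globally adapted step size struggles with.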
-```{r comment=NA, results='hide'} -fit_hier_logit_centered <- - stan("hier-logit-centered.stan", data=c("N", "K", "y"), - iter=(M / 2), chains=4, - seed=1234); -ss_hier_logit_centered <- extract(fit_hier_logit_centered); +```{r results='hide', comment=NA} +hier_logit_centered_model <- cmdstan_model(stan_file='hier-logit-centered.stan') +fit_hier_logit_centered <- hier_logit_centered_model$sample(data=baseball_data, + iter_warmup=M/4, + iter_sampling=M/4, + seed=1234, + refresh=0) ``` ```{r comment=NA} -print(fit_hier_logit_centered, - c("sigma", "theta[1]", "theta[10]", "alpha[1]", "alpha[10]", - "mu", "sigma", "lp__"), probs=c(0.1, 0.5, 0.9)); +fit_hier_logit_centered$print(c("theta[1]", "theta[10]","alpha[1]", "alpha[10]", "mu", "sigma", "lp__")) ``` -With the centered parameterization and Stan's defalt stepsize and +With the centered parameterization and Stan's default stepsize and target acceptance rate, there are many divergent transitions. Even with 10,000 posterior draws, the effective sample size for the population scale $\sigma$ is in the low hundreds, with $\hat{R} > 1.01$. These results might be usable, but the high $\hat{R}$ and low effective sample size are sure signs the chains aren't mixing well. +A call to `cmdstan_diagnose` confirms this. + +```{r} +fit_hier_logit_centered$cmdstan_diagnose() +``` + *Warning:* In cases where the chains are not mixing well, Stan's effective sample size estimate will be much lower than that from packages such as Coda (Plummer 2006), which analyze each chain's @@ -834,15 +926,18 @@ assume complete mixing in their effective sample size calculations and thus overestimate their performance when there is not good mixing. ```{r} +hier_logit_centered_df = as_draws_df(fit_hier_logit_centered$draws()); + funnel_centered_df <- - with(ss_hier_logit_centered, - data.frame(x = alpha[, 1], + with(hier_logit_centered_df, + data.frame(x = hier_logit_centered_df$'alpha[1]', y = log(sigma))); funnel_centered <- bda_plot(funnel_centered_df, - x_lab = expression(paste("player 1 log odds of success ", alpha[1])), + x_lab = expression(paste("player 1 log odds of success ", 'alpha[1]')), y_lab = expression(paste("log population scale ", log(sigma)))) + - ggtitle("Centered Hierarchical Funnel Plot"); + ggtitle("Centered Hierarchical Funnel Plot") + + theme_jupyter(); funnel_centered; ``` @@ -869,53 +964,70 @@ random-walk Metropolis samplers (Papaspiliopoulos et al. 2003). This basically amounts to changing the parameterization over which sampling is done, taking now a standard unit normal prior for a new variable, -\[ +$$ \alpha^{\mathrm{std}}_n = \frac{\alpha_n - \mu}{\sigma}. -\] +$$ Then we can parameterize in terms of $\alpha^{\mathrm{std}}$, which has a standard-normal distribution -\[ +$$ p(\alpha^{\mathrm{std}}_n) = \mathsf{Normal}(\alpha^{\mathrm{std}}_n \, | \, 0, 1). -\] +$$ We can then define our original $\alpha$ as a derived quantity -\[ +$$ \alpha_n = \mu + \sigma \, \alpha^{\mathrm{std}}_n. -\] +$$ -We code this implicitly in the Stan model by defining the -likelihood as +This decouples the sampling distribution +for $\alpha^{\mathrm{std}}$ from $\mu$ and $\sigma$, greatly reducing +their correlation in the posterior. -\[ -p(y_n \, | \, \alpha^{\mathrm{std}}_n, \mu, \sigma, K) -\ = \ -\mathsf{BinomialLogit}(K_n, \ \mu + \sigma \, \alpha_n). 
-\] +Prior to Stan 2.19, a Stan implementation directly encoded the above reparameterization, +introducing a new variable `alpha_std` which has a standard normal distribution, +thus decoupling the sampling distribution of `alpha_std` from `mu` and `sigma`. +``` +parameters { + real mu; // population mean of success log-odds + real sigma; // population sd of success log-odds + vector[N] alpha_std; // success log-odds (standardized) +} +model { + vector[N] alpha = mu + sigma * alpha_std; + ... + alpha_std ~ normal(0, 1); // prior (hierarchical) + y ~ binomial_logit(K, alpha); // likelihood +} +``` + +Since Stan version 2.19, the Stan language's +[affine transform](https://mc-stan.org/docs/reference-manual/univariate-data-types-and-variable-declarations.html) construct provides a more efficient way to do this. -This decouples the sampling distribution for $\alpha^{\mathrm{std}}$ -from $\mu$ and $\sigma$, greatly reducing their correlation in the -posterior. +> Real variables may be declared on a space that has been transformed using an affine transformation $x\mapsto \mu + \sigma * x$ with offset $\mu$ and (positive) multiplier $\sigma$, using a syntax similar to that for bounds. While these transforms do not change the asymptotic sampling behaviour of the resulting Stan program (in a sense, the model the program implements), they can be useful for making the sampling process more efficient by transforming the geometry of the problem to a more natural multiplier and to a more natural offset for the sampling process, for instance by facilitating a non-centered parameterisation. + +Specifying the affine transform in the parameter declaration for +$\alpha^{\mathrm{std}}$ eliminates the need for intermediate variables +and makes it easier to see the hierarchical structure of the model. -The Stan program's parameter declaration and model directly follow the -definition. ``` parameters { - real mu; // population mean of success log-odds - real sigma; // population sd of success log-odds - vector[N] alpha_std; // success log-odds + real mu; // population mean of success log-odds + real sigma; // population sd of success log-odds + vector[N] alpha_std; // success log-odds (standardized) } model { - mu ~ normal(-1, 1); // hyperprior - sigma ~ normal(0, 1); // hyperprior - alpha_std ~ normal(0, 1); // prior - y ~ binomial_logit(K, mu + sigma * alpha_std); // likelihood + mu ~ normal(-1, 1); // hyperprior + sigma ~ normal(0, 1); // hyperprior + alpha_std ~ normal(mu, sigma); // prior (hierarchical) + y ~ binomial_logit(K, alpha_std); // likelihood } ``` +This reformulation is functionally equivalent to the previous version, +but now the tranformation is handled directly by Stan. Because the parameters to the prior for $\sigma$ are constants, the normalization for the half-prior (compared to the full prior) is constant and does not need to be included in the notation. This only @@ -928,10 +1040,7 @@ computed as a generated quantity. ``` generated quantities { - vector[N] theta; // chance of success - ... - for (n in 1:N) - theta[n] <- inv_logit(mu + sigma * alpha_std[n]); + vector[N] theta = inv_logit(alpha_std); ... } ``` @@ -941,24 +1050,28 @@ The full Stan program for the hierarchical logistic model is in It is fit and the values are extracted as follows. 
-```{r comment=NA, results='hide'} -fit_hier_logit <- stan("hier-logit.stan", data=c("N", "K", "y", "K_new", "y_new"), - iter=(M / 2), chains=4, - control=list(stepsize=0.01, adapt_delta=0.99)); -ss_hier_logit <- extract(fit_hier_logit); +```{r results='hide', comment=NA} +hier_logit_model <- cmdstan_model(stan_file='hier-logit.stan', include_paths=c('include')) +fit_hier_logit <- hier_logit_model$sample(data=baseball_data, + iter_sampling=M/4, + iter_warmup=M/4, + seed=1234, + step_size=0.01, + adapt_delta=0.99, + refresh=0) ``` We can print as before. ```{r comment=NA} -print(fit_hier_logit, c("alpha_std", "theta", "mu", "sigma"), probs=c(0.1, 0.5, 0.9)); +fit_hier_logit$print(c("alpha_std", "theta", "mu", "sigma"), max_rows=100) ``` It is clear from the wide posteriors for the $\theta_n$ that there is considerable uncertainty in the estimates of chance-of-success on an -item-by-item basis. With an 80% interval of $(0.03, 0.32)$, it is -clear that the data is consistent with complete pooling (i.e., $\sigma -= 0$). +item-by-item basis. With an 90% interval of $(0.02, 0.36)$ for `sigma`, +it is clear that the data is consistent with complete pooling +(i.e., $\sigma = 0$). Compared to the direct beta priors with uniform and Pareto hyperpriors shown in the first example, the normal prior on log odds exerts more @@ -966,15 +1079,16 @@ pull toward the population mean. The posterior means for $\theta$ ranged from 0.22 to 0.32 with the beta prior, but only range from 0.24 to 0.29 for the normal prior. Furthermore, the posterior intervals for each values are shrunk compared to the beta prior. For example, Roberto -Clemente ($n = 1$), has an 80% central posterior interval of $(0.25, -0.35)$ in the logistic model, whereas he had an 80% posterior interval -of $(.26, .39)$ with a hierarchical beta prior. +Clemente ($n = 1$), has an 90% central posterior interval of $(0.24, +0.35)$ in the logistic model, whereas he had an 90% posterior interval +of $(.25, .42)$ with a hierarchical beta prior. To consider how the reparameterization is working, we plot the posterior for the mean and log scale of the hyperprior. ```{r} -df_bda3_fig_5_3_logit <- with(ss_hier_logit, +hier_logit_df = as_draws_df(fit_hier_logit$draws()); +df_bda3_fig_5_3_logit <- with(hier_logit_df, data.frame(x = mu, y = log(sigma))); plot_bda3_fig_5_3_logit <- @@ -988,18 +1102,18 @@ As before, we see that the prior location ($\mu$) and scale ($\sigma$) are coupl ```{r} df_bda3_fig_5_3_funnel <- - with(ss_hier_logit, - data.frame(x = alpha_std[, 1], + with(hier_logit_df, + data.frame(x = hier_logit_df$'alpha_std[1]', y = log(sigma))); plot_bda3_fig_5_3_funnel <- bda_plot(df_bda3_fig_5_3_funnel, - x_lab = expression(paste("player 1 log odds (standardized), ", alpha[1])), + x_lab = expression(paste("player 1 log odds (standardized), ", 'alpha_std[1]')), y_lab = expression(paste("log population scale, ", log(sigma)))) + ggtitle("Non-Centered Hierarchical Funnel Plot"); plot_bda3_fig_5_3_funnel; ``` -Compared to the earlier plot of $\log \kappa$ versus $\mathrm{logit}(\theta_1)$ in the direct hierarcical model on chance of success, there is much less of a dependency between the variables; this can be seen because the variation is along the axes, not in a weak banana shape as in the first example. But it still has the long-tail problem as $\sigma$ approaches zero (and $\log \sigma$ becomes more negative). 
+Compared to the earlier plot of $\log \kappa$ versus $\mathrm{logit}(\theta_1)$ in the direct hierarchical model on chance of success, there is much less of a dependency between the variables; this can be seen because the variation is along the axes, not in a weak banana shape as in the first example. But it still has the long-tail problem as $\sigma$ approaches zero (and $\log \sigma$ becomes more negative). @@ -1012,17 +1126,23 @@ successes $y_n$ for the first $K_n$ trials versus the median and 80\% intervals for the estimated chance-of-success parameters $\theta_n$ in the posterior. The following R code reproduces a similar plot for our data. -```{r} -ss_quantiles <- function(ss) { - apply(ss$theta, 2, quantile, probs = c(0.1, 0.5, 0.9)); +```{r, out.width="110%"} +# summary statistics - quantiles 10%, 50%, 90% +ss_quantiles <- function(draws_df) { + apply(select(draws_df, starts_with('theta')), 2, quantile, probs = c(0.1, 0.5, 0.9)); } -theta_pool <- ss_quantiles(ss_pool); -theta_no_pool <- ss_quantiles(ss_no_pool); -theta_hier <- ss_quantiles(ss_hier); -theta_hier_logit <- ss_quantiles(ss_hier_logit); + +pool_df <- as_draws_df(fit_pool$draws()); +theta_pool <- ss_quantiles(pool_df); + +no_pool_df <- as_draws_df(fit_no_pool$draws()); +theta_no_pool <- ss_quantiles(no_pool_df); + +theta_hier <- ss_quantiles(hier_df); +theta_hier_logit <- ss_quantiles(hier_logit_df); models <- c("complete pooling", "no pooling", "partial pooling", - "partial pooling \n(log odds)") + "partial pooling, log odds") df_plot2 <- data.frame(x = rep(y / K, 4), y = c(theta_pool["50%",], theta_no_pool["50%",], theta_hier["50%",], theta_hier_logit["50%",]), @@ -1035,7 +1155,7 @@ df_plot2 <- data.frame(x = rep(y / K, 4), pop_mean <- sum(y) / sum(K); plot_bda3_fig_5_4 <- ggplot(df_plot2, aes(x=x, y=y, ymin=ymin, ymax=ymax)) + - geom_hline(yintercept=pop_mean, colour="lightpink") + + geom_hline(yintercept=pop_mean, colour="orange") + geom_abline(intercept=0, slope=1, colour="skyblue") + facet_grid(. ~ model) + geom_errorbar(width=0.005, colour="gray60") + @@ -1044,7 +1164,11 @@ plot_bda3_fig_5_4 <- scale_x_continuous(breaks = c(0.2, 0.3, 0.4)) + xlab(expression(paste("observed rate ", y[n] / K[n]))) + ylab(expression(paste("chance of success ", theta[n]))) + - ggtitle("Posterior Medians and 80% intervals\n(red line: population mean; blue line: MLE)") + ggtitle("Posterior Medians and 80% intervals", + subtitle="orange line: population mean; blue line: MLE") + + theme_jupyter(); + +options(repr.plot.width=20, repr.plot.height=10) plot_bda3_fig_5_4; ``` @@ -1057,14 +1181,13 @@ are indistinguishable, any differences in estimates are due to MCMC error. The horizontal red line has an intercept equal to the overall success rate, - -\[ +$$ \frac{\sum_{n=1}^N y_n}{\sum_{n=1}^N K_n} \ = \ \frac{215}{810} \ = \ 0.266. -\] +$$ The overall success rate is also the posterior mode (i.e., maximum likelihood estimate) for the complete pooling model. @@ -1125,11 +1248,11 @@ of predictions weighted by the posterior. Given data $y$ and a model with parameters $\theta$, the posterior predictive distribution for new data $\tilde{y}$ is defined by -\[ +$$ p(\tilde{y} \, | \, y) \ = \ \int_{\Theta} p(\tilde{y} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}\theta, -\] +$$ where $\Theta$ is the support of the parameters $\theta$. What an integral of this form says is that $p(\tilde{y} \, | \, y)$ is defined @@ -1146,14 +1269,14 @@ conditioned on $y$. 
Because the posterior predictive density is formulated as an expectation over the posterior, it is possible to compute via MCMC. With $M$ draws $\theta^{(m)}$ from the posterior $p(\theta \, | \, -y)$, the posterior predicitve log density for new data +y)$, the posterior predictive log density for new data $y^{\mathrm{new}}$ is given by -\[ +$$ p(y^{\mathrm{new}} \, | \, y) \ \approx \ \log \frac{1}{M} \, \sum_{m=1}^M \ p\left( y^{\mathrm{new}} \, | \, \theta^{(m)} \right). -\] +$$ In practice, this requires care to prevent underflow in floating point calculations; a robust calculation on the log scale is provided below. @@ -1190,7 +1313,7 @@ estimated. #### Calibration -A well calibrated statistical model is one in which the uncertainy in +A well calibrated statistical model is one in which the uncertainty in the predictions matches the uncertainty in further data. That is, if we estimate posterior 50% intervals for predictions on new data (here, number of hits in the rest of the season for each player), roughly 50% @@ -1219,10 +1342,10 @@ variable $Y^{\mathrm{new}}_n$ denoting the number of further successes for item $n$ has a standard deviation from the repeated binary trials of -\[ +$$ \mathrm{sd}[Y^{\mathrm{new}}_n] \ = \ \sqrt{K \ \theta \, (1 - \theta)}. -\] +$$ #### Why Evaluate with the Predictive Posterior? @@ -1241,24 +1364,24 @@ intervals (Gneiting et al. 2007). #### Computing the Log Predictive Posterior Density -The log of posterior predicitve density is defined in the obvious way as +The log of posterior predictive density is defined in the obvious way as -\[ +$$ \log p(\tilde{y} \, | \, y) = \log \int_{\Theta} p(\tilde{y} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}\theta. -\] +$$ This is not a posterior expectation, but rather the log of a posterior expectation. In particular, it should not be confused with the posterior expectation of the log predictive density, which is given by -\[ +$$ \int_{\Theta} \left( \log p(\tilde{y} \, | \, \theta) \right) \ p(\theta \, | \, y) \ \mathrm{d}\theta. -\] +$$ Although this is easy to compute in Stan in a stable fashion, it does not produce the same answer (as we show below). @@ -1268,11 +1391,11 @@ inequality](https://en.wikipedia.org/wiki/Jensen%27s_inequality) shows that the expectation of the log is less than or equal to the log of the expectation, -\[ +$$ \log \int_{\Theta} p(\tilde{y} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}\theta \ \leq \ \int_{\Theta} \left( \, \log p(\tilde{y} \, | \, \theta) \, \right) \ p(\theta \, | \, y) \ \mathrm{d}\theta. -\] +$$ We'll compute both expectations and demonstrate Jensen's inequality in our running example. @@ -1281,42 +1404,49 @@ The integer variables K_new[n] and y_new[n] declared in the data block hold the number of at bats (trials) and the number of hits (successes) for player (item) n. To code the evaluation of the held out data in Stan, we declare a -generated quantities variable (log_p_news) to hold the -log density of each data point and define it in the obvious way. +generated quantities variable (log_p_new) +which is the sum of the per-item posterior predictive log densities. +Because we are computing this quantity across all models, +we put the code into its own file named "gq-postpred.stan" +and Stan's include directive in all models. +Include directive in the model's generated quantities block. ``` generated quantities { ... 
- real log_p_new; // posterior predictive log density remaining trials - vector[N] log_p_news; // posterior predictive log density for item - ... - for (n in 1:N) - log_p_news[n] <- binomial_log(y_new[n], K_new[n], theta[n]); - log_p_new <- sum(log_p_news); + #include gq-postpred.stan ... } ``` +First we compute the posterior predictive density for the remaining trials. +``` + // posterior predictive log density remaining trials + real log_p_new = 0; + for (n in 1 : N) { + log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]); + } +``` + The posterior mean for log_p_new will give us -\[ +$$ \int_{\Theta} \left( \log p(\tilde{y} \, | \, \theta) \right) \ p(\theta \, | \, y) \ \mathrm{d}\theta \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \log p(y^{\mathrm{new}} \, | \, \theta^{(m)}). -\] +$$ This calculation is included in all four of the models we have previously fit and can be displayed directly as follows as the posterior mean of `log_p_new`. ```{r comment=NA} -print(sprintf("%10s %16s", "MODEL", "VALUE"), quote = FALSE); -print(sprintf("%10s %16.0f", "pool", mean(ss_pool$log_p_new)), quote=FALSE); -print(sprintf("%10s %16.0f", "no pool", mean(ss_no_pool$log_p_new)), quote=FALSE); -print(sprintf("%10s %16.0f", "hier", mean(ss_hier$log_p_new)), quote=FALSE); -print(sprintf("%10s %16.0f", "hier logit", mean(ss_hier_logit$log_p_new)), quote=FALSE); +message(sprintf("%25s %5.1f", "complete pooling", mean(pool_df$log_p_new))); +message(sprintf("%25s %5.1f", "no pooling", mean(no_pool_df$log_p_new))); +message(sprintf("%25s %5.1f", "partial pooling", mean(hier_df$log_p_new))); +message(sprintf("%25s %5.1f", "partial pooling, log odds", mean(hier_logit_df$log_p_new))); ``` From a predictive standpoint, the models are ranked by the amount of @@ -1332,11 +1462,11 @@ The straight path to calculate this would be to define a generated quantity $p(y^{\mathrm{new}} \, | y)$, look at the posterior mean computed by Stan, and takes its log. That is, -\[ +$$ \log p(y^{\mathrm{new}} \, | \, y) \ \approx \ \log \frac{1}{M} \, \sum_{m=1}^M p(y^{\mathrm{new}} \, | \, \theta^{(m)}). -\] +$$ Unfortunately, this won't work in most cases because when we try to compute $p(y^{\mathrm{new}} \, | \, \theta^{(m)})$ directly, it is prone @@ -1356,32 +1486,32 @@ To avoid underflow, we're going to use the [log-sum-of-exponentials](https://en.wikipedia.org/wiki/LogSumExp) trick, which begins by noting the obvious, -\[ +$$ \log \frac{1}{M} \, \sum_{m=1}^M \ p(y^{\mathrm{new}} \, | \, \theta^{(m)}). \ = \ \log \frac{1}{M} \, \sum_{m=1}^M \ \exp \left( \log p(y^{\mathrm{new}} \, | \, \theta^{(m)}) \right). -\] +$$ We'll then write that last expression as -\[ +$$ -\log M + \mathrm{log\_sum\_exp \, }_{m=1}^M \ \log p(y^{\mathrm{new}} \, | \, \theta^{(m)}) -\] +$$ We can compute $\mathrm{log\_sum\_exp}$ stably by subtracting the max value. Suppose $u = u_1, \ldots, u_M$, and $\max(u)$ is the largest $u_m$. We can calculate -\[ +$$ \mathrm{log\_sum\_exp \, }_{m=1}^M \ u_m \ = \ \log \sum_{m=1}^M \exp(u_m) \ = \ \max(u) + \log \sum_{m=1}^M \exp(u_m - \max(u)). -\] +$$ Because $u_m - \max(u) \leq 0$, the exponentiations cannot overflow. They may underflow to zero, but this will not lose precision because @@ -1407,15 +1537,15 @@ and then use it to print the log posterior predictive densities for our fit. 
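A minimal R implementation of `log_sum_exp`, following the identity above (a sketch assuming only base R; the next chunk calls this helper):

```{r}
# numerically stable log-sum-exp: factor out the max before exponentiating
log_sum_exp <- function(u) {
  max_u <- max(u);
  max_u + log(sum(exp(u - max_u)));
}
```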
```{r comment=NA} -print_new_lp <- function(name, ss) { - lp <- -log(M) + log_sum_exp(ss$log_p_new); - print(sprintf("%25s %5.1f", name, lp), quote=FALSE); +print_new_lp <- function(name, df) { + lp <- -log(nrow(df)) + log_sum_exp(df$log_p_new); + message(sprintf("%25s %5.1f", name, lp)); } -print_new_lp("complete pooling", ss_pool); -print_new_lp("no pooling", ss_no_pool); -print_new_lp("partial pooling", ss_hier); -print_new_lp("partial pooling (logit)", ss_hier_logit); +print_new_lp("complete pooling", pool_df); +print_new_lp("no pooling", no_pool_df); +print_new_lp("partial pooling", hier_df); +print_new_lp("partial pooling (logit)", hier_logit_df); ``` Now the ranking is different! As expected, the values here are lower @@ -1441,14 +1571,7 @@ pseudorandom number generator, which corresponds to the binomial sampling distribution (likelihood) in this case. ``` -generated quantities { - ... - int z[N]; // posterior prediction remaining trials - ... - for (n in 1:N) - z[n] <- binomial_rng(K_new[n], theta[n]); - ... -} + array[N] int z = binomial_rng(K_new, theta); ``` This formulation makes it clear that there are two sources of @@ -1472,19 +1595,25 @@ sense we define below). The number of remaining at-bats $K^{\mathrm{new}}$ was printed out in the original table along with the actual number of hits. -```{r comment=NA} -print(fit_pool, c("z"), probs=c(0.1, 0.5, 0.9), digits=0); +```{r} +print(df) +``` + +We examine the 80% intervals on the basic hierarchical model for z, the posterior predictions for the rest of the season. + +```{r} +fit_hier$print('z', "mean", "median", ~quantile(.x, probs = c(0.1, 0.9)), max_rows=18) ``` Translating the posterior number of hits into a season batting average, $\frac{y_n + z_n}{K_n + K^{\mathrm{new}}_n}$, we get an 80% posterior interval of -\[ -\left( \frac{18 + 93}{45 + 367}, \frac{18 + 147}{45 + 367} \right) +$$ +\left( \frac{18 + 93}{45 + 367}, \frac{18 + 145}{45 + 367} \right) \ = \ (0.269, 0.400). -\] +$$ for Roberto Clemente in the basic hierarchical model. Part of our uncertainty here is due to our uncertainty in Clemente's underlying @@ -1510,7 +1639,8 @@ predictive 50% interval for predicted batting average (success rate) in his remaining at bats (trials); the observed success rate in the remainder of the season is shown as a blue dot. 
-```{r comment=NA} +```{r comment=NA, out.width="110%"} +# N is number of players y_new_25_pool <- c(NA, N); y_new_25_no_pool <- c(NA, N); y_new_25_hier <- c(NA, N); @@ -1519,16 +1649,24 @@ y_new_75_pool <- c(NA, N); y_new_75_no_pool <- c(NA, N); y_new_75_hier <- c(NA, N); y_new_75_hier_logit <- c(NA, N); + + +zs_pool <- matrix(fit_pool$draws('z'), M, N); +zs_no_pool <- matrix(fit_no_pool$draws('z'), M, N); +zs_hier <- matrix(fit_hier$draws('z'), M, N); +zs_hier_logit <- matrix(fit_hier_logit$draws('z'), M, N); + for (n in 1:N) { - y_new_25_pool[n] <- quantile(ss_pool$z[,n], 0.25)[[1]]; - y_new_25_no_pool[n] <- quantile(ss_no_pool$z[,n], 0.25)[[1]]; - y_new_25_hier[n] <- quantile(ss_hier$z[,n], 0.25)[[1]]; - y_new_25_hier_logit[n] <- quantile(ss_hier_logit$z[,n], 0.25)[[1]]; - y_new_75_pool[n] <- quantile(ss_pool$z[,n], 0.75)[[1]]; - y_new_75_no_pool[n] <- quantile(ss_no_pool$z[,n], 0.75)[[1]]; - y_new_75_hier[n] <- quantile(ss_hier$z[,n], 0.75)[[1]]; - y_new_75_hier_logit[n] <- quantile(ss_hier_logit$z[,n], 0.75)[[1]]; + y_new_25_pool[n] <- quantile(zs_pool[, n], 0.25)[[1]]; + y_new_25_no_pool[n] <- quantile(zs_no_pool[, n], 0.25)[[1]]; + y_new_25_hier[n] <- quantile(zs_hier[, n], 0.25)[[1]]; + y_new_25_hier_logit[n] <- quantile(zs_hier_logit[, n], 0.25)[[1]]; + y_new_75_pool[n] <- quantile(zs_pool[, n], 0.75)[[1]]; + y_new_75_no_pool[n] <- quantile(zs_no_pool[, n], 0.75)[[1]]; + y_new_75_hier[n] <- quantile(zs_hier[, n], 0.75)[[1]]; + y_new_75_hier_logit[n] <- quantile(zs_hier_logit[, n], 0.75)[[1]]; } + y_new_25_pool <- y_new_25_pool / K_new; y_new_25_no_pool <- y_new_25_no_pool / K_new; y_new_25_hier <- y_new_25_hier / K_new; @@ -1543,7 +1681,7 @@ df_post_pred <- data.frame(x = rep(1:N, 4), model = c(rep("complete pooling", N), rep("no pooling", N), rep("partial pooling", N), - rep("partial pooling (log odds)", N))); + rep("partial pooling, log odds", N))); plot_post_pred <- ggplot(df_post_pred, aes(x=x, y=y)) + geom_point(colour="darkblue", size=1) + @@ -1556,9 +1694,9 @@ plot_post_pred <- scale_x_continuous(breaks=c()) + xlab("player ID") + ylab("batting average") + - ggtitle(expression( - atop("Posterior Predictions for Batting Average in Remainder of Season", - atop("50% posterior predictive intervals (gray bars); observed (blue dots)", "")))); + ggtitle("Posterior Predictions for Batting Average in Remainder of Season", + subtitle="50% posterior predictive intervals (gray bars); observed (blue dots)") + + theme_jupyter(); plot_post_pred; ``` @@ -1648,7 +1786,7 @@ over parameters and data. For example, the probability of player $n$'s batting average being 0.400 or better conditioned on the data $y$ is defined by the conditional event probability -\[ +$$ \mathrm{Pr}\left[ \frac{(y_n + z_n)}{(45 + K^{\mathrm{new}}_n)} \geq 0.400 \, \Big| \, @@ -1660,19 +1798,19 @@ y \ p(z_n \, | \, \theta_n, K^{\mathrm{new}}_n) \ p(\theta \, | \, y, K) \ \mathrm{d}\theta. -\] +$$ The indicator function $\mathrm{I}[c]$ evaluates to 1 if the condition $c$ is true and 0 if it is false. Because it is just another expectation with respect to the posterior, we can calculate this event probability using MCMC as -\[ +$$ \mathrm{Pr}\left[\frac{(y_n + z_n)}{(45 + K^{\mathrm{new}}_n)} \geq 0.400 \, \Big| \, y \right] \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}\left[\frac{(y_n + z_n^{(m)})}{(45 + K^{\mathrm{new}}_n)} \geq 0.400\right]. -\] +$$ This event is about the season batting average being greater than 0.400. 
What if we care about ability (chance of success), not batting @@ -1681,7 +1819,7 @@ the question of whether $\mathrm{Pr}[\theta_n > 0.4]$. This is defined as a weighted average over the prior and computed via MCMC as the previous case. -\[ +$$ \mathrm{Pr}\left[\theta_n \geq 0.400 \, | \, y \right] \ = \ \int_{\Theta} @@ -1690,25 +1828,26 @@ the previous case. \ \mathrm{d}\theta \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[\theta_n^{(m)} \geq 0.400]. -\] - -In Stan, we just declare and define the indicators directly in the -generated quantities block. +$$ +This logic is part of the include file `gq-postpred.stan`. ``` -generated quantities { - ... - int some_ability_gt_350; // Pr[some theta > 0.35] - int avg_gt_400[N]; // Pr[season avg of n] >= 0.400 - int ability_gt_400[N]; // Pr[chance-of-success of n] >= 0.400 - ... - some_ability_gt_350 <- (max(theta) > 0.35); - for (n in 1:N) - avg_gt_400[n] <- (((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400); - for (n in 1:N) - ability_gt_400[n] <- (theta[n] > 0.400); - ... -} + // posterior predictions on new data + array[N] int z = binomial_rng(K_new, theta); + + // Pr[some theta > 0.35] + int some_ability_gt_350 = max(theta) > 0.35; + // Pr[some player ability > 400] + array[N] int ability_gt_400; + for (n in 1 : N) { + ability_gt_400[n] = theta[n] > 0.400; + } + // Pr[some player season average > 400] + array[N] int avg_gt_400; + for (n in 1 : N) { + // y[n] - observed, z[n] - expected hits in rest of season + avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400; + } ``` The indicator function is not explicitly specified because Stan's @@ -1722,23 +1861,27 @@ summary. We can summarize for all four models again. Only the event indicator variables are printed---the parameter estimates will be the same as before. +The standard deviation and quantiles are not useful here; +we do want to see a reasonable effective +sample size estimate and no evidence of non-convergence in the form +for $\hat{R}$ values much greater than 1. The NaN values in the +$\hat{R}$ column arise when every posterior draw is the same; there is +zero sample variance, and so $\hat{R}$ is not defined. + ```{r comment=NA} pars_to_print <- c("some_ability_gt_350", "avg_gt_400[1]","avg_gt_400[5]", "avg_gt_400[10]", "ability_gt_400[1]", "ability_gt_400[5]", "ability_gt_400[10]"); -print(fit_pool, pars=pars_to_print, probs=c()); -print(fit_no_pool, pars=pars_to_print, probs=c()); -print(fit_hier, pars=pars_to_print, probs=c()); -print(fit_hier_logit, pars=pars_to_print, probs=c()); -``` -The standard deviation and quantiles are not useful here and the -quantiles are suppressed through an empty probs argument -to print(); we do want to see a reasonable effective -sample size estimate and no evidence of non-convergence in the form -for $\hat{R}$ values much greater than 1. The NaN values in the -$\hat{R}$ column arise when every posterior draw is the same; there is -zero sample variance, and so $\hat{R}$ is not defined. 
+message("complete pooling (note: all players have estimated ability 0.27)"); +fit_pool$print(pars_to_print); +message("no pooling"); +fit_no_pool$print(pars_to_print); +message("partial pooling"); +fit_hier$print(pars_to_print); +message("partial pooling, log odds"); +fit_hier_logit$print(pars_to_print); +``` These results show that the probability of batting 0.400 or better for the season is a different question than asking if the player's ability @@ -1804,11 +1947,11 @@ The generated quantity some_ability_gt_350 will be set to And thus the posterior mean of this generated quantity will be the event probability -\[ +$$ \mathrm{Pr}[\mathrm{max}(\theta) > 0.350] \ = \ \int_{\Theta} \mathrm{I}[\mathrm{max}(\theta) > 0.35] \ p(\theta \, | \, y, K) \ \mathrm{d}\theta \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \ \mathrm{I}[\mathrm{max}(\theta^{(m)}) > 0.35] -\] +$$ where $\theta^{(m)}$ is the sequence of posterior draws for the ability parameter vector. Stan reports this value as the posterior @@ -1839,19 +1982,21 @@ rank (a value ranging from $1$ to $N$) of the second player. First, we define a local variable dsc, and sort it into descending order. +We encapsulate this logic into its own program file "gq-ranking.stan" +and include it in the model's generated quantities block. ``` -generated quantities { - ... - int rnk[N]; // rnk[n] is rank of player n - ... + array[N] int rnk; // rank of player n { - int dsc[N]; - dsc <- sort_indices_desc(theta); - for (n in 1:N) - rnk[dsc[n]] <- n; + array[N] int dsc; + dsc = sort_indices_desc(theta); + for (n in 1 : N) { + rnk[dsc[n]] = n; + } + } + array[N] int is_best; // Pr[player n highest chance of success] + for (n in 1 : N) { + is_best[n] = rnk[n] == 1; } - ... -} ``` After the call to sort_indices_desc, dsc[n] @@ -1873,7 +2018,7 @@ pooling model, where every player is assumed to have the same ability. We can print just the ranks and the 80% central interval for the basic pooling model. ```{r comment=NA} -print(fit_hier, "rnk", probs=c(0.1, 0.5, 0.9)); +fit_hier$print("rnk", "mean", "median", ~quantile(.x, probs = c(0.1, 0.9)), max_rows=18); ``` It is again abundantly clear from the posterior intervals that our @@ -1885,14 +2030,15 @@ mortality, the posterior distribution over ranks was plotted for each hospital. It is now straightforward to reproduce that figure here for the baseball data. 
-```{r comment=NA} -library(ggplot2); +```{r comment=NA, out.width="110%"} +rnk_hier <- matrix(fit_hier$draws('rnk'), M, N); df_rank <- data.frame(list(name = rep(as.character(df[[1,2]]), M), - rank = ss_hier$rnk[, 1])); + rank = rnk_hier[, 1])); + for (n in 2:N) { df_rank <- rbind(df_rank, data.frame(list(name = rep(as.character(df[[n,2]]), M), - rank = ss_hier$rnk[, n]))); + rank = rnk_hier[, n]))); } rank_plot <- ggplot(df_rank, aes(rank)) + @@ -1901,7 +2047,9 @@ rank_plot <- scale_x_discrete(limits=c(1, 5, 10, 15)) + scale_y_discrete(name="posterior probability", breaks=c(0, 0.1 * M, 0.2 * M), labels=c("0.0", "0.1", "0.2")) + - ggtitle("Rankings for Partial Pooling Model"); + ggtitle("Rankings for Partial Pooling Model") + + theme_jupyter(); +#options(repr.plot.width=20, repr.plot.height=16) rank_plot; ``` @@ -1916,7 +2064,7 @@ We can use our ranking statistic to calculate the event probability for item $n$ that the item has the highest chance of success using MCMC as -\[ +$$ \mathrm{Pr}[\theta_n = \max(\theta)] \ = \ \int_{\Theta} \mathrm{I}[\theta_n = \mathrm{max}(\theta)] @@ -1924,7 +2072,7 @@ MCMC as \ \mathrm{d}\theta \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[\theta^{(m)}_n = \mathrm{max}(\theta^{(m)})]. -\] +$$ Like our other models, the partial pooling mitigates the implicit multiple comparisons being done to calculate the probabilities of @@ -1936,18 +2084,13 @@ already computed or we could compute it directly as above. Because $\mathrm{Pr}[\theta_n = \theta_{n'}] = 0$ for $n \neq n'$, we don't have to worry about ties. -We define a new generated quantity for the value of the indicator and -define it using the already-computed ranks. - +The generated quantities variable `is_best` is defined +using the already-computed ranks. ``` -generated quantities { - ... - int is_best[N]; // Pr[player n highest chance of success] - ... - for (n in 1:N) - is_best[n] <- (rnk[n] == 1); - ... -} + array[N] int is_best; // Pr[player n highest chance of success] + for (n in 1 : N) { + is_best[n] = rnk[n] == 1; + } ``` This means that is_best[n] will be 1 if player @@ -1963,18 +2106,22 @@ probability. We can then plot the results for the four models. -```{r comment=NA} -df_is_best_for <- function(name, ss) { +```{r comment=NA, out.width="80%"} +df_is_best_for <- function(name, draws) { is_best <- rep(NA, N); for (n in 1:N) { - is_best[n] <- mean(ss$is_best[,n]); + is_best[n] <- mean(draws[,n]); } return(data.frame(list(item=1:N, is_best=is_best, model=name))); } -df_is_best <- rbind(df_is_best_for("no pool", ss_no_pool), - df_is_best_for("hier", ss_hier), - df_is_best_for("hier (log odds)", ss_hier_logit)); +best_no_pool <- matrix(fit_no_pool$draws('is_best'), M, N); +best_hier <- matrix(fit_hier$draws('is_best'), M, N); +best_hier_logit <- matrix(fit_hier_logit$draws('is_best'), M, N); + +df_is_best <- rbind(df_is_best_for("no pooling", best_no_pool), + df_is_best_for("partial pooling", best_hier), + df_is_best_for("partial pooling, log odds", best_hier_logit)); is_best_plot <- ggplot(df_is_best, aes(x=item, y=is_best)) + @@ -1982,7 +2129,9 @@ is_best_plot <- facet_wrap(~ model) + scale_y_continuous(name = "Pr[player is best]") + scale_x_discrete(name="player", breaks=c(1, 5, 10, 15)) + - ggtitle("Who is the Best Player?"); + ggtitle("Who is the Best Player?") + + theme_jupyter(); +#options(repr.plot.width=12, repr.plot.height=4) is_best_plot; ``` @@ -2031,9 +2180,9 @@ sample standard deviation. 
Given a test statistic $T$ and data $y$, the Bayesian $p$-value has a direct definition as a probability, -\[ +$$ p_B = \mathrm{Pr}[T(y^{\mathrm{rep}}) \geq T(y) \, | \, y]. -\] +$$ Bayesian $p$-values, like their traditional counterparts, are probabilities, but not probabilities that a model is true. They @@ -2045,11 +2194,11 @@ used to estimate the model is unlikely to have been generated by the estimated model. As with other forms of full Bayesian inference, our estimate is the full posterior, not just a point estimate. -As with other Bayesain inferences, we average over the posterior +As with other Bayesian inferences, we average over the posterior rather than working from a point estimate of the parameters. Expanding this as an expectation of an indicator function, -\[ +$$ p_B \ = \ \int_{\Theta, Y^{\mathrm{rep}}} @@ -2057,7 +2206,7 @@ p_B \ p(y^{\mathrm{rep}} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}\theta, -\] +$$ We treat $y^{\mathrm{rep}}$ as a parameter in parallel with $\theta$, integrating over possible values $y^{\mathrm{rep}} \in @@ -2065,18 +2214,18 @@ Y^{\mathrm{rep}}$. As usual, we use the integration sign in a general way intended to include summation, as with the discrete variable $y^{\mathrm{rep}}$. -The formualation as an expectation leads to the obvious MCMC +The formulation as an expectation leads to the obvious MCMC calculation based on posterior draws $y^{\mathrm{rep} (m)}$ for $m \in 1{:}M$, -\[ +$$ p_B \approx \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[T(y^{\mathrm{rep} \ (m)}) \geq T(y)]. -\] +$$ In Stan, the test statistics are first defined for the observed data. @@ -2088,81 +2237,56 @@ all possible. ``` transformed data { - real min_y; // minimum successes - real max_y; // maximum successes - real mean_y; // sample mean successes - real sd_y; // sample std dev successes - - min_y <- min(y); - max_y <- max(y); - mean_y <- mean(to_vector(y)); - sd_y <- sd(to_vector(y)); + real min_y = min(y); // minimum successes + real max_y = max(y); // maximum successes + real mean_y = mean(to_vector(y)); // sample mean successes + real sd_y = sd(to_vector(y)); // sample std dev successes } ``` - -The code to generate the replicated data is in the generated -quantities block. - -``` -generated quantities { - ... - int y_rep[N]; // replications for existing items - ... - for (n in 1:N) - y_rep[n] <- binomial_rng(K[n], theta[n]); - ... -} -``` - -Then the test statistics are defined in the generated quantities -block. - -``` -generated quantities { - ... - real min_y_rep; // posterior predictive min replicated successes - real max_y_rep; // posterior predictive max replicated successes - real mean_y_rep; // posterior predictive sample mean replicated successes - real sd_y_rep; // posterior predictive sample std dev replicated successes - ... - min_y_rep <- min(y_rep); - max_y_rep <- max(y_rep); - mean_y_rep <- mean(to_vector(y_rep)); - sd_y_rep <- sd(to_vector(y_rep)); - ... -} -``` - -Finally, the actual $p$-value indicators are computed and assigned to -integer values. The calls to to_vector() convert an +The calls to to_vector() convert an array of integers to an array of real values so that they are appropriately typed to be the input to the standard deviation calculation. +Then the posterior predictive test statistics are defined in the generated quantities +block and the actual $p$-value indicators are computed and assigned to +integer values. +The last part of included file "gq-postpred.stan" computes these values. ``` -generated quantities { - ... 
- int p_min; // posterior predictive p-values - int p_max; - int p_mean; - int p_sd; - ... - p_min <- (min_y_rep >= min_y); - p_max <- (max_y_rep >= max_y); - p_mean <- (mean_y_rep >= mean_y); - p_sd <- (sd_y_rep >= sd_y); - ... -} + // replications for existing items + array[N] int y_rep = binomial_rng(K, theta); + + // posterior predictive min replicated successes, test stat p_val + real min_y_rep = min(y_rep); + int p_min = min_y_rep >= min_y; + + // posterior predictive max replicated successes, test stat p_val + real max_y_rep = max(y_rep); + int p_max = max_y_rep >= max_y; + + // posterior predictive sample mean replicated successes, test stat p_val + real mean_y_rep = mean(to_vector(y_rep)); + int p_mean = mean_y_rep >= mean_y; + + // posterior predictive sample std dev replicated successes, test stat p_val + real sd_y_rep = sd(to_vector(y_rep)); + int p_sd = sd_y_rep >= sd_y; ``` The fit from the Stan program can then be used to display the Bayesian $p$-values for each of the models. ```{r comment=NA} -print(fit_pool, c("p_min", "p_max", "p_mean", "p_sd"), probs=c()); -print(fit_no_pool, c("p_min", "p_max", "p_mean", "p_sd"), probs=c()); -print(fit_hier, c("p_min", "p_max", "p_mean", "p_sd"), probs=c()); -print(fit_hier_logit, c("p_min", "p_max", "p_mean", "p_sd"), probs=c()); +pars_to_print <- c("p_min", "p_max", "p_mean", "p_sd"); + +message("complete pooling"); +fit_pool$print(pars_to_print); +message("no pooling"); +fit_no_pool$print(pars_to_print); +message("partial pooling"); +fit_hier$print(pars_to_print); +message("partial pooling, log odds"); +fit_hier_logit$print(pars_to_print); ``` The only worrisomely extreme value is the $p$-value for standard @@ -2177,7 +2301,7 @@ shows the posterior predictive distribution for the test statistic, the observed value as a vertical line, and the p-value for each of the tests. Here is the plot for the basic hierarchical model. 
-```{r comment=NA} +```{r comment=NA, out.width="120%"} y_min <- min(y); y_max <- max(y); y_mean <- mean(y); @@ -2191,23 +2315,23 @@ pvals_frame <- function(ss, model_name) { replication = ss$max_y_rep), model = rep(model_name, M)); df_pvals_mean <- data.frame(list(test_stat = rep("mean", M), - replication = ss$mean_y_rep), - model = rep(model_name, M)); + replication = ss$mean_y_rep), + model = rep(model_name, M)); df_pvals_sd <- data.frame(list(test_stat = rep("sd", M), replication = ss$sd_y_rep), model = rep(model_name, M)); return(rbind(df_pvals_min, df_pvals_max, df_pvals_mean, df_pvals_sd)); } -df_pvals <- rbind(pvals_frame(ss_hier, "partial pool"), - pvals_frame(ss_hier_logit, "partial (logit)"), - pvals_frame(ss_pool, "complete pool"), - pvals_frame(ss_no_pool, "no pool")); +df_pvals <- rbind(pvals_frame(hier_df, "partial pooling"), + pvals_frame(hier_logit_df, "partial pooling, log odds"), + pvals_frame(pool_df, "complete pooling"), + pvals_frame(no_pool_df, "no pooling")); post_test_stat_plot <- ggplot(df_pvals, aes(replication)) + facet_grid(model ~ test_stat) + - geom_histogram(binwidth = 0.5, colour="black", size = 0.25, fill="white") + + geom_histogram(binwidth = 0.5, colour="black", size = 0.5, fill="darkgrey") + theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank()) + @@ -2217,8 +2341,10 @@ post_test_stat_plot <- rep(y_mean, M), rep(y_sd, M)), 4), test_stat = df_pvals$test_stat, replication = df_pvals$replication), - colour = "blue", size = 0.25) + - ggtitle("Posterior p-values") + colour = "blue", size = 0.5) + + ggtitle("Posterior p-values") + + theme_jupyter(); +#options(repr.plot.width=16, repr.plot.height=16) post_test_stat_plot; ``` @@ -2285,8 +2411,10 @@ We can now print the replicated values y_rep in both the hierarchical and hierarchical logit models. ```{r comment=NA} -print(fit_hier, c("y_rep", "y_pop_rep[1]"), probs=c(0.1, 0.5, 0.9)); -print(fit_hier_logit, c("y_rep", "y_pop_rep[1]"), probs=c(0.1, 0.5, 0.9)); +message("partial pooling"); +fit_hier$print(c("y_rep", "y_pop_rep[1]"), max_rows=18); +message("partial pooling, log odds"); +fit_hier_logit$print(c("y_rep", "y_pop_rep[1]"), max_rows=18); ``` The variable y_rep is ordered because it's based on the @@ -2305,10 +2433,12 @@ fifteen for display. ```{r comment=NA} df_post <- data.frame(list(dataset = rep("REAL", N), y = y)); +y_rep_hier_logit <- matrix(fit_hier_logit$draws('y_rep'), M, N); + for (n in 1:15) { df_post <- rbind(df_post, data.frame(list(dataset = rep(paste("repl ", n), N), - y = ss_hier_logit$y_rep[n,]))); + y = y_rep_hier_logit[n,]))); } post_plot <- ggplot(df_post, aes(y)) + @@ -2322,13 +2452,16 @@ post_plot; And now we can do the same thing for the population-level replication; because the code's the same, we do not echo it to the output. -```{r comment=NA, echo=FALSE} +```{r comment=NA} df_pop_post <- data.frame(list(dataset = rep("REAL", N), y = y)); + +y_pop_rep_hier_logit <- matrix(fit_hier_logit$draws('y_pop_rep'), M, N); + for (n in 1:15) { df_pop_post <- rbind(df_pop_post, data.frame(list(dataset = rep(paste("repl ", n), N), - y = ss_hier_logit$y_pop_rep[n,]))); + y = y_pop_rep_hier_logit[n,]))); } post_pop_plot <- ggplot(df_pop_post, aes(y)) + @@ -2357,13 +2490,14 @@ bats." Here are the same plots for the basic hierarchical model. This time we suppress printing the ggplot code, which is the same as before, and just show the graphs. 
-```{r comment=NA, echo=FALSE} +```{r echo=FALSE, comment=NA} df_post_beta <- data.frame(list(dataset = rep("REAL", N), y = y)); +y_rep_hier <- matrix(fit_hier$draws('y_rep'), M, N); for (n in 1:15) { df_post_beta <- rbind(df_post_beta, data.frame(list(dataset = rep(paste("repl ", n), N), - y = ss_hier$y_rep[n,]))); + y = y_rep_hier[n,]))); } post_plot_beta <- ggplot(df_post_beta, aes(y)) + @@ -2375,13 +2509,14 @@ post_plot_beta <- post_plot_beta; ``` -```{r comment=NA, echo=FALSE} +```{r echo=FALSE, comment=NA} df_pop_post_beta <- data.frame(list(dataset = rep("REAL", N), y = y)); +y_pop_rep_hier <- matrix(fit_hier$draws('y_pop_rep'), M, N); for (n in 1:15) { df_pop_post_beta <- rbind(df_pop_post_beta, data.frame(list(dataset = rep(paste("repl ", n), N), - y = ss_hier$y_pop_rep[n,]))); + y = y_pop_rep_hier[n,]))); } post_pop_plot_beta <- ggplot(df_pop_post_beta, aes(y)) + @@ -2415,7 +2550,7 @@ matched hyperpriors; see the exercises). With very little data, there is very little we can do to gain sharp inferences other than provide more informative priors, which is well worth doing when prior information is available. On the other hand, with more data, the -models provide similar results (see the exericses), and in the limit, +models provide similar results (see the exercises), and in the limit, all of the models (other than complete pooling) converge to posteriors that are delta functions around the empirical chance of success (i.e., the maximum likelihood estimate). Meanwhile, @@ -2534,7 +2669,7 @@ contrast to the basic partial pooling model. our model fit to data using Bayesian $p$-Values$? Code some up and evaluate on one of the data sets given here or on simulated data. -1. Discuss why the lack of an alternative hypotehsis makes it +1. Discuss why the lack of an alternative hypothesis makes it impossible to perform power calculations with Bayesian $p$-values. What are the implications for hypothesis testing for model fit? @@ -2683,7 +2818,7 @@ Major League Baseball. 
The data was originally downloaded from the #### pool.stan -```{r comment=NA, echo=FALSE, comment=NA} +```{r echo=FALSE, comment=NA} file_path <- "pool.stan"; lines <- readLines(file_path, encoding="ASCII"); for (n in 1:length(lines)) cat(lines[n],'\n'); @@ -2691,7 +2826,7 @@ for (n in 1:length(lines)) cat(lines[n],'\n'); #### no-pool.stan -```{r comment=NA, echo=FALSE} +```{r echo=FALSE, comment=NA} file_path <- "no-pool.stan"; lines <- readLines(file_path, encoding="ASCII"); for (n in 1:length(lines)) cat(lines[n],'\n'); @@ -2700,7 +2835,7 @@ for (n in 1:length(lines)) cat(lines[n],'\n'); #### hier.stan -```{r comment=NA, echo=FALSE} +```{r echo=FALSE, comment=NA} file_path <- "hier.stan"; lines <- readLines(file_path, encoding="ASCII"); for (n in 1:length(lines)) cat(lines[n],'\n'); @@ -2709,10 +2844,33 @@ for (n in 1:length(lines)) cat(lines[n],'\n'); #### hier-logit.stan -```{r comment=NA, echo=FALSE} +```{r echo=FALSE, comment=NA} file_path <- "hier-logit.stan"; lines <- readLines(file_path, encoding="ASCII"); for (n in 1:length(lines)) cat(lines[n],'\n'); ``` +#### data-blocks.stan + +```{r echo=FALSE, comment=NA} +file_path <- "include/data-blocks.stan"; +lines <- readLines(file_path, encoding="ASCII"); +for (n in 1:length(lines)) cat(lines[n],'\n'); +``` + +#### include-gq-postpred.stan + +```{r echo=FALSE, comment=NA} +file_path <- "include/gq-postpred.stan"; +lines <- readLines(file_path, encoding="ASCII"); +for (n in 1:length(lines)) cat(lines[n],'\n'); +``` + +#### include-gq-ranking.stan +> +```{r echo=FALSE, comment=NA} +file_path <- "include/gq-ranking.stan"; +lines <- readLines(file_path, encoding="ASCII"); +for (n in 1:length(lines)) cat(lines[n],'\n'); +``` diff --git a/knitr/pool-binary-trials/pool-no-pool.html b/knitr/pool-binary-trials/pool-no-pool.html index d71c7590..e129e900 100644 --- a/knitr/pool-binary-trials/pool-no-pool.html +++ b/knitr/pool-binary-trials/pool-no-pool.html @@ -1,12 +1,13 @@ - + - - + + + @@ -14,34 +15,52 @@ Hierarchical Partial Pooling for Repeated Binary Trials - + + - - - - + + + + + + + + - - - + code{white-space: pre-wrap;} + span.smallcaps{font-variant: small-caps;} + span.underline{text-decoration: underline;} + div.column{display: inline-block; vertical-align: top; width: 50%;} + div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} + ul.task-list{list-style: none;} + + + - - + + + + + + + + + + + + + + + + + + + +

updated for CmdStanR by Mitzi Morris, 19 January 2022

Abstract


This note illustrates the effects on posterior inference of pooling data (aka sharing strength) across items for repeated binary trial data. It provides Stan models and R code to fit and check predictive models for three situations:

  1. complete pooling, which assumes each item is the same
  2. no pooling, which assumes the items are unrelated
  3. partial pooling, where the similarity among the items is estimated, using either
     • a hierarchical model with a beta prior on the chance of success, or
     • a hierarchical model with a normal prior on the log odds of success.

We explain and give working examples of how to:

  • fit models in CmdStanR and plot the results in R using ggplot2,
  • estimate event probabilities,
  • evaluate posterior predictive densities to check model predictions on held-out data,
  • rank items by chance of success,
  • perform multiple comparisons in several settings,
  • replicate new data for posterior p-values, and
  • perform graphical posterior predictive checks.

Packages used in this notebook

+

In this notebook, we will use the CmdStanR interface.

+
# uncomment as needed to install CmdStanR, CmdStan
+#library(devtools);
+#if(!require(cmdstanr)){
+#    devtools::install_github("stan-dev/cmdstanr", dependencies=c("Depends", "Imports"))
+#}
+options(warn=-1)
+options(repr.plot.width=8, repr.plot.height=8)
+options(output.width='90%')
+library(tidyverse)
+
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
+
✓ ggplot2 3.3.5     ✓ purrr   0.3.4
+✓ tibble  3.1.6     ✓ dplyr   1.0.7
+✓ tidyr   1.1.3     ✓ stringr 1.4.0
+✓ readr   2.0.0     ✓ forcats 0.5.1
+
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+x dplyr::filter() masks stats::filter()
+x dplyr::lag()    masks stats::lag()
+
library(ggplot2)
+library(posterior)
+
This is posterior version 1.2.0
+

+Attaching package: 'posterior'
+
The following objects are masked from 'package:stats':
+
+    mad, sd, var
+
library(cmdstanr)
+
This is cmdstanr version 0.4.0.9001
+
- CmdStanR documentation and vignettes: mc-stan.org/cmdstanr
+
- CmdStan path: /Users/mitzi/.cmdstan/hide-cmdstan-develop
+
- CmdStan version: 2.28.1
+

+A newer version of CmdStan is available. See ?install_cmdstan() to install it.
+To disable this check set option or environment variable CMDSTANR_NO_VER_CHECK=TRUE.

Repeated Binary Trials

Suppose that for each of \(N\) items \(n \in 1{:}N\), we observe \(y_n\) successes out of \(K_n\) trials. For example, the data may consist of

  • rat tumor development, with \(y_n\) rats developing tumors of \(K_n\) total rats in experimental control group \(n \in 1{:}N\) (Tarone 1982)

  • surgical mortality, with \(y_n\) surgical patients dying in \(K_n\) surgeries for hospitals \(n \in 1{:}N\) (Spiegelhalter et al. 1996)

  • baseball batting ability, with \(y_n\) hits in \(K_n\) at bats for baseball players \(n \in 1{:}N\) (Efron and Morris 1975; Carpenter 2009)

  • machine learning system accuracy, with \(y_n\) correct classifications out of \(K_n\) examples for systems \(n \in 1{:}N\) (ML conference proceedings; Kaggle competitions)
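
Concretely, data of this form is easy to simulate in R; here is a minimal sketch with hypothetical values (N_sim, K_sim, and theta_sim are made up for illustration):

N_sim <- 5
K_sim <- rep(45, N_sim)                             # trials per item
theta_sim <- runif(N_sim, 0.2, 0.35)                # hypothetical chances of success
y_sim <- rbinom(N_sim, size=K_sim, prob=theta_sim)  # successes per item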


We use the small baseball data set of Efron and Morris (1975) as a running example, and in the same format provide the rat control data of Tarone (1982), the surgical mortality data of Spiegelhalter et al. (1996) and the extended baseball data set of Carpenter (2009).

Baseball Hits (Efron and Morris 1975)

As a running example, we include the data from Table 1 of (Efron and Morris 1975) as efron-morris-75-data.tsv (it was downloaded 24 Dec 2015 from here). It is drawn from the 1970 Major League Baseball season from both leagues.


We will only need a few columns of the data; we will be using the remaining hits and at bats to evaluate the predictive inferences for the various models.

+
df <- read.csv("efron-morris-75-data.tsv", sep="\t")
 df <- with(df, data.frame(FirstName, LastName, 
                           Hits, At.Bats, 
                           RemainingAt.Bats,
                          RemainingHits = SeasonHits - Hits))
print(df)
   FirstName   LastName Hits At.Bats RemainingAt.Bats RemainingHits
 1    Roberto   Clemente   18      45              367           127
 2      Frank   Robinson   17      45              426           127
@@ -110,12 +280,6 @@ 

Baseball Hits (Efron and Morris 1975)

16       Bert Campaneris    9      45              558           159
17    Thurman     Munson    8      45              408           129
18        Max      Alvis    7      45               70            14

The data separates the outcomes of the initial 45 at-bats from the outcomes of the rest of the season. Once the data variables are defined (in the code below), N is the number of items (players). Then for each item n, K[n] is the number of initial trials (at-bats), y[n] is the number of initial successes (hits), K_new[n] is the remaining number of trials (remaining at-bats), and y_new[n] is the number of successes in the remaining trials (remaining hits).

The remaining data can be used to evaluate the predictive performance of our models conditioned on the observed data. That is, we will “train” on the first 45 at bats and see how well our various models do at predicting the rest of the season.
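
In R terms, the train/evaluate split is just a matter of which columns we use; a minimal sketch using the data frame defined above (the variable names here are ours):

# observed success rate in the training split (first 45 at bats)
batting_avg_init <- df$Hits / df$At.Bats
# observed success rate in the evaluation split (rest of season)
batting_avg_rest <- df$RemainingHits / df$RemainingAt.Bats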

Although we consider many models, the data is coded as follows for all of them.

@@ -127,7 +291,8 @@

Baseball Hits (Efron and Morris 1975)

  int<lower=0> K_new[N]; // new trials
  int<lower=0> y_new[N]; // new successes
}


We follow the convention of naming our program variables after the variables we use when we write the model out mathematically in a paper. We choose capital letters for integer constants, here N and K, and we use a lowercase y for the main observed variable(s). We try to use these abstract names for data and parameters instead of dataset-specific names such as “hits” and “at-bats” because these basic models are useful in many domains, as mentioned above.

+

Because we shall develop four models, all of which use the same data and transformed data, we create a Stan file, “data-blocks.stan”, which contains just the data and transformed data blocks, and we use Stan’s include directive to share it across all four models. Because the models contain #include directives, we need to specify the include_paths argument when instantiating or compiling the model.
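
A minimal sketch of the corresponding CmdStanR call (the same pattern is used for each of the four models below; “include” is the subdirectory holding the shared files):

pool_model <- cmdstan_model(stan_file='pool.stan', include_paths=c('include'))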

@@ -144,31 +309,24 @@

Model 1: Complete Pooling

  real<lower=0, upper=1> phi; // chance of success (pooled)
}

The consequence of leaving the constraint off is that the program may fail during random initialization or during an iteration because Stan will generate initial values for phi outside of \([0,1]\). Such a specification may appear to work if there are only a small number of such variables, because Stan tries multiple random initial values by default for MCMC; but even so, results may be biased due to numerical arithmetic issues.

Assuming each player’s at-bats are independent Bernoulli trials, the sampling distribution for each player’s number of hits \(y_n\) is modeled as

\[ p(y_n \, | \, \phi) \ = \ \mathsf{Binomial}(y_n \, | \, K_n, \phi). \]

When viewed as a function of \(\phi\) for fixed \(y_n\), this is called the likelihood function.

Assuming each player is independent leads to the complete data likelihood

\[ p(y \, | \, \phi) = \prod_{n=1}^N \mathsf{Binomial}(y_n \, | \, K_n, \phi). \]

We will assume a uniform prior on \(\phi\),

\[ p(\phi) \ = \ \mathsf{Uniform}(\phi \, | \, 0, 1) \ = \ 1. \]

Whether a prior is uniform or not depends on the scale with which the parameter is expressed. Here, the variable \(\phi\) is a chance of success in \([0, 1]\). If we were to consider the log-odds of success, \(\log \frac{\phi}{1 - \phi}\), a uniform prior on log-odds is not the same as a uniform prior on chance of success (they are off by the Jacobian of the transform). A uniform prior on chance of success translates to a unit logistic prior on the log odds (the definition of the unit logistic density can be derived by calculating the Jacobian of the transform).

By default, Stan places a uniform prior over the values meeting the constraints on a parameter. Because phi is constrained to fall in \([0,1]\), there is no need to explicitly specify the uniform prior on \(\phi\).

The likelihood is expressed as a vectorized sampling statement in Stan as

model {
-  ...
-  y ~ binomial(K, phi);
+  y ~ binomial(K, phi); // likelihood
 }

Sampling statements in Stan are syntactic shorthand for incrementing the underlying log density accumulator. Thus the above would produce the same draws as

  increment_log_prob(binomial_log(y, K, phi));

(in current versions of Stan, the same statement is written as target += binomial_lpmf(y | K, phi);).
@@ -177,111 +335,51 @@

Model 1: Complete Pooling

  for (n in 1:N)  
     y[n] ~ binomial(K[n], phi);

In general, Stan will match dimensions, repeating scalars as necessary; any vector or array arguments must be the same size. When used as a function, the result is the sum of the log densities. The vectorized form can be up to an order of magnitude or more faster in some cases, depending on how many repeated calculations can be avoided.


+

The actual Stan program in pool.stan includes many derived quantities that will be used in the rest of this note; see the appendix for the full code of all of the models discussed.

+

We must first instantiate a CmdStanModel object and then call its sample method. The cmdstan_model() function instantiates a CmdStanModel object from a file containing a Stan program. Because these models contain a number of include directives, we use the include_paths argument to pass in the path to the directory that holds the included files.

+
pool_model <- cmdstan_model(stan_file='pool.stan', include_paths=c('include'))
+

We create the R data structures corresponding to the model inputs. In addition, we define the desired size of the MCMC sample; here we set it to \(10,000\) draws.

+
N <- dim(df)[1]
+K <- df$At.Bats
+y <- df$Hits
+K_new <- df$RemainingAt.Bats
+y_new <- df$RemainingHits
+
+M <- 10000  # total draws in the sample
+

We pass the data into the sample method as a list of name, value pairs. By default, CmdStanR runs 4 chains with 1000 warmup and sampling iterations per chain. To generate a sample with a total of \(M = 10000\) draws, we specify the per-chain sampling iterations to be \(M / 4\).

+
baseball_data <- list(N=N, K=K, y=y, K_new=K_new, y_new=y_new)
+fit_pool <- pool_model$sample(data=baseball_data, iter_sampling=M/4, refresh=0)
+
Running MCMC with 4 sequential chains...
+
+Chain 1 finished in 0.1 seconds.
+Chain 2 finished in 0.1 seconds.
+Chain 3 finished in 0.1 seconds.
+Chain 4 finished in 0.1 seconds.
+
+All 4 chains finished successfully.
+Mean chain execution time: 0.1 seconds.
+Total execution time: 0.6 seconds.

The posterior sample for phi can be summarized as follows.


+
fit_pool$print('phi')
+
 variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
+      phi 0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+

The summary statistics begin with the posterior mean and median, followed by the posterior standard deviation (sd) and the median absolute deviation (mad). Then there are the 0.05 and 0.95 quantiles, which provide the boundaries of the central 90% interval. The next two columns are the bulk and tail effective sample sizes; the MCMC standard error on the posterior mean is the posterior standard deviation divided by the square root of the effective sample size. The final column is the \(\hat{R}\) convergence diagnostic; its value will be 1 if the chains have all converged to the same posterior mean and variance (see the Stan Reference Manual and Vehtari et al. 2021 for further discussion of these diagnostics). The \(\hat{R}\) value here is consistent with convergence (i.e., near 1) and the effective sample size is good (well above 10% of the total number of draws).
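
These summaries can also be computed directly from the draws with the posterior package; a minimal sketch using the fit_pool object from above:

phi_draws <- fit_pool$draws("phi")     # iterations x chains x variables array
posterior::summarise_draws(phi_draws)  # mean, median, sd, mad, quantiles, rhat, ess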

+

The result is a posterior mean for \(\phi\) of \(0.27\) with a 90% central posterior interval of \((0.24, 0.29)\).

+
fit_pool$print('theta')
+
##   variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
+##  theta[1]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[2]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[3]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[4]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[5]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[6]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[7]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[8]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[9]  0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+##  theta[10] 0.27   0.27 0.02 0.02 0.24 0.29 1.00     3761     4828
+## 
+##  # showing 10 of 18 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)
+

With more data, such as from more players or from the rest of the season, the posterior approaches a delta function around the maximum likelihood estimate and the central posterior intervals shrink. Nevertheless, even if we know a player’s chance of success exactly, there is a large amount of uncertainty in running \(K\) binary trials with that chance of success; using a binomial model fundamentally bounds our prediction accuracy.

Although this model will be a good baseline for comparison, we have good reason to believe from a large amount of prior data (players with as many as 10,000 trials) that it is very unlikely that all players have the same chance of success.

@@ -317,42 +415,32 @@

Model 2: No Pooling

y[n] ~ binomial(K[n], theta[n]);

The full Stan program, with all of the extra generated quantities, is in no-pool.stan, which is shown in the appendix.

This model can be fit the same way as the last model.

+
no_pool_model <- cmdstan_model(stan_file='no-pool.stan', include_paths=c('include'))
+fit_no_pool <- no_pool_model$sample(data=baseball_data, iter_sampling=M/4, refresh=0)
+

Now we examine the per-player chance of success, theta.

+
fit_no_pool$print('theta', max_rows=18)
+
  variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
+ theta[1]  0.40   0.40 0.07 0.07 0.29 0.52 1.00    24409     7602
+ theta[2]  0.38   0.38 0.07 0.07 0.27 0.50 1.00    25206     7563
+ theta[3]  0.36   0.36 0.07 0.07 0.25 0.48 1.00    25109     7787
+ theta[4]  0.34   0.34 0.07 0.07 0.23 0.46 1.00    22216     7334
+ theta[5]  0.32   0.32 0.07 0.07 0.21 0.43 1.00    22009     6914
+ theta[6]  0.32   0.32 0.07 0.07 0.21 0.43 1.00    25727     6513
+ theta[7]  0.30   0.29 0.07 0.07 0.19 0.42 1.00    23907     6981
+ theta[8]  0.28   0.27 0.06 0.07 0.18 0.39 1.00    25647     7337
+ theta[9]  0.26   0.25 0.06 0.06 0.16 0.37 1.00    23949     7112
+ theta[10] 0.25   0.25 0.06 0.06 0.16 0.36 1.00    20836     7294
+ theta[11] 0.23   0.23 0.06 0.06 0.14 0.34 1.00    21133     7385
+ theta[12] 0.23   0.23 0.06 0.06 0.14 0.34 1.00    23278     7041
+ theta[13] 0.23   0.23 0.06 0.06 0.14 0.34 1.00    22225     7189
+ theta[14] 0.23   0.23 0.06 0.06 0.14 0.34 1.00    22492     6815
+ theta[15] 0.23   0.23 0.06 0.06 0.14 0.34 1.00    26100     7376
+ theta[16] 0.21   0.21 0.06 0.06 0.12 0.32 1.00    22456     6874
+ theta[17] 0.19   0.19 0.06 0.06 0.11 0.29 1.00    21633     7102
+ theta[18] 0.17   0.17 0.05 0.05 0.09 0.27 1.00    22404     7611

Now there is a separate line for each item’s estimated \(\theta_n\). The posterior mode is the maximum likelihood estimate, but that requires running Stan’s optimizer to find; the posterior mean and median will be reasonably close to the posterior mode despite the skewness (the posterior can be shown analytically to be a Beta distribution).
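
If the posterior mode itself is wanted, CmdStanR exposes the optimizer through the model object’s optimize method; a sketch, reusing the no_pool_model and baseball_data objects from above:

mle_no_pool <- no_pool_model$optimize(data=baseball_data)
mle_no_pool$mle()["theta[1]"]  # maximum likelihood estimate for player 1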


+

Each 90% interval is much wider than the estimated interval for the population in the complete pooling model; this is to be expected—there are only 45 data items for each parameter here as opposed to 810 in the complete pooling case. If the items each had different numbers of trials, the intervals would also vary based on size.

+

As the estimated chance of success goes up toward 0.5, the 90% intervals get wider. This is to be expected for chance-of-success parameters, because the standard deviation of a random variable distributed as \(\mathsf{Binomial}(K, \theta)\) is \(\sqrt{\theta \, (1 - \theta) \, K}\).
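
For instance, evaluating this standard deviation at \(K = 45\) trials for a few values of \(\theta\) shows the spread peaking at \(\theta = 0.5\):

theta <- c(0.1, 0.25, 0.5)
sqrt(theta * (1 - theta) * 45)  # 2.01 2.90 3.35 hits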

The no pooling model provides better MCMC mixing than the complete pooling model, as indicated by the effective sample size and the convergence diagnostic \(\hat{R}\); although not in and of itself meaningful, it is often the case that badly misspecified models prove computationally difficult (a result Andrew Gelman has dubbed “The Folk Theorem”).

Based on our existing knowledge of baseball, the no-pooling model is almost certainly overestimating the high abilities and underestimating lower abilities (Ted Williams, 30 years prior to the year this data was collected, was the last player with a 40% observed success rate over a season, whereas 20% is too low for all but a few rare defensive specialists).

@@ -383,7 +471,7 @@

Model 3: Partial Pooling (Chance of Success)

  • \(\phi = \frac{\alpha}{\alpha + \beta}\) is the mean of a variable distributed as \(\mathsf{Beta}(\alpha, \beta)\), and

  • \(\kappa = \alpha + \beta\) is the prior count plus two (roughly inversely related to the variance).

  • -

    We will follow Gelman et al. (2013, Chapter 5) in providing a prior that factors into a uniform prior on \(\phi\),

    +

    We will follow Gelman et al. (2013, Chapter 5) in providing a prior that factors into a uniform prior on \(\phi\),

    \[ p(\phi) = \mathsf{Uniform}(\phi \, | \, 0, 1), \]

    @@ -405,93 +493,143 @@

    Model 3: Partial Pooling (Chance of Success)

y ~ binomial(K, theta); // likelihood
}

    The values of \(\alpha = \phi \, \kappa\) and \(\beta = (1 - \phi) \, \kappa\) are computed in the arguments to the vectorized Beta sampling statement. The prior on \(\phi\) is implicitly uniform because it is explicitly constrained to lie in \([0, 1]\). The prior on \(\kappa\) is coded following the model definition.
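A small R helper (hypothetical, not part of the original scripts) makes the conversion between the two parameterizations explicit:

# convert (phi, kappa) to the conventional Beta parameters (alpha, beta)
phi_kappa_to_ab <- function(phi, kappa) {
  c(alpha = phi * kappa, beta = (1 - phi) * kappa)
}
phi_kappa_to_ab(0.27, 65)   # e.g., alpha = 17.55, beta = 47.45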

    -

    The full model with all generated quantities can be coded in Stan as in the file hier.stan; it is displayed in the appendix. It is run as usual.

    -
    fit_hier <- stan("hier.stan", data=c("N", "K", "y", "K_new", "y_new"),
    -                 iter=(M / 2), chains=4,
    -                 seed=1234);
    -
    Warning: There were 3 divergent transitions after warmup. Increasing
    -adapt_delta above 0.8 may help.
    -
    Warning: Examine the pairs() plot to diagnose sampling problems
    -
    ss_hier <- extract(fit_hier);
    -

    Even though we set results=‘hide’ in the knitr R call here, the error messages are still printed. We set the pseudorandom number generator seed here to 1234 so that we could discuss what the output looks like.

    +

The full model with all generated quantities can be coded in Stan as in the file hier.stan; it is displayed in the appendix. It is run as usual, except that this time we set the pseudorandom number generator seed to 1234 so that we can reproduce the exact same set of draws.

    +
    hier_model <- cmdstan_model(stan_file='hier.stan', include_paths=c('include'))
    +fit_hier <- hier_model$sample(data=baseball_data,
    +                              iter_sampling=M/4,
+                              seed=1234,
+                              refresh=0,
+                              show_messages=FALSE)
    +
    
    +Warning: 8 of 10000 (0.0%) transitions ended with a divergence.
    +This may indicate insufficient exploration of the posterior distribution.
    +Possible remedies include: 
    +  * Increasing adapt_delta closer to 1 (default is 0.8) 
    +  * Reparameterizing the model (e.g. using a non-centered parameterization)
    +  * Using informative or weakly informative prior distributions 
    +

    The sampler prints a warning about divergent transitions. CmdStan’s diagnose utility provides more checks on the MCMC sample.

    +
    fit_hier$cmdstan_diagnose()
    +
    ## Processing csv files: /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-1-857359.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-2-857359.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-3-857359.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-4-857359.csv
    +## 
    +## Checking sampler transitions treedepth.
    +## Treedepth satisfactory for all transitions.
    +## 
    +## Checking sampler transitions for divergences.
    +## 8 of 10000 (0.08%) transitions ended with a divergence.
    +## These divergent transitions indicate that HMC is not fully able to explore the posterior distribution.
    +## Try increasing adapt delta closer to 1.
    +## If this doesn't remove all divergences, try to reparameterize the model.
    +## 
    +## Checking E-BFMI - sampler transitions HMC potential energy.
    +## E-BFMI satisfactory.
    +## 
    +## Effective sample size satisfactory.
    +## 
    +## Split R-hat values satisfactory all parameters.
    +## 
    +## Processing complete.

    Divergent Transitions and Mitigation Strategies

In this case, there’s an error report for a number of what Stan calls “divergent transitions” after warmup. A divergent transition arises when the numerical simulation of the Hamiltonian dynamics breaks down, flagged by a dramatic change in energy along the simulated trajectory. This is usually caused by regions of high posterior curvature, but it can also arise from arithmetic overflow or underflow, or from arguments out of bounds if the proper constraints have not been enforced (for instance, taking an unconstrained parameter as the chance of success for a Bernoulli distribution, which requires its chance of success to be in \([0,1]\)).

Whenever divergent transitions show up, they bias the posterior draws away from the region where the divergences occur. Therefore, we should try to remove divergent transitions before trusting our posterior sample.

    -

    The underlying root cause of most divergent transitions is a step size that is too large. The underlying Hamiltonian Monte Carlo algorithm is attempting to follow the Hamiltonian using discrete time steps corresponding to the step size of the algorithm. When that step size is too large relative to posterior curvature, iteratively taking steps in the gradient times the step size provides a poor approximation to the posterior curvature. The problem with a hierarchical model is that the curvature in the posterior varies based on position; when the hierarchical variance is low, there is high curvature in the lower-level parameters around the mean. During warmup, Stan globally adapts its step size to a target acceptance rate, which can lead to too large step sizes for highly curved regions of the posterior. To mitigate this problem, we need to either reduce the step size or reparameterize. We firt consider reducing step size, and then in the next secton consider the superior alternative of reparameterization.

    -

    To reduce the step size of the algorithm, we want to lower the initial step size and increase the target acceptance rate. The former keeps the step size low to start; the latter makes sure the adapted step size is lower. So we’ll run this again with the same seed, this time lowering the step size (stepsize) and increasing the target acceptance rate (adapt_delta).

    -
    fit_hier <- stan("hier.stan", data=c("N", "K", "y", "K_new", "y_new"),
    -                 iter=(M / 2), chains=4,
    -                 seed=1234, 
    -                 control=list(stepsize=0.01, adapt_delta=0.99));
    -ss_hier <- extract(fit_hier);
    +

The underlying root cause of most divergent transitions is a step size that is too large. The underlying Hamiltonian Monte Carlo algorithm attempts to follow the Hamiltonian dynamics using discrete time steps corresponding to the step size of the algorithm. When that step size is too large relative to the posterior curvature, iteratively stepping along the gradient with that step size provides a poor approximation to the continuous dynamics. The problem with a hierarchical model is that the curvature of the posterior varies with position; when the hierarchical variance is low, there is high curvature in the lower-level parameters around the mean. During warmup, Stan globally adapts its step size to a target acceptance rate, which can leave the step size too large for highly curved regions of the posterior. To mitigate this problem, we need to do one of the following:

  • increase the number of warmup iterations,

  • reduce the step size and increase the target acceptance rate, or

  • reparameterize the model.

    +

    We first consider the combination of the first two options and then in the next section consider the superior alternative of reparameterization.

    +

The default number of warmup iterations is 1000, but for models with highly varying posterior geometry, running more warmup iterations often gives adaptation enough time to settle on a workable step size. To reduce the step size of the algorithm, we want to lower the initial step size and increase the target acceptance rate: the former keeps the step size low to start, and the latter ensures the adapted step size ends up lower. So we’ll run this again with the same seed, this time running 2500 warmup iterations per chain (iter_warmup), lowering the initial step size (step_size), and increasing the target acceptance rate (adapt_delta).

    +
    fit_hier <- hier_model$sample(data=baseball_data,
    +                              iter_sampling=M/4,
    +                              iter_warmup=M/4,
    +                              seed=1234,
    +                              step_size=0.01,
    +                              adapt_delta=0.95,
+                              refresh=0)
    +
    
    +Warning: 2 of 10000 (0.0%) transitions ended with a divergence.
    +This may indicate insufficient exploration of the posterior distribution.
    +Possible remedies include: 
    +  * Increasing adapt_delta closer to 1 (default is 0.8) 
    +  * Reparameterizing the model (e.g. using a non-centered parameterization)
    +  * Using informative or weakly informative prior distributions 

Now the sampler reports only 2 divergent transitions out of 10,000.

    +
    fit_hier$cmdstan_diagnose()
    +
    ## Processing csv files: /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-1-52353a.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-2-52353a.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-3-52353a.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-202201241717-4-52353a.csv
    +## 
    +## Checking sampler transitions treedepth.
    +## Treedepth satisfactory for all transitions.
    +## 
    +## Checking sampler transitions for divergences.
    +## 2 of 10000 (0.02%) transitions ended with a divergence.
    +## These divergent transitions indicate that HMC is not fully able to explore the posterior distribution.
    +## Try increasing adapt delta closer to 1.
    +## If this doesn't remove all divergences, try to reparameterize the model.
    +## 
    +## Checking E-BFMI - sampler transitions HMC potential energy.
    +## E-BFMI satisfactory.
    +## 
    +## Effective sample size satisfactory.
    +## 
    +## Split R-hat values satisfactory all parameters.
    +## 
    +## Processing complete.

    Summary statistics for the posterior are printed as before.

    -
    print(fit_hier, c("theta", "kappa", "phi"), probs=c(0.1, 0.5, 0.9));
    -
    Inference for Stan model: hier.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -            mean se_mean     sd   10%   50%    90% n_eff Rhat
    -theta[1]    0.32    0.00   0.05  0.26  0.32   0.39  3809 1.00
    -theta[2]    0.31    0.00   0.05  0.25  0.31   0.38  3865 1.00
    -theta[3]    0.30    0.00   0.05  0.25  0.30   0.37  5130 1.00
    -theta[4]    0.29    0.00   0.05  0.24  0.29   0.36 10000 1.00
    -theta[5]    0.29    0.00   0.05  0.23  0.28   0.35  7610 1.00
    -theta[6]    0.29    0.00   0.05  0.23  0.28   0.34 10000 1.00
    -theta[7]    0.28    0.00   0.04  0.22  0.27   0.33 10000 1.00
    -theta[8]    0.27    0.00   0.04  0.21  0.27   0.32 10000 1.00
    -theta[9]    0.26    0.00   0.04  0.20  0.26   0.31 10000 1.00
    -theta[10]   0.26    0.00   0.04  0.21  0.26   0.31 10000 1.00
    -theta[11]   0.25    0.00   0.04  0.19  0.25   0.30 10000 1.00
    -theta[12]   0.25    0.00   0.04  0.19  0.25   0.30 10000 1.00
    -theta[13]   0.25    0.00   0.04  0.20  0.25   0.30 10000 1.00
    -theta[14]   0.25    0.00   0.04  0.19  0.25   0.30 10000 1.00
    -theta[15]   0.25    0.00   0.04  0.20  0.25   0.30 10000 1.00
    -theta[16]   0.24    0.00   0.04  0.18  0.24   0.29  6275 1.00
    -theta[17]   0.23    0.00   0.04  0.17  0.23   0.28  5286 1.00
    -theta[18]   0.22    0.00   0.04  0.17  0.22   0.28  4323 1.00
    -kappa     110.35    7.72 182.42 25.39 64.27 210.76   558 1.01
    -phi         0.27    0.00   0.02  0.24  0.27   0.29  4543 1.00
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:57:10 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    +
    fit_hier$print(c("theta", "kappa", "phi"), max_rows=25)
    +
      variable   mean median     sd   mad    q5    q95 rhat ess_bulk ess_tail
    + theta[1]    0.32   0.32   0.05  0.05  0.25   0.41 1.00     4240     6578
    + theta[2]    0.31   0.31   0.05  0.05  0.24   0.40 1.00     5384     6880
    + theta[3]    0.30   0.30   0.05  0.05  0.23   0.39 1.00     6815     7443
    + theta[4]    0.29   0.29   0.05  0.04  0.22   0.38 1.00     8056     6812
    + theta[5]    0.29   0.28   0.05  0.04  0.22   0.36 1.00     9303     6937
    + theta[6]    0.29   0.28   0.04  0.04  0.22   0.37 1.00     8922     6553
    + theta[7]    0.28   0.27   0.04  0.04  0.21   0.35 1.00     9972     6497
    + theta[8]    0.27   0.27   0.04  0.04  0.20   0.34 1.00    10546     7030
    + theta[9]    0.26   0.26   0.04  0.04  0.19   0.33 1.00    10496     6921
    + theta[10]   0.26   0.26   0.04  0.04  0.19   0.33 1.00     9162     6517
    + theta[11]   0.25   0.25   0.04  0.04  0.18   0.32 1.00     7555     5995
    + theta[12]   0.25   0.25   0.04  0.04  0.18   0.32 1.00     8550     6826
    + theta[13]   0.25   0.25   0.04  0.04  0.18   0.32 1.00     9387     6616
    + theta[14]   0.25   0.25   0.04  0.04  0.18   0.32 1.00     8708     6532
    + theta[15]   0.25   0.25   0.04  0.04  0.18   0.32 1.00     8717     6215
    + theta[16]   0.24   0.24   0.04  0.04  0.17   0.31 1.00     7569     6952
    + theta[17]   0.23   0.23   0.04  0.04  0.16   0.30 1.00     4954     5459
    + theta[18]   0.22   0.22   0.04  0.05  0.15   0.29 1.00     4339     5591
    + kappa     125.79  65.22 218.25 48.25 20.11 403.34 1.00     1085      804
    + phi         0.27   0.27   0.02  0.02  0.24   0.30 1.00     4992     6913

    Because the Beta prior is conjugate to the binomial likelihood, the amount of interpolation between the data and the prior in this particular case is easy to quantify. The data consists of \(K\) observations, whereas the prior will be weighted as if it were \(\kappa - 2\) observations (specifically \(\phi \, \kappa - 1\) prior successes and \((1 - \phi) \, \kappa - 1\) prior failures).
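To make the interpolation concrete, here is a minimal R sketch of the conjugate update for a single hypothetical item (the counts and population values are illustrative, not taken from the fit):

y <- 25; K <- 45            # hypothetical item: 25 successes in 45 trials
phi <- 0.27; kappa <- 65    # illustrative population chance of success and concentration
alpha <- phi * kappa; beta <- (1 - phi) * kappa
(alpha + y) / (kappa + K)                 # mean of posterior Beta(alpha + y, beta + K - y)
lambda <- K / (K + kappa)                 # weight on the observed rate
lambda * (y / K) + (1 - lambda) * phi     # same value: shrinkage of y/K toward phi

The posterior mean is a weighted average of the observed rate \(y / K\) and the population mean \(\phi\), with the data receiving weight \(K / (K + \kappa)\).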

    -

    The parameter \(\kappa\) is not well determined by the combination of data and Pareto prior, with a posterior 80% interval of roughly \((25, 225)\). By the informal discussion above, \(\kappa \in (25, 225)\) ranges from weighting the data 2:1 relative to the prior to weighting it 1:5. The wide posterior interval for \(\kappa\) arises because the exact variance in the population is not well constrained by only 18 trials of size 45. If there were more items (higher \(N\)) or even more trials per item (higher \(K\)), the posterior for \(\kappa\) would be more tightly constrained (see the exercises for an example).

    +

The parameter \(\kappa\) is not well determined by the combination of data and Pareto prior, with a posterior 90% interval of roughly \((20, 403)\). By the informal discussion above, \(\kappa \in (20, 403)\) ranges from weighting the data roughly 2:1 relative to the prior to weighting it roughly 1:9. The wide posterior interval for \(\kappa\) arises because the exact variance in the population is not well constrained by only 18 items with 45 trials each. If there were more items (higher \(N\)) or even more trials per item (higher \(K\)), the posterior for \(\kappa\) would be more tightly constrained (see the exercises for an example).
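The weighting claims are simple arithmetic; a quick R check using the interval endpoints above (treating the prior as roughly \(\kappa - 2\) pseudo-observations against \(K = 45\) real ones):

K <- 45
kappa_lo <- 20;  K / (kappa_lo - 2)     # ~2.5: data outweighs prior at the low end
kappa_hi <- 403; (kappa_hi - 2) / K     # ~8.9: prior outweighs data at the high end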

    It is also evident from the posterior summary that the lower effective sample size for \(\kappa\) indicates it is not mixing as well as the other components of the model. Again, this is to be expected with a centered hierarchical prior and low data counts. This is an example where a poorly constrained parameter leads to reduced computational efficiency (as reflected in the effective sample size). Such poor mixing is typical of centered parameterizations in hierarchical models (Betancourt and Girolami 2015). It is not immediately clear how to provide a non-centered analysis of the beta prior, because it isn’t supplied with a location/scale parameterization on the unconstrained scale. Instead, we consider an alternative parameterization in the next section.

    -

    Figure 5.3 from (Gelman et al. 2014) plots the fitted values for \(\phi\) and \(\kappa\) on the unconstrained scale, which is the space over which Stan is sampling. The variable \(\phi \in [0,1]\) is transformed to \(\mathrm{logit}(\phi) = \log(\phi / (1 - \phi))\) and \(\kappa \in (0, \infty)\) is transformed to \(\log \kappa\). We reproduce that figure here for our running example.

    -
    df_bda3_fig_5_3 <- with(ss_hier,
    +

    Figure 5.3 from (Gelman et al. 2014) plots the fitted values for \(\phi\) and \(\kappa\) on the unconstrained scale, which is the space over which Stan is sampling. The variable \(\phi \in [0,1]\) is transformed to \(\mathrm{logit}(\phi) = \log(\phi / (1 - \phi))\) and \(\kappa \in (0, \infty)\) is transformed to \(\log \kappa\). We reproduce that figure here for our running example.

    +
+# extract model fit as an R data frame / tibble for ggplot2
+hier_df <- fit_hier$draws(format="df");
    +
    +bda_plot <- function(df, x_lab, y_lab) {
    +  ggplot(df, aes(x=x, y=y)) +
    +    geom_point(shape=19, alpha=0.15) +
    +    xlab(x_lab) +
    +    ylab(y_lab) +
    +    theme_jupyter();
    +}
    +
    +df_bda3_fig_5_3 <- with(hier_df,
                             data.frame(x = log(phi / (1 - phi)),
                                        y = log(kappa)));
    -phi_sim <- ss_hier$phi;
    -kappa_sim <- ss_hier$kappa;
    -df_bda3_fig_5_3 <- data.frame(x = log(phi_sim / (1 - phi_sim)),
    -                              y = log(kappa_sim));
    -library(ggplot2);
     plot_bda3_fig_5_3 <- 
    -  ggplot(df_bda3_fig_5_3, aes(x=x, y=y)) +
    -  geom_point(shape=19, alpha=0.15) +
    -  xlab("logit(phi) = log(alpha / beta)") +
    -  ylab("log(kappa) = log(alpha + beta)");
    +  bda_plot(df_bda3_fig_5_3, 
    +           x_lab = expression(paste(logit(phi), " = ", log(alpha/beta))), 
    +           y_lab = expression(paste(log(kappa), " = ", log(alpha+beta))));
     plot_bda3_fig_5_3;
    -

    +

    The (upside-down) funnel-like shape of the posterior is evident, with higher \(\kappa\) values corresponding to lower variance (hence the upside-downedness) and thus a narrower range of \(\phi\) values. This particular funnel-like posterior plot arises because when the population variation is large (i.e., \(\kappa\) is small), there is weak non-identifiability between the prior mean (\(\phi\)) and the mean of the random effects (\(\mathrm{mean}(\theta)\)).

    -

    The root problem with the non-centered parameterization, though, is the relation between \(\kappa\) and the \(\theta\). For example, consider the following plot of \(\theta_1\) vs. \(\kappa\).

    -
    inv_logit <- function(u) { 1 / (1 + exp(-u)); }
    -theta1_sim <- ss_hier$theta[ , 1];
    -kappa_sim <- ss_hier$kappa;
    -df_funnel <- data.frame(x = inv_logit(theta1_sim),
    -                        y = log(kappa_sim));
    -library(ggplot2);
    -plot_funnel<- 
    -  ggplot(df_funnel, aes(x=x, y=y)) +
    -  geom_point(shape=19, alpha=0.15) +
    -  xlab("logit(theta[1])") + 
    -  ylab("log(kappa)");
    +

The root problem with the centered parameterization, though, is the relation between \(\kappa\) and the \(\theta_n\). For example, consider the following plot of \(\theta_1\) versus \(\kappa\).

    +
+# note: must quote column names that look like indexed expressions
+logit <- function(u) { log(u / (1 - u)); }
+df_funnel <- with(hier_df,
+                  data.frame(x = logit(`theta[1]`),
+                             y = log(kappa)))
+plot_funnel <-
+  bda_plot(df_funnel,
+           x_lab = expression(logit(theta[1])),
+           y_lab = expression(log(kappa)));
     plot_funnel;
    -

    +

We have unconstrained both \(\theta_1\) (using \(\mathrm{logit}\)) and \(\kappa\) (using \(\log\)), rendering both on the unconstrained scale over which sampling is actually carried out in Stan. This helps in diagnosing problems with sampling that may not be as apparent on the constrained scale. All of the transforms used by Stan for constraints are detailed in the Stan manual (Stan Development Team 2015).

It is clear here that when \(\kappa\) is smaller, there is more latitude for \(\theta_1\) to move around. This phenomenon was originally discussed by Neal (2003), is covered in the Stan manual (Stan Development Team 2015), and is analyzed for Hamiltonian Monte Carlo by Betancourt and Girolami (2015).

    @@ -554,55 +692,77 @@

    Centered Parameterization

y ~ binomial_logit(K, alpha); // likelihood
}

    The full program may be found in hier-logit-centered.stan; it is not included in this document directly (code for the other models is in an appendix at the end of this document).

    -
    fit_hier_logit_centered <- 
    -  stan("hier-logit-centered.stan", data=c("N", "K", "y"),
    -       iter=(M / 2), chains=4,
    -       seed=1234);
    -
    Warning: There were 87 divergent transitions after warmup. Increasing
    -adapt_delta above 0.8 may help.
    -
    Warning: Examine the pairs() plot to diagnose sampling problems
    -
    ss_hier_logit_centered <- extract(fit_hier_logit_centered);
    -
    print(fit_hier_logit_centered, 
    -      c("sigma", "theta[1]", "theta[10]", "alpha[1]", "alpha[10]",
    -        "mu", "sigma", "lp__"), probs=c(0.1, 0.5, 0.9));
    -
    Inference for Stan model: hier-logit-centered.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -             mean se_mean    sd     10%     50%     90% n_eff Rhat
    -sigma        0.17    0.01  0.11    0.04    0.16    0.32   236 1.02
    -theta[1]     0.29    0.00  0.04    0.25    0.29    0.35   674 1.01
    -theta[10]    0.26    0.00  0.03    0.22    0.26    0.30  4225 1.00
    -alpha[1]    -0.88    0.01  0.21   -1.10   -0.92   -0.60   643 1.01
    -alpha[10]   -1.05    0.00  0.17   -1.26   -1.05   -0.84  2993 1.00
    -mu          -1.03    0.00  0.09   -1.14   -1.03   -0.91  1257 1.00
    -lp__      -442.26    1.23 11.68 -455.08 -444.57 -425.36    90 1.05
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:57:42 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -

    With the centered parameterization and Stan’s defalt stepsize and target acceptance rate, there are many divergent transitions. Even with 10,000 posterior draws, the effective sample size for the population scale \(\sigma\) is in the low hundreds, with \(\hat{R} > 1.01\). These results might be usable, but the high \(\hat{R}\) and low effective sample size are sure signs the chains aren’t mixing well.

    -

    Warning: In cases where the chains are not mixing well, Stan’s effective sample size estimate will be much lower than that from packages such as Coda (Plummer 2006), which analyze each chain’s effective sample size separately (Gelman et al. 2013; Stan Development Team 2016). Thus it is important when comparing systems to use Stan’s more conservative effective sample size estimates; the other systems assume complete mixing in their effective sample size calculations and thus overestimate their performance when there is not good mixing.

    -
    alpha_1_centered <- ss_hier_logit_centered$alpha[, 1];
    +
    hier_logit_centered_model <- cmdstan_model(stan_file='hier-logit-centered.stan')
    +fit_hier_logit_centered <- hier_logit_centered_model$sample(data=baseball_data,
    +                                                            iter_warmup=M/4,
+                                                            iter_sampling=M/4,
+                                                            seed=1234,
+                                                            refresh=0)
    +
    Chain 4 Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
    +
    Chain 4 Exception: normal_lpdf: Scale parameter is 0, but must be positive! (in '/var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/RtmpngBstU/model-644f74a8fcdd.stan', line 14, column 2 to column 28)
    +
    Chain 4 If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,
    +
    Chain 4 but if this warning occurs often then your model may be either severely ill-conditioned or misspecified.
    +
    Chain 4 
    +
    
    +Warning: 207 of 10000 (2.0%) transitions ended with a divergence.
    +This may indicate insufficient exploration of the posterior distribution.
    +Possible remedies include: 
    +  * Increasing adapt_delta closer to 1 (default is 0.8) 
    +  * Reparameterizing the model (e.g. using a non-centered parameterization)
    +  * Using informative or weakly informative prior distributions 
    +
    fit_hier_logit_centered$print(c("theta[1]", "theta[10]","alpha[1]", "alpha[10]", "mu", "sigma", "lp__"))
    +
      variable    mean  median    sd   mad      q5     q95 rhat ess_bulk ess_tail
    + theta[1]     0.30    0.29  0.04  0.04    0.24    0.38 1.01     1131     3493
    + theta[10]    0.26    0.26  0.03  0.03    0.21    0.31 1.01     3566     4228
    + alpha[1]    -0.87   -0.90  0.20  0.18   -1.15   -0.49 1.01     1131     3493
    + alpha[10]   -1.05   -1.04  0.18  0.15   -1.35   -0.78 1.01     3566     4228
    + mu          -1.03   -1.02  0.09  0.09   -1.18   -0.88 1.00     1511     3664
    + sigma        0.18    0.16  0.11  0.11    0.04    0.38 1.03      325      223
    + lp__      -443.19 -445.13 10.94 10.32 -458.13 -421.75 1.03      309      218
    +

    With the centered parameterization and Stan’s default stepsize and target acceptance rate, there are many divergent transitions. Even with 10,000 posterior draws, the effective sample size for the population scale \(\sigma\) is in the low hundreds, with \(\hat{R} > 1.01\). These results might be usable, but the high \(\hat{R}\) and low effective sample size are sure signs the chains aren’t mixing well.

    +

    A call to cmdstan_diagnose confirms this.

    +
    fit_hier_logit_centered$cmdstan_diagnose()
    +
    ## Processing csv files: /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-logit-centered-202201241717-1-06a121.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-logit-centered-202201241717-2-06a121.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-logit-centered-202201241717-3-06a121.csv, /var/folders/db/4jnggnf549s42z50bd61jskm0000gq/T/Rtmpa1KFsp/hier-logit-centered-202201241717-4-06a121.csv
    +## 
    +## Checking sampler transitions treedepth.
    +## Treedepth satisfactory for all transitions.
    +## 
    +## Checking sampler transitions for divergences.
    +## 207 of 10000 (2.1%) transitions ended with a divergence.
    +## These divergent transitions indicate that HMC is not fully able to explore the posterior distribution.
    +## Try increasing adapt delta closer to 1.
    +## If this doesn't remove all divergences, try to reparameterize the model.
    +## 
    +## Checking E-BFMI - sampler transitions HMC potential energy.
    +## The E-BFMI, 0.16, is below the nominal threshold of 0.3 which suggests that HMC may have trouble exploring the target distribution.
    +## If possible, try to reparameterize the model.
    +## 
    +## Effective sample size satisfactory.
    +## 
    +## Split R-hat values satisfactory all parameters.
    +## 
    +## Processing complete.
    +

    Warning: In cases where the chains are not mixing well, Stan’s effective sample size estimate will be much lower than that from packages such as Coda (Plummer 2006), which analyze each chain’s effective sample size separately (Gelman et al. 2013; Stan Development Team 2016). Thus it is important when comparing systems to use Stan’s more conservative effective sample size estimates; the other systems assume complete mixing in their effective sample size calculations and thus overestimate their performance when there is not good mixing.

    +
hier_logit_centered_df <- as_draws_df(fit_hier_logit_centered$draws());
    +
     funnel_centered_df <-  
    -  with(ss_hier_logit_centered,
    -       data.frame(x = alpha_1_centered,
    +  with(hier_logit_centered_df,
+       data.frame(x = `alpha[1]`,
                       y = log(sigma)));
    -funnel_centered <-
    -  ggplot(funnel_centered_df, aes(x=x, y=y)) +
    -  geom_point(shape=19, alpha=0.1) +
    -  xlab("player 1 log odds of success: alpha[1]") + 
    -  ylab("log population scale: log sigma") +
    -  ggtitle("Centered Hierarchical Funnel Plot");
+funnel_centered <-
+  bda_plot(funnel_centered_df,
+           x_lab = expression(paste("player 1 log odds of success, ", alpha[1])),
+           y_lab = expression(paste("log population scale, ", log(sigma)))) +
+  ggtitle("Centered Hierarchical Funnel Plot");
     funnel_centered;
    -

    +

The important feature of this plot is the reduced range possible for \(\alpha_1\) as \(\sigma\) approaches zero (i.e., \(\log \sigma\) approaches negative infinity). Neal (2003) refers to the prior for the centered model as a “funnel”. With more data, the funnel shape becomes less pronounced and this centered parameterization is actually more efficient than the non-centered parameterization we discuss in the next section (Betancourt and Girolami 2015).
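The funnel can also be sketched by forward simulation from a centered hierarchical prior; the following R snippet is purely illustrative (it draws from priors resembling the model’s, not from the fitted posterior):

sigma_sim <- abs(rnorm(1e4, 0, 1))          # half-normal population scale
alpha_1_sim <- rnorm(1e4, -1, sigma_sim)    # low-level parameter given the scale
df_prior_funnel <- data.frame(x = alpha_1_sim, y = log(sigma_sim))
bda_plot(df_prior_funnel, "alpha[1] (simulated)", "log sigma (simulated)")

The simulated cloud narrows sharply in \(\alpha_1\) as \(\log \sigma\) decreases, which is exactly the shape the sampler struggles with.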

    Non-Centered Parameterization

    Betancourt and Girolami (2015) provide a detailed discussion of why the centered parameterization described in the previous section is challenging for MCMC methods to sample when there are small counts per group (here, the players are the groups and each has only 45 at bats observed).

    -

    To mitigate the problem, they suggest moving to a non-centered parameterization, as has also been shown to be helpful for Gibbs and random-walk Metropolis samplers (Papaspiliopoulos et al. 2003). This basically amounts to changing the parameterization over which sampling is done, taking now a standard unit normal prior for a new variable,

    +

    To mitigate the problem, they suggest moving to a non-centered parameterization, as has also been shown to be helpful for Gibbs and random-walk Metropolis samplers (Papaspiliopoulos et al. 2003). This basically amounts to changing the parameterization over which sampling is done, taking now a standard unit normal prior for a new variable,

    \[ \alpha^{\mathrm{std}}_n = \frac{\alpha_n - \mu}{\sigma}. \]

    @@ -614,177 +774,169 @@

    Non-Centered Parameterization

    \[ \alpha_n = \mu + \sigma \, \alpha^{\mathrm{std}}_n. \]

    -

    We code this implicitly in the Stan model by defining the likelihood as

    -

\[ p(y_n \, | \, \alpha^{\mathrm{std}}_n, \mu, \sigma, K) \ = \ \mathsf{BinomialLogit}(y_n \, | \, K_n, \ \mu + \sigma \, \alpha^{\mathrm{std}}_n). \]

    This decouples the sampling distribution for \(\alpha^{\mathrm{std}}\) from \(\mu\) and \(\sigma\), greatly reducing their correlation in the posterior.
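That the two parameterizations describe the same distribution for \(\alpha\) is easy to check by simulation in R (a sketch with illustrative values for \(\mu\) and \(\sigma\)):

mu <- -1; sigma <- 0.5; m <- 1e5
alpha_centered <- rnorm(m, mu, sigma)         # alpha ~ normal(mu, sigma)
alpha_non_centered <- mu + sigma * rnorm(m)   # alpha = mu + sigma * alpha_std
c(mean(alpha_centered), mean(alpha_non_centered))   # both approximately -1.0
c(sd(alpha_centered), sd(alpha_non_centered))       # both approximately 0.5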

    -

    The Stan program’s parameter declaration and model directly follow the definition.

    +

    Prior to Stan 2.19, a Stan implementation directly encoded the math.

    parameters {
    -  real mu;                       // population mean of success log-odds
    -  real<lower=0> sigma;           // population sd of success log-odds
    -  vector[N] alpha_std;           // success log-odds
    +  real mu; // population mean of success log-odds
    +  real<lower=0> sigma; // population sd of success log-odds
    +  vector[N] alpha_std; // success log-odds (standardized)
     }
     model {
    -  mu ~ normal(-1, 1);                             // hyperprior
    -  sigma ~ normal(0, 1);                           // hyperprior
    -  alpha_std ~ normal(0, 1);                       // prior
    -  y ~ binomial_logit(K, mu + sigma * alpha_std);  // likelihood
    +  vector[N] alpha = mu + sigma * alpha_std;
    +  ...
    +  alpha_std ~ normal(0, 1); // prior (hierarchical)
    +  y ~ binomial_logit(K, alpha); // likelihood
    +}
    +

    Since Stan version 2.19, the Stan language’s affine transform construct provides a more efficient way to do this.

    +
    +

    Real variables may be declared on a space that has been transformed using an affine transformation \(x\mapsto \mu + \sigma * x\) with offset \(\mu\) and (positive) multiplier \(\sigma\), using a syntax similar to that for bounds. While these transforms do not change the asymptotic sampling behaviour of the resulting Stan program (in a sense, the model the program implements), they can be useful for making the sampling process more efficient by transforming the geometry of the problem to a more natural multiplier and to a more natural offset for the sampling process, for instance by facilitating a non-centered parameterisation.

    +
    +

    Specifying the affine transform in the parameter declaration for \(\alpha^{\mathrm{std}}\) eliminates the need for intermediate variables and makes it easier to see the hierarchical structure of the model.

    +
    parameters {
    +  real mu; // population mean of success log-odds
    +  real<lower=0> sigma; // population sd of success log-odds
    +  vector<offset=mu, multiplier=sigma>[N] alpha_std; // success log-odds (standardized)
    +}
    +model {
    +  mu ~ normal(-1, 1); // hyperprior
    +  sigma ~ normal(0, 1); // hyperprior
    +  alpha_std ~ normal(mu, sigma); // prior (hierarchical)
    +  y ~ binomial_logit(K, alpha_std); // likelihood
     }

Because the parameters to the prior for \(\sigma\) are constants, the normalization of the half-prior (compared to the full prior) is constant and does not need to be included in the target log density. This only works if the parameters to the density are data or constants; if they are defined as parameters or as quantities depending on parameters, then explicit truncation is required.
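In R terms, the point is that the half-normal density is the full normal density rescaled by a constant factor of 2, and a constant shift of the log target does not affect sampling (a sketch, not Stan’s internal computation):

sigma_val <- 0.17
dnorm(sigma_val, 0, 1, log = TRUE) + log(2)   # explicit half-normal(0, 1) log density
dnorm(sigma_val, 0, 1, log = TRUE)            # what the sampling statement accumulates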

    For the purposes of comparison, the chance of success \(\theta\) is computed as a generated quantity.

    generated quantities {
    -  vector[N] theta;  // chance of success
    -  ...
    -  for (n in 1:N)
    -    theta[n] <- inv_logit(mu + sigma * alpha_std[n]);
    +  vector[N] theta = inv_logit(alpha_std);
       ...
     }

    The full Stan program for the hierarchical logistic model is in hier-logit.stan and displayed in the appendix.

    It is fit and the values are extracted as follows.

    -
    fit_hier_logit <- stan("hier-logit.stan", data=c("N", "K", "y", "K_new", "y_new"),
    -                       iter=(M / 2), chains=4,
    -                       control=list(stepsize=0.01, adapt_delta=0.99));
    -ss_hier_logit <- extract(fit_hier_logit);
    +
    hier_logit_model <- cmdstan_model(stan_file='hier-logit.stan', include_paths=c('include'))
    +fit_hier_logit <- hier_logit_model$sample(data=baseball_data,
    +                                          iter_sampling=M/4,
    +                                          iter_warmup=M/4,
    +                                          seed=1234,
    +                                          step_size=0.01,
    +                                          adapt_delta=0.99,
+                                          refresh=0)

    We can print as before.

    -
    print(fit_hier_logit, c("alpha_std", "theta", "mu", "sigma"), probs=c(0.1, 0.5, 0.9));
    -
    Inference for Stan model: hier-logit.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -               mean se_mean   sd   10%   50%   90% n_eff Rhat
    -alpha_std[1]   0.69    0.01 0.96 -0.58  0.72  1.89 10000    1
    -alpha_std[2]   0.57    0.01 0.92 -0.63  0.59  1.71 10000    1
    -alpha_std[3]   0.45    0.01 0.93 -0.75  0.47  1.61 10000    1
    -alpha_std[4]   0.34    0.01 0.91 -0.84  0.37  1.48 10000    1
    -alpha_std[5]   0.24    0.01 0.91 -0.92  0.26  1.39 10000    1
    -alpha_std[6]   0.22    0.01 0.90 -0.95  0.24  1.37 10000    1
    -alpha_std[7]   0.13    0.01 0.89 -1.00  0.13  1.25 10000    1
    -alpha_std[8]   0.01    0.01 0.90 -1.15  0.02  1.13 10000    1
    -alpha_std[9]  -0.09    0.01 0.89 -1.22 -0.10  1.05 10000    1
    -alpha_std[10] -0.09    0.01 0.89 -1.23 -0.09  1.04 10000    1
    -alpha_std[11] -0.21    0.01 0.88 -1.31 -0.22  0.90 10000    1
    -alpha_std[12] -0.21    0.01 0.90 -1.36 -0.21  0.95 10000    1
    -alpha_std[13] -0.22    0.01 0.91 -1.37 -0.22  0.94 10000    1
    -alpha_std[14] -0.22    0.01 0.91 -1.35 -0.22  0.93 10000    1
    -alpha_std[15] -0.22    0.01 0.91 -1.36 -0.23  0.95 10000    1
    -alpha_std[16] -0.34    0.01 0.90 -1.47 -0.36  0.82 10000    1
    -alpha_std[17] -0.44    0.01 0.93 -1.62 -0.46  0.77 10000    1
    -alpha_std[18] -0.56    0.01 0.95 -1.77 -0.57  0.68 10000    1
    -theta[1]       0.29    0.00 0.04  0.25  0.29  0.35 10000    1
    -theta[2]       0.29    0.00 0.04  0.25  0.28  0.34 10000    1
    -theta[3]       0.28    0.00 0.04  0.24  0.28  0.33 10000    1
    -theta[4]       0.28    0.00 0.04  0.24  0.27  0.33 10000    1
    -theta[5]       0.27    0.00 0.03  0.24  0.27  0.32 10000    1
    -theta[6]       0.27    0.00 0.03  0.24  0.27  0.32 10000    1
    -theta[7]       0.27    0.00 0.03  0.23  0.27  0.31 10000    1
    -theta[8]       0.27    0.00 0.03  0.23  0.26  0.30 10000    1
    -theta[9]       0.26    0.00 0.03  0.22  0.26  0.30 10000    1
    -theta[10]      0.26    0.00 0.03  0.22  0.26  0.30 10000    1
    -theta[11]      0.26    0.00 0.03  0.21  0.26  0.29 10000    1
    -theta[12]      0.26    0.00 0.03  0.21  0.26  0.29 10000    1
    -theta[13]      0.26    0.00 0.03  0.21  0.26  0.29 10000    1
    -theta[14]      0.26    0.00 0.03  0.21  0.26  0.29 10000    1
    -theta[15]      0.26    0.00 0.03  0.21  0.26  0.29 10000    1
    -theta[16]      0.25    0.00 0.03  0.21  0.25  0.29 10000    1
    -theta[17]      0.25    0.00 0.03  0.20  0.25  0.29 10000    1
    -theta[18]      0.24    0.00 0.04  0.19  0.25  0.28 10000    1
    -mu            -1.03    0.00 0.09 -1.14 -1.03 -0.91 10000    1
    -sigma          0.17    0.00 0.11  0.03  0.15  0.32  3634    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:58:17 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -

    It is clear from the wide posteriors for the \(\theta_n\) that there is considerable uncertainty in the estimates of chance-of-success on an item-by-item basis. With an 80% interval of \((0.03, 0.32)\), it is clear that the data is consistent with complete pooling (i.e., \(\sigma = 0\)).

    -

    Compared to the direct beta priors with uniform and Pareto hyperpriors shown in the first example, the normal prior on log odds exerts more pull toward the population mean. The posterior means for \(\theta\) ranged from 0.22 to 0.32 with the beta prior, but only range from 0.24 to 0.29 for the normal prior. Furthermore, the posterior intervals for each values are shrunk compared to the beta prior. For example, Roberto Clemente (\(n = 1\)), has an 80% central posterior interval of \((0.25, 0.35)\) in the logistic model, whereas he had an 80% posterior interval of \((.26, .39)\) with a hierarchical beta prior.

    +
    fit_hier_logit$print(c("alpha_std", "theta", "mu", "sigma"), max_rows=100)
    +
          variable  mean median   sd  mad    q5   q95 rhat ess_bulk ess_tail
    + alpha_std[1]  -0.88  -0.92 0.20 0.18 -1.15 -0.50 1.00     7456     8818
    + alpha_std[2]  -0.91  -0.94 0.19 0.17 -1.17 -0.54 1.00     7517     8168
    + alpha_std[3]  -0.93  -0.96 0.18 0.16 -1.19 -0.60 1.00     9691     8443
    + alpha_std[4]  -0.96  -0.97 0.18 0.15 -1.21 -0.64 1.00    10923     7757
    + alpha_std[5]  -0.98  -0.99 0.17 0.15 -1.24 -0.67 1.00    12779     8965
    + alpha_std[6]  -0.98  -0.99 0.17 0.15 -1.24 -0.68 1.00    13206     8042
    + alpha_std[7]  -1.00  -1.01 0.16 0.14 -1.27 -0.72 1.00    14130     8161
    + alpha_std[8]  -1.02  -1.02 0.17 0.14 -1.30 -0.75 1.00    15119     8516
    + alpha_std[9]  -1.05  -1.04 0.17 0.14 -1.35 -0.78 1.00    16574     8941
    + alpha_std[10] -1.05  -1.04 0.17 0.14 -1.34 -0.78 1.00    14233     8302
    + alpha_std[11] -1.07  -1.05 0.17 0.15 -1.38 -0.82 1.00    13498     8066
    + alpha_std[12] -1.07  -1.06 0.18 0.15 -1.39 -0.81 1.00    13030     8493
    + alpha_std[13] -1.07  -1.05 0.18 0.15 -1.38 -0.81 1.00    12835     8252
    + alpha_std[14] -1.07  -1.05 0.18 0.15 -1.38 -0.82 1.00    11517     8410
    + alpha_std[15] -1.07  -1.05 0.18 0.15 -1.39 -0.82 1.00    13300     8256
    + alpha_std[16] -1.10  -1.07 0.18 0.15 -1.43 -0.84 1.00    11204     8857
    + alpha_std[17] -1.12  -1.09 0.19 0.16 -1.49 -0.86 1.00     9582     8338
    + alpha_std[18] -1.15  -1.11 0.21 0.18 -1.54 -0.87 1.00     8358     8362
    + theta[1]       0.29   0.28 0.04 0.04  0.24  0.38 1.00     7456     8818
    + theta[2]       0.29   0.28 0.04 0.03  0.24  0.37 1.00     7517     8168
    + theta[3]       0.28   0.28 0.04 0.03  0.23  0.36 1.00     9691     8443
    + theta[4]       0.28   0.27 0.04 0.03  0.23  0.35 1.00    10923     7757
    + theta[5]       0.27   0.27 0.03 0.03  0.22  0.34 1.00    12779     8965
    + theta[6]       0.27   0.27 0.03 0.03  0.22  0.34 1.00    13206     8042
    + theta[7]       0.27   0.27 0.03 0.03  0.22  0.33 1.00    14130     8161
    + theta[8]       0.27   0.26 0.03 0.03  0.21  0.32 1.00    15119     8516
    + theta[9]       0.26   0.26 0.03 0.03  0.21  0.31 1.00    16574     8941
    + theta[10]      0.26   0.26 0.03 0.03  0.21  0.31 1.00    14233     8302
    + theta[11]      0.26   0.26 0.03 0.03  0.20  0.31 1.00    13498     8066
    + theta[12]      0.26   0.26 0.03 0.03  0.20  0.31 1.00    13030     8493
    + theta[13]      0.26   0.26 0.03 0.03  0.20  0.31 1.00    12835     8252
    + theta[14]      0.26   0.26 0.03 0.03  0.20  0.31 1.00    11517     8410
    + theta[15]      0.26   0.26 0.03 0.03  0.20  0.31 1.00    13300     8256
    + theta[16]      0.25   0.26 0.03 0.03  0.19  0.30 1.00    11204     8857
    + theta[17]      0.25   0.25 0.03 0.03  0.18  0.30 1.00     9582     8338
    + theta[18]      0.24   0.25 0.04 0.03  0.18  0.29 1.00     8358     8362
    + mu            -1.03  -1.02 0.09 0.09 -1.18 -0.88 1.00     9370     7012
    + sigma          0.17   0.15 0.11 0.12  0.02  0.36 1.00     3448     4162
    +

It is clear from the wide posteriors for the \(\theta_n\) that there is considerable uncertainty in the estimates of chance-of-success on an item-by-item basis. With a 90% interval of \((0.02, 0.36)\) for \(\sigma\), it is clear that the data is consistent with complete pooling (i.e., \(\sigma = 0\)).

    +

Compared to the direct beta priors with uniform and Pareto hyperpriors shown in the first example, the normal prior on log odds exerts more pull toward the population mean. The posterior means for \(\theta\) ranged from 0.22 to 0.32 with the beta prior, but range only from 0.24 to 0.29 with the normal prior. Furthermore, the posterior intervals for each value are shrunk compared to the beta prior. For example, Roberto Clemente (\(n = 1\)) has a 90% central posterior interval of \((0.24, 0.38)\) in the logistic model, whereas he had a 90% posterior interval of \((0.25, 0.41)\) with the hierarchical beta prior.

    To consider how the reparameterization is working, we plot the posterior for the mean and log scale of the hyperprior.

    -
    df_bda3_fig_5_3_logit <- with(ss_hier_logit,
    +
hier_logit_df <- as_draws_df(fit_hier_logit$draws());
    +df_bda3_fig_5_3_logit <- with(hier_logit_df,
                                   data.frame(x = mu,
                                              y = log(sigma)));
     plot_bda3_fig_5_3_logit <- 
    -  ggplot(df_bda3_fig_5_3_logit, aes(x=x, y=y)) +
    -  geom_point(shape=19, alpha=0.15) +
    -  xlab("mu") + 
    -  ylab("log sigma")
    +  bda_plot(df_bda3_fig_5_3_logit, 
    +           x_lab = expression(mu), 
    +           y_lab = expression(log(sigma)))
     plot_bda3_fig_5_3_logit;
    -

    +

    As before, we see that the prior location (\(\mu\)) and scale (\(\sigma\)) are coupled in the posterior. For comparison, here’s the scatterplot of the posterior sample values for log scale and the first transformed parameter.

    df_bda3_fig_5_3_funnel <-  
    -  with(ss_hier_logit,
    -       data.frame(x = alpha_std[, 1],
    +  with(hier_logit_df,
+       data.frame(x = `alpha_std[1]`,
                       y = log(sigma)));
     plot_bda3_fig_5_3_funnel <-
    -  ggplot(df_bda3_fig_5_3_funnel, aes(x=x, y=y)) +
    -  geom_point(shape=19, alpha=0.15) +
    -  xlab("player 1 log odds of success (standardized): alpha[1]") + 
    -  ylab("log population scale: log sigma") +
    +  bda_plot(df_bda3_fig_5_3_funnel, 
    +           x_lab = expression(paste("player 1 log odds (standardized), ", 'alpha_std[1]')),
    +           y_lab = expression(paste("log population scale, ", log(sigma)))) + 
       ggtitle("Non-Centered Hierarchical Funnel Plot");
     plot_bda3_fig_5_3_funnel;
    -

    -

    Compared to the earlier plot of \(\log \kappa\) versus \(\mathrm{logit}(\theta_1)\) in the direct hierarcical model on chance of success, there is much less of a dependency between the variables; this can be seen because the variation is along the axes, not in a weak banana shape as in the first example. But it still has the long-tail problem as \(\sigma\) approaches zero (and \(\log \sigma\) becomes more negative).

    +

    +

    Compared to the earlier plot of \(\log \kappa\) versus \(\mathrm{logit}(\theta_1)\) in the direct hierarchical model on chance of success, there is much less of a dependency between the variables; this can be seen because the variation is along the axes, not in a weak banana shape as in the first example. But it still has the long-tail problem as \(\sigma\) approaches zero (and \(\log \sigma\) becomes more negative).

    -
    +

    Observed vs. Estimated Chance of Success

    -

    Figure 5.4 from (Gelman et al. 2013) plots the observed number of successes \(y_n\) for the first \(K_n\) trials versus the median and 80% intervals for the estimated chance-of-success parameters \(\theta_n\) in the posterior. The following R code reproduces a similar plot for our data.

    -
    ss_quantile <- function(ss, N, q) {
    -  result <- rep(NA, N);
    -  for (n in 1:N) {
    -    result[n] <- sort(ss$theta[,n])[M * q];
    -  }
    -  return(result);
    +

    Figure 5.4 from (Gelman et al. 2013) plots the observed number of successes \(y_n\) for the first \(K_n\) trials versus the median and 80% intervals for the estimated chance-of-success parameters \(\theta_n\) in the posterior. The following R code reproduces a similar plot for our data.

    +
    # summary statistics - quantiles 10%, 50%, 90%
    +ss_quantiles <- function(draws_df) {
    +  apply(select(draws_df, starts_with('theta')), 2, quantile, probs = c(0.1, 0.5, 0.9));
     }
     
    -theta_10_pool <- ss_quantile(ss_pool, N, 0.1);
    -theta_50_pool <- ss_quantile(ss_pool, N, 0.5);
    -theta_90_pool <- ss_quantile(ss_pool, N, 0.9);
    -
    -theta_10_no_pool <- ss_quantile(ss_no_pool, N, 0.1);
    -theta_50_no_pool <- ss_quantile(ss_no_pool, N, 0.5);
    -theta_90_no_pool <- ss_quantile(ss_no_pool, N, 0.9);
    +pool_df <- as_draws_df(fit_pool$draws());
    +theta_pool <- ss_quantiles(pool_df);
     
    -theta_10_hier <- ss_quantile(ss_hier, N, 0.1);
    -theta_50_hier <- ss_quantile(ss_hier, N, 0.5);
    -theta_90_hier <- ss_quantile(ss_hier, N, 0.9);
    +no_pool_df <- as_draws_df(fit_no_pool$draws());
    +theta_no_pool <- ss_quantiles(no_pool_df);
     
    -theta_10_hier_logit <- ss_quantile(ss_hier_logit, N, 0.1);
    -theta_50_hier_logit <- ss_quantile(ss_hier_logit, N, 0.5);
    -theta_90_hier_logit <- ss_quantile(ss_hier_logit, N, 0.9);
    -
    -pop_mean <- sum(y) / sum(K);
    +theta_hier <- ss_quantiles(hier_df);
    +theta_hier_logit <- ss_quantiles(hier_logit_df);
     
    +models <- c("complete pooling", "no pooling", "partial pooling", 
    +            "partial pooling, log odds")
     df_plot2 <- data.frame(x = rep(y / K, 4),
    -                       y = c(theta_50_pool, theta_50_no_pool,
    -                             theta_50_hier, theta_50_hier_logit),
    -                       model = c(rep("complete pooling", N),
    -                                 rep("no pooling", N),
    -                                 rep("partial pooling", N),
    -                 rep("partial pooling (log odds)", N)));
    +                       y = c(theta_pool["50%",], theta_no_pool["50%",],
    +                             theta_hier["50%",], theta_hier_logit["50%",]),
    +                       ymin = c(theta_pool["10%",], theta_no_pool["10%",],
    +                             theta_hier["10%",], theta_hier_logit["10%",]),
    +                       ymax = c(theta_pool["90%",], theta_no_pool["90%",],
    +                             theta_hier["90%",], theta_hier_logit["90%",]),
    +                       model = rep(models, each = N));
     
    +pop_mean <- sum(y) / sum(K);
     plot_bda3_fig_5_4 <-
    -  ggplot(df_plot2, aes(x=x, y=y)) +
    -  geom_hline(aes(yintercept=pop_mean), colour="lightpink") +
    +  ggplot(df_plot2, aes(x=x, y=y, ymin=ymin, ymax=ymax)) +
    +  geom_hline(yintercept=pop_mean, colour="orange") +
       geom_abline(intercept=0, slope=1, colour="skyblue") +
       facet_grid(. ~ model) +
    -  geom_errorbar(aes(ymin=c(theta_10_pool, theta_10_no_pool,
    -                           theta_10_hier, theta_10_hier_logit),
    -                    ymax=c(theta_90_pool, theta_90_no_pool,
    -                           theta_90_hier, theta_90_hier_logit)),
    -                width=0.005, colour="gray60") +
    +  geom_errorbar(width=0.005, colour="gray60") +
       geom_point(colour="gray30", size=0.75) +
       coord_fixed() +
       scale_x_continuous(breaks = c(0.2, 0.3, 0.4)) +
    -  xlab("observed rate, y[n] / K[n]") +
    -  ylab("chance of success, theta[n]") +
    -  ggtitle("Posterior Medians and 80% intervals\n(red line: population mean;  blue line: MLE)")
    +  xlab(expression(paste("observed rate ", y[n] / K[n]))) +
    +  ylab(expression(paste("chance of success ", theta[n]))) +
    +  ggtitle("Posterior Medians and 80% intervals",
    +         subtitle="orange line: population mean;  blue line: MLE") +
    +  theme_jupyter();
    +
    +options(repr.plot.width=20, repr.plot.height=10)
     plot_bda3_fig_5_4;
    -

    +

The horizontal axis is the observed rate of success, broken out by player (the overplotting is from players with the same number of successes; they all had the same number of trials in this data). The dots are the posterior medians, with bars extending to cover the central 80% posterior interval. Players with the same observed rates are indistinguishable; any differences in estimates are due to MCMC error.

    -

    The horizontal red line has an intercept equal to the overall success rate,

    -

+

The horizontal orange line has an intercept equal to the overall success rate,

\[ \frac{\sum_{n=1}^N y_n}{\sum_{n=1}^N K_n} \ = \ \frac{215}{810} \ \approx \ 0.265. \]

@@ -802,7 +954,7 @@

    Posterior Predictive Distribution

    After we have fit a model using some “training” data, we are usually interested in the predictions of the fitted model for new data, which we can use to

    • make predictions for new data points; e.g., predict how many hits will Roberto Clemente get in the rest of the season,

    • -
    • evaluate predictions against observed future data; e.g., how well did we predict how many hits Roberto Clement actually got in the rest of the season, and

    • +
    • evaluate predictions against observed future data; e.g., how well did we predict how many hits Roberto Clemente actually got in the rest of the season, and

    • generate new simulated data to validate our model fits.

With full Bayesian inference, we do not make a point estimate of the parameters and use it for prediction; we instead use an average of predictions weighted by the posterior.

    @@ -815,7 +967,7 @@

    Posterior Predictive Distribution

    where \(\Theta\) is the support of the parameters \(\theta\). What an integral of this form says is that \(p(\tilde{y} \, | \, y)\) is defined as a weighted average over the legal parameter values \(\theta \in \Theta\) of the likelihood function \(p(\tilde{y} \, | \, \theta)\), with weights given by the posterior, \(p(\theta \, | \, y)\). While we do not want to get sidetracked with the notational and mathematical subtleties of expectations here, the posterior predictive density reduces to the expectation of \(p(\tilde{y} \, | \, \theta)\) conditioned on \(y\).

    Evaluating Held-Out Data Predictions

    -

    Because the posterior predictive density is formulated as an expectation over the posterior, it is possible to compute via MCMC. With \(M\) draws \(\theta^{(m)}\) from the posterior \(p(\theta \, | \, y)\), the posterior predicitve log density for new data \(y^{\mathrm{new}}\) is given by

    +

    Because the posterior predictive density is formulated as an expectation over the posterior, it is possible to compute via MCMC. With \(M\) draws \(\theta^{(m)}\) from the posterior \(p(\theta \, | \, y)\), the posterior predictive log density for new data \(y^{\mathrm{new}}\) is given by

\[ p(y^{\mathrm{new}} \, | \, y) \ \approx \ \frac{1}{M} \sum_{m=1}^M p(y^{\mathrm{new}} \, | \, \theta^{(m)}). \]

@@ -835,11 +987,11 @@
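For a single item, this estimate can be sketched in R from draws like those extracted earlier (assuming a draws data frame such as hier_logit_df with a theta[1] column, plus the held-out y_new and K_new from the data):

theta1_draws <- hier_logit_df$`theta[1]`                   # posterior draws of theta[1]
p_new_1 <- mean(dbinom(y_new[1], K_new[1], theta1_draws))  # average the likelihoods
log(p_new_1)                                               # per-item predictive log density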

    Prediction for New Trials

After evaluating posterior predictive densities for new data, we consider how well the model actually fits the data used to estimate it.

    Calibration

    -

    A well calibrated statistical model is one in which the uncertainy in the predictions matches the uncertainty in further data. That is, if we estimate posterior 50% intervals for predictions on new data (here, number of hits in the rest of the season for each player), roughly 50% of the new data should fall in its predicted 50% interval. If the model is true in the sense of correctly describing the generative process of the data, then Bayesian inference is guaranteed to be well calibrated. Given that our models are rarely correct in this deep sense, in practice we are concerned with testing their calibration on quantities of interest.

    +

    A well calibrated statistical model is one in which the uncertainty in the predictions matches the uncertainty in further data. That is, if we estimate posterior 50% intervals for predictions on new data (here, number of hits in the rest of the season for each player), roughly 50% of the new data should fall in its predicted 50% interval. If the model is true in the sense of correctly describing the generative process of the data, then Bayesian inference is guaranteed to be well calibrated. Given that our models are rarely correct in this deep sense, in practice we are concerned with testing their calibration on quantities of interest.
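A rough calibration check can be sketched in R with the posterior predictions z for the remaining trials (this assumes the generated quantity z from the models above, a draws data frame hier_logit_df, and dplyr loaded as before):

z_draws <- select(hier_logit_df, starts_with('z['))   # draws of z[1], ..., z[N]
lo <- apply(z_draws, 2, quantile, probs = 0.25)
hi <- apply(z_draws, 2, quantile, probs = 0.75)
mean(y_new >= lo & y_new <= hi)   # should be near 0.5 if well calibrated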

    Sharpness

    -

    Given two well calibrated models, the one that makes the more precise predictions in the sense of having narrower intervals is better predictively (Gneiting et al. 2007). To see this in an example, we would rather have a well-calibrated prediction that there’s a 90% chance the number of hits for a player in the rest of the season will fall in \((120, 130)\) than a 90% prediction that the number of hits will fall in \((100, 150)\).

    +

    Given two well calibrated models, the one that makes the more precise predictions in the sense of having narrower intervals is better predictively (Gneiting et al. 2007). To see this in an example, we would rather have a well-calibrated prediction that there’s a 90% chance the number of hits for a player in the rest of the season will fall in \((120, 130)\) than a 90% prediction that the number of hits will fall in \((100, 150)\).

For the models introduced here, a posterior that is a delta function provides the sharpest predictions. Even so, there is residual uncertainty due to the repeated trials; with \(K^{\mathrm{new}}\) further trials and a fixed \(\theta_n\) chance of success, the random variable \(Y^{\mathrm{new}}_n\) denoting the number of further successes for item \(n\) has a standard deviation from the repeated binary trials of

\[ \mathrm{sd}[Y^{\mathrm{new}}_n] \ = \ \sqrt{K^{\mathrm{new}} \, \theta_n \, (1 - \theta_n)}. \]

@@ -848,11 +1000,11 @@

    Sharpness

    Why Evaluate with the Predictive Posterior?

    -

    The predictive posterior density directly measures the probability of seeing the new data. The higher the probability assigned to the new data, the better job the model has done at predicting the outcome. In the limit, an ideal model would perfectly predict the new outcome with no uncertainty (probability of 1 for a discrete outcome or a delta function at the true value for the density in a continuous outcome). This notion is related to the notion of sharpness discussed in the previous section, because if the new observations have higher predictive densities, they’re probably within narrower posterior intervals (Gneiting et al. 2007).

    +

    The predictive posterior density directly measures the probability of seeing the new data. The higher the probability assigned to the new data, the better job the model has done at predicting the outcome. In the limit, an ideal model would perfectly predict the new outcome with no uncertainty (probability of 1 for a discrete outcome or a delta function at the true value for the density in a continuous outcome). This notion is related to the notion of sharpness discussed in the previous section, because if the new observations have higher predictive densities, they’re probably within narrower posterior intervals (Gneiting et al. 2007).

    Computing the Log Predictive Posterior Density

    -

    The log of posterior predicitve density is defined in the obvious way as

    +

The log of the posterior predictive density is defined in the obvious way as

\[ \log p(\tilde{y} \, | \, y) = \log \int_{\Theta} p(\tilde{y} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}\theta. \]

@@ -873,17 +1025,19 @@

    Computing the Log Predictive Posterior Density

\[ \log \int_{\Theta} p(\tilde{y} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}\theta \ \geq \ \int_{\Theta} \left( \, \log p(\tilde{y} \, | \, \theta) \, \right) \ p(\theta \, | \, y) \ \mathrm{d}\theta. \]

    We’ll compute both expectations and demonstrate Jensen’s inequality in our running example.
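To see the gap in miniature, here is a tiny self-contained R sketch (the numbers are made up for illustration) showing that the log of an average is always at least the average of the logs:

p_draws <- c(0.1, 0.2, 0.4);  # made-up per-draw predictive densities
log(mean(p_draws));           # log of the expectation: about -1.46
mean(log(p_draws));           # expectation of the log: about -1.61, smaller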

    -

    The integer variables K_new[n] and y_new[n] declared in the data block hold the number of at bats (trials) and the number of hits (successes) for player (item) n. To code the evaluation of the held out data in Stan, we declare a generated quantities variable (log_p_news) to hold the log density of each data point and define it in the obvious way.

    +

    The integer variables K_new[n] and y_new[n] declared in the data block hold the number of at bats (trials) and the number of hits (successes) for player (item) n. To code the evaluation of the held out data in Stan, we declare a generated quantities variable (log_p_new) which is the sum of the per-item posterior predictive log densities.

    +

Because we are computing this quantity across all models, we put the code into its own file named “gq-postpred.stan” and use Stan’s include directive to pull it into each model’s generated quantities block.

    generated quantities {
       ...
    -  real log_p_new;        // posterior predictive log density remaining trials
    -  vector[N] log_p_news;  // posterior predictive log density for item
    -  ...
    -  for (n in 1:N)
    -    log_p_news[n] <- binomial_log(y_new[n], K_new[n], theta[n]);
    -  log_p_new <- sum(log_p_news);
    +  #include gq-postpred.stan
       ...
     }
    +

First we compute the posterior predictive log density for the remaining trials.

    +
      // posterior predictive log density remaining trials
    +  real log_p_new = 0;
    +  for (n in 1 : N) {
    +    log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]);
    +  }

    The posterior mean for log_p_new will give us

\[ \int_{\Theta} \left( \log p(\tilde{y} \, | \, \theta) \right) \ p(\theta \, | \, y) \ \mathrm{d}\theta \ \approx \

@@ -893,16 +1047,14 @@

    Computing the Log Predictive Posterior Density

    \frac{1}{M} \, \sum_{m=1}^M \log p(y^{\mathrm{new}} \, | \, \theta^{(m)}). \]

This calculation is included in all four of the models we have previously fit, and the posterior mean of log_p_new can be displayed directly as follows.

    -
    print(sprintf("%10s  %16s", "MODEL", "VALUE"), quote = FALSE);
    -
    [1]      MODEL             VALUE
    -
    print(sprintf("%10s  %16.0f", "pool", mean(ss_pool$log_p_new)), quote=FALSE);
    -
    [1]       pool               -81
    -
    print(sprintf("%10s  %16.0f", "no pool", mean(ss_no_pool$log_p_new)), quote=FALSE);
    -
    [1]    no pool              -217
    -
    print(sprintf("%10s  %16.0f", "hier", mean(ss_hier$log_p_new)), quote=FALSE);
    -
    [1]       hier              -125
    -
    print(sprintf("%10s  %16.0f", "hier logit", mean(ss_hier_logit$log_p_new)), quote=FALSE);
    -
    [1] hier logit              -101
    +
    message(sprintf("%25s  %5.1f", "complete pooling", mean(pool_df$log_p_new)));
    +
             complete pooling  -81.2
    +
    message(sprintf("%25s  %5.1f", "no pooling", mean(no_pool_df$log_p_new)));
    +
                   no pooling  -216.2
    +
    message(sprintf("%25s  %5.1f", "partial pooling", mean(hier_df$log_p_new)));
    +
              partial pooling  -123.5
    +
    message(sprintf("%25s  %5.1f", "partial pooling, log odds", mean(hier_logit_df$log_p_new)));
    +
    partial pooling, log odds  -101.1

    From a predictive standpoint, the models are ranked by the amount of pooling they do, with complete pooling being the best, and no pooling being the worst predictively. All of these models do predictions by averaging over their posteriors, with the amount of posterior uncertainty also being ranked in reverse order of the amount of pooling they do.

    @@ -928,7 +1080,7 @@

    Calculating posterior log density

\[ \log p(y^{\mathrm{new}} \, | \, y) \ \approx \ - \log M \, + \, \mathrm{log\_sum\_exp \, }_{m=1}^M \ \log p(y^{\mathrm{new}} \, | \, \theta^{(m)}) \]

    -

    We can compute \(mathrm{log\_sum\_exp}\) stably by subtracting the max value. Suppose \(u = u_1, \ldots, u_M\), and \(\max(u)\) is the largest \(u_m\). We can calculate

    +

    We can compute \(\mathrm{log\_sum\_exp}\) stably by subtracting the max value. Suppose \(u = u_1, \ldots, u_M\), and \(\max(u)\) is the largest \(u_m\). We can calculate

\[ \mathrm{log\_sum\_exp \, }_{m=1}^M \ u_m \ = \ \max(u) + \log \sum_{m=1}^M \exp(u_m - \max(u)). \]

@@ -947,19 +1099,19 @@

    Calculating posterior log density

log_sum_exp <- function(u) {
  max_u <- max(u);
  # stable log(sum(exp(u))): subtract the max before exponentiating
  a <- sum(exp(u - max_u));
  return(max_u + log(a));
}

    and then use it to print the log posterior predictive densities for our fit.

    -
    print_new_lp <- function(name, ss) {  
    -  lp <- -log(M) + log_sum_exp(ss$log_p_new);
    -  print(sprintf("%25s  %5.1f", name,  lp), quote=FALSE);
    +
    print_new_lp <- function(name, df) {  
    +  lp <- -log(nrow(df)) + log_sum_exp(df$log_p_new);
    +  message(sprintf("%25s  %5.1f", name,  lp));
     }
     
    -print_new_lp("complete pooling", ss_pool); 
    -
    [1]          complete pooling  -74.7
    -
    print_new_lp("no pooling", ss_no_pool); 
    -
    [1]                no pooling  -97.2
    -
    print_new_lp("partial pooling", ss_hier); 
    -
    [1]           partial pooling  -74.3
    -
    print_new_lp("partial pooling (logit)", ss_hier_logit); 
    -
    [1]   partial pooling (logit)  -71.6
    +print_new_lp("complete pooling", pool_df);
    +
             complete pooling  -74.7
    +
    print_new_lp("no pooling", no_pool_df); 
    +
                   no pooling  -99.1
    +
    print_new_lp("partial pooling", hier_df); 
    +
              partial pooling  -74.8
    +
    print_new_lp("partial pooling (logit)", hier_logit_df); 
    +
      partial pooling (logit)  -73.7

    Now the ranking is different! As expected, the values here are lower than the expectation of the log density due to Jensen’s inequality. The hierarchical logit model appears to be making slightly better predictions than the full pooling model, which in turn is making slightly better predictions than the basic hierarchical model.

    @@ -968,56 +1120,62 @@

    Predicting New Observations

    We showed above that it is straightforward to generate draws from the posterior predictive distribution. With this capability, we can either generate predictions for new data or we can replicate the data we already have.

    We let \(z_n\) be the number of successes for item \(n\) in \(K^{\mathrm{new}}_n\) further trials.

    The posterior predictions can be declared in Stan as generated quantities. Their values are determined by calling the binomial pseudorandom number generator, which corresponds to the binomial sampling distribution (likelihood) in this case.

    -
    generated quantities {
    -  ...
    -  int<lower=0> z[N];  // posterior prediction remaining trials
    -  ...
    -  for (n in 1:N)
    -    z[n] <- binomial_rng(K_new[n], theta[n]);
    -  ...
    -}
    +
      array[N] int<lower=0> z = binomial_rng(K_new, theta);

    This formulation makes it clear that there are two sources of uncertainty in our predictions, the first being the uncertainty in \(\theta\) in the posterior \(p(\theta \, | \, y)\) and the second being the uncertainty due to the likelihood \(p(\tilde{y} \, | \, \theta)\).

It might seem tempting to eliminate that second source of uncertainty and set \(z_n^{(m)}\) to its expectation, \(\theta_n^{(m)} \, K^{\mathrm{new}}_n\), at each iteration rather than simulating a new value. Or it might seem tempting to remove the first source of uncertainty and use the posterior mean (or median or mode or …) rather than draws from the posterior. Either way, the resulting values would suffice for estimating the posterior mean, but would not capture the uncertainty in the prediction for \(y^{\mathrm{new}}_n\) and would thus not be useful in estimating predictive standard deviations or quantiles or as the basis for decision making under uncertainty. In other words, the predictions would not be properly calibrated (in a sense we define below).
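The following self-contained R sketch illustrates the point; the names theta1 and K_new1 are hypothetical stand-ins for one player’s posterior draws and remaining at bats, not variables from the case study code.

theta1 <- rbeta(10000, 15, 30);                   # stand-in posterior draws
K_new1 <- 367;                                    # remaining trials
z_full <- rbinom(10000, K_new1, theta1);          # both sources of uncertainty
z_plugin <- rbinom(10000, K_new1, mean(theta1));  # plug-in point estimate
c(sd(z_full), sd(z_plugin));                      # plug-in sd is far too small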

    The number of remaining at-bats \(K^{\mathrm{new}}\) was printed out in the original table along with the actual number of hits.

    -
    print(fit_pool, c("z"), probs=c(0.1, 0.5, 0.9), digits=0);
    -
    Inference for Stan model: pool.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -      mean se_mean sd 10% 50% 90% n_eff Rhat
    -z[1]    98       0 10  85  98 111  5890    1
    -z[2]   113       0 11  99 113 128  6530    1
    -z[3]   139       0 13 123 139 155  5024    1
    -z[4]    73       0  9  62  73  84  6673    1
    -z[5]   111       0 11  98 111 125  5728    1
    -z[6]   124       0 12 109 124 139  6184    1
    -z[7]   156       0 14 138 156 174  5834    1
    -z[8]    37       0  6  30  37  44  7598    1
    -z[9]   136       0 13 119 136 152  5707    1
    -z[10]   53       0  7  45  53  62  6553    1
    -z[11]  143       0 13 127 143 161  5596    1
    -z[12]   50       0  7  41  49  58  7037    1
    -z[13]  116       0 11 102 116 130  6149    1
    -z[14]   74       0  9  63  74  85  6345    1
    -z[15]  157       0 14 140 157 175  5434    1
    -z[16]  149       0 13 132 148 166  5751    1
    -z[17]  109       0 11  95 108 123  6534    1
    -z[18]   19       0  4  14  19  24  8737    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:56:04 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    +
    print(df)
    +
    ##    FirstName   LastName Hits At.Bats RemainingAt.Bats RemainingHits
    +## 1    Roberto   Clemente   18      45              367           127
    +## 2      Frank   Robinson   17      45              426           127
    +## 3      Frank     Howard   16      45              521           144
    +## 4        Jay  Johnstone   15      45              275            61
    +## 5        Ken      Berry   14      45              418           114
    +## 6        Jim    Spencer   14      45              466           126
    +## 7        Don  Kessinger   13      45              586           155
    +## 8       Luis   Alvarado   12      45              138            29
    +## 9        Ron      Santo   11      45              510           137
    +## 10       Ron    Swaboda   11      45              200            46
    +## 11      Rico Petrocelli   10      45              538           142
    +## 12     Ellie  Rodriguez   10      45              186            42
    +## 13    George      Scott   10      45              435           132
    +## 14       Del      Unser   10      45              277            73
    +## 15     Billy   Williams   10      45              591           195
    +## 16      Bert Campaneris    9      45              558           159
    +## 17   Thurman     Munson    8      45              408           129
    +## 18       Max      Alvis    7      45               70            14
    +

    We examine the 80% intervals on the basic hierarchical model for z, the posterior predictions for the rest of the season.

    +
    fit_hier$print('z', "mean", "median", ~quantile(.x, probs = c(0.1, 0.9)), max_rows=18)
    +
    ##  variable   mean median    10%    90%
    +##     z[1]  117.83 116.00  93.00 145.00
    +##     z[2]  133.13 131.00 106.00 164.00
    +##     z[3]  158.21 156.00 126.00 193.00
    +##     z[4]   80.83  80.00  63.00 100.00
    +##     z[5]  119.41 118.00  93.00 147.00
    +##     z[6]  133.20 132.00 105.00 163.00
    +##     z[7]  162.24 161.00 128.00 199.00
    +##     z[8]   36.91  37.00  27.00  47.00
    +##     z[9]  132.14 132.00 102.00 163.00
    +##     z[10]  51.65  51.00  38.00  65.00
    +##     z[11] 134.75 135.00 102.00 166.00
    +##     z[12]  46.45  46.00  34.00  59.00
    +##     z[13] 108.77 109.00  82.00 135.00
    +##     z[14]  69.23  69.00  51.00  87.00
    +##     z[15] 147.71 148.00 113.00 182.00
    +##     z[16] 134.24 134.00 100.00 167.00
    +##     z[17]  94.58  94.00  69.00 121.00
    +##     z[18]  15.62  15.00  10.00  22.00

    Translating the posterior number of hits into a season batting average, \(\frac{y_n + z_n}{K_n + K^{\mathrm{new}}_n}\), we get an 80% posterior interval of

\[
-\left( \frac{18 + 93}{45 + 367}, \frac{18 + 147}{45 + 367} \right) \ = \ (0.269, 0.400).
+\left( \frac{18 + 93}{45 + 367}, \frac{18 + 145}{45 + 367} \right) \ = \ (0.269, 0.396).
\]

    for Roberto Clemente in the basic hierarchical model. Part of our uncertainty here is due to our uncertainty in Clemente’s underlying chance of success, and part of our uncertainty is due to there being 367 remaining trials (at bats) modeled as binomial. In the remaining at bats for the season, Clemente’s success rate (batting average) was \(127/367 = 0.35\).

The posterior produced by the model for the number of hits for the rest of the season is overdispersed compared to a simple binomial model based on a point estimate. For example, if we take the partially pooled posterior mean estimate of 0.32 for Roberto Clemente’s ability (and thus remove the first source of uncertainty, the posterior uncertainty in his chance of success), the prediction for number of hits based on the point estimate would be \(\mathrm{Binomial}(K^{\mathrm{new}}_1, 0.32)\), which we know analytically has a standard deviation of \(\sqrt{K^{\mathrm{new}}_1 \, \theta_1 \, (1 - \theta_1)} = 8.9\), which is quite a bit lower than the posterior standard deviation of 21 in the hierarchical model for \(z_1\).
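The arithmetic is easy to verify directly in R:

sqrt(367 * 0.32 * (1 - 0.32));  # about 8.9, the point-estimate binomial sd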

    For each model, the following plot shows each player’s posterior predictive 50% interval for predicted batting average (success rate) in his remaining at bats (trials); the observed success rate in the remainder of the season is shown as a blue dot.

    -
    y_new_25_pool <- c(NA, N);
    +
    # N is number of players
    +y_new_25_pool <- c(NA, N);
     y_new_25_no_pool <- c(NA, N);
     y_new_25_hier <- c(NA, N);
     y_new_25_hier_logit <- c(NA, N);
    @@ -1025,16 +1183,24 @@ 

    Predicting New Observations

 y_new_75_no_pool <- c(NA, N);
 y_new_75_hier <- c(NA, N);
 y_new_75_hier_logit <- c(NA, N);
+
+
+zs_pool <- matrix(fit_pool$draws('z'), M, N);
+zs_no_pool <- matrix(fit_no_pool$draws('z'), M, N);
+zs_hier <- matrix(fit_hier$draws('z'), M, N);
+zs_hier_logit <- matrix(fit_hier_logit$draws('z'), M, N);
+
 for (n in 1:N) {
-  y_new_25_pool[n] <- quantile(ss_pool$z[,n], 0.25)[[1]];
-  y_new_25_no_pool[n] <- quantile(ss_no_pool$z[,n], 0.25)[[1]];
-  y_new_25_hier[n] <- quantile(ss_hier$z[,n], 0.25)[[1]];
-  y_new_25_hier_logit[n] <- quantile(ss_hier_logit$z[,n], 0.25)[[1]];
-  y_new_75_pool[n] <- quantile(ss_pool$z[,n], 0.75)[[1]];
-  y_new_75_no_pool[n] <- quantile(ss_no_pool$z[,n], 0.75)[[1]];
-  y_new_75_hier[n] <- quantile(ss_hier$z[,n], 0.75)[[1]];
-  y_new_75_hier_logit[n] <- quantile(ss_hier_logit$z[,n], 0.75)[[1]];
+  y_new_25_pool[n] <- quantile(zs_pool[, n], 0.25)[[1]];
+  y_new_25_no_pool[n] <- quantile(zs_no_pool[, n], 0.25)[[1]];
+  y_new_25_hier[n] <- quantile(zs_hier[, n], 0.25)[[1]];
+  y_new_25_hier_logit[n] <- quantile(zs_hier_logit[, n], 0.25)[[1]];
+  y_new_75_pool[n] <- quantile(zs_pool[, n], 0.75)[[1]];
+  y_new_75_no_pool[n] <- quantile(zs_no_pool[, n], 0.75)[[1]];
+  y_new_75_hier[n] <- quantile(zs_hier[, n], 0.75)[[1]];
+  y_new_75_hier_logit[n] <- quantile(zs_hier_logit[, n], 0.75)[[1]];
 }
+
 y_new_25_pool <- y_new_25_pool / K_new;
 y_new_25_no_pool <- y_new_25_no_pool / K_new;
 y_new_25_hier <- y_new_25_hier / K_new;
@@ -1049,7 +1215,7 @@

    Predicting New Observations

    model = c(rep("complete pooling", N), rep("no pooling", N), rep("partial pooling", N), - rep("partial pooling (log odds)", N))); + rep("partial pooling, log odds", N))); plot_post_pred <- ggplot(df_post_pred, aes(x=x, y=y)) + geom_point(colour="darkblue", size=1) + @@ -1062,12 +1228,12 @@

    Predicting New Observations

   scale_x_continuous(breaks=c()) +
   xlab("player ID") +
   ylab("batting average") +
-  ggtitle(expression(
-    atop("Posterior Predictions for Batting Average in Remainder of Season",
-         atop("50% posterior predictive intervals (gray bars); observed (blue dots)", ""))));
+  ggtitle("Posterior Predictions for Batting Average in Remainder of Season",
+          subtitle="50% posterior predictive intervals (gray bars); observed (blue dots)") +
+  theme_jupyter();
 plot_post_pred;
    -

    -

    We choose 50% posterior intervals as they are a good single point for checking calibration (see the exercises). Rather than plotting the number of at bats on the vertical axis, we have standardized all the predictions and outcomes to a success rate (aka batting average, in the baseball case). Because each item (player) has a different number of subsequent trials (at bats), the posterior intervals are relatively wider or narrower within the plots for each model (more trials imply narrower intervals for the average). Because each item had the same number of initial observed trials, this variation is primarily due to the uncertainty from the binomial model of outcomes.

    +

    +

    We choose 50% posterior intervals as they are a good single point for checking calibration (see the exercises). Rather than plotting the number of hits on the vertical axis, we have standardized all the predictions and outcomes to a success rate (aka batting average, in the baseball case). Because each item (player) has a different number of subsequent trials (at bats), the posterior intervals are relatively wider or narrower within the plots for each model (more trials imply narrower intervals for the average). Because each item had the same number of initial observed trials, this variation is primarily due to the uncertainty from the binomial model of outcomes.

    Calibration

    With 50% intervals, we expect half of our estimates to lie outside their intervals in a well-calibrated model. If fewer than the expected number of outcomes lie in their estimated posterior intervals, we have reason to believe the model is not well calibrated—its posterior intervals are too narrow. This is also true if too many outcomes lie in their estimated posterior intervals—in this case the intervals are too broad. Of course, there is variation in the tests as the number of items lying in their intervals is itself a random variable (see the exercises), so in practice we are only looking for extreme values as indicators of miscalibration.
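A minimal R sketch of such a check for the basic hierarchical model, assuming y_new holds the observed remaining hits and reusing the zs_hier draws matrix extracted above:

lo <- apply(zs_hier, 2, quantile, probs = 0.25);
hi <- apply(zs_hier, 2, quantile, probs = 0.75);
mean(y_new >= lo & y_new <= hi);  # near 0.5 for a well-calibrated model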

    @@ -1076,12 +1242,12 @@

    Calibration

    Sharpness

Consider the width of the posterior predictive intervals for the items across the models. The model with no pooling has the broadest posterior predictive intervals and the complete pooling model the narrowest. This is to be expected given the number of observations used to fit each model: 45 each in the no pooling case, 810 in the complete pooling case, and effectively something in between for the partial pooling models. Because the log odds model does more pooling, its intervals are slightly narrower than those of the direct hierarchical model.

    -

    For two well calibrated models, the one with the narrower posterior intervals is preferable because its predictions are more tighter. The term introduced for this by Gneiting et al. (2007) is “sharpness.” In the limit, a perfect model would provide a delta function at the true answer with a vanishing posterior interval.

    +

For two well calibrated models, the one with the narrower posterior intervals is preferable because its predictions are tighter. The term introduced for this by Gneiting et al. (2007) is “sharpness.” In the limit, a perfect model would provide a delta function at the true answer with a vanishing posterior interval.

    Data Splitting and Cross-Validation

Not all data sets come with a convenient train/test (observe/predict) split. One obvious way around that problem is to make the split yourself, which is legitimate if the data are exchangeable, as we are assuming for this repeated trial data (that is, each trial’s chance of success is the same). We can even choose split sizes that we care about.

    -

    This process can be repeated, leading to cross-validation. A typical approach is to divide the data up into ten sections and for each section, estimate based on the remaining nine sections and then evaluate on the given section. Each section is referred to as a “fold” and dividing into \(N\) subsets is known as \(N\)-fold cross-validation. In the limit, we can leave out one observation and train on the rest; this is known as “leave-one-out cross-validation” (LOO). See (Gelman et al. 2013) for a discussion of LOO and its relation to (Bayesian) information criteria such as WAIC.

    +

    This process can be repeated, leading to cross-validation. A typical approach is to divide the data up into ten sections and for each section, estimate based on the remaining nine sections and then evaluate on the given section. Each section is referred to as a “fold” and dividing into \(N\) subsets is known as \(N\)-fold cross-validation. In the limit, we can leave out one observation and train on the rest; this is known as “leave-one-out cross-validation” (LOO). See (Gelman et al. 2013) for a discussion of LOO and its relation to (Bayesian) information criteria such as WAIC.
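The bookkeeping for such a split is simple; here is a sketch of assigning the N items to ten folds at random (illustrative only, not part of the case study code):

folds <- sample(rep(1:10, length.out = N));  # random fold labels
for (k in 1:10) {
  train <- which(folds != k);  # fit the model to these items
  test <- which(folds == k);   # evaluate predictive density on these
  # ... refit and evaluate here ...
}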

    @@ -1119,98 +1285,74 @@

    Estimating Event Probabilities

\[ \mathrm{Pr}[\theta_n \geq 0.400 \, | \, y] \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[\theta_n^{(m)} \geq 0.400]. \]

    -

    In Stan, we just declare and define the indicators directly in the generated quantities block.

    -
    generated quantities {
    -  ...
    -  int<lower=0, upper=1> some_ability_gt_350;  // Pr[some theta > 0.35]
    -  int<lower=0, upper=1> avg_gt_400[N];        // Pr[season avg of n] >= 0.400
    -  int<lower=0, upper=1> ability_gt_400[N];    // Pr[chance-of-success of n] >= 0.400
    -  ...
    -  some_ability_gt_350 <- (max(theta) > 0.35);
    -  for (n in 1:N)
    -    avg_gt_400[n] <- (((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400);
    -  for (n in 1:N)
    -    ability_gt_400[n] <- (theta[n] > 0.400);
    -  ...
    -}
    +

    This logic is part of the include file gq-postpred.stan.

    +
      // posterior predictions on new data
    +  array[N] int<lower=0> z = binomial_rng(K_new, theta); 
    +
    +  // Pr[some theta > 0.35]  
    +  int<lower=0, upper=1> some_ability_gt_350 = max(theta) > 0.35;
+  // Pr[ability of player n > 0.400]
+  array[N] int<lower=0, upper=1> ability_gt_400;
+  for (n in 1 : N) {
+    ability_gt_400[n] = theta[n] > 0.400;
+  }
+  // Pr[season average of player n > 0.400]
+  array[N] int<lower=0, upper=1> avg_gt_400;
+  for (n in 1 : N) {
+    // y[n] - observed hits; z[n] - simulated hits in rest of season
+    avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400;
+  }

    The indicator function is not explicitly specified because Stan’s boolean operator greater-than returns either 0 or 1. The only trick to this code is the 0.0 + … in the fraction computation, the purpose of which is to cast the value to real to prevent integer division from rounding.

    As usual, the expectations appear as the posterior means in the summary. We can summarize for all four models again. Only the event indicator variables are printed—the parameter estimates will be the same as before.

    +

The standard deviation and quantiles are not useful here; we do want to see a reasonable effective sample size estimate and no evidence of non-convergence in the form of \(\hat{R}\) values much greater than 1. The NA values in the \(\hat{R}\) column arise when every posterior draw is the same; there is zero sample variance, and so \(\hat{R}\) is not defined.

    pars_to_print <- c("some_ability_gt_350", 
         "avg_gt_400[1]","avg_gt_400[5]", "avg_gt_400[10]", 
         "ability_gt_400[1]", "ability_gt_400[5]", "ability_gt_400[10]");
    -print(fit_pool, pars=pars_to_print, probs=c());
    -
    Inference for Stan model: pool.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -                    mean se_mean sd n_eff Rhat
    -some_ability_gt_350    0       0  0 10000  NaN
    -avg_gt_400[1]          0       0  0 10000  NaN
    -avg_gt_400[5]          0       0  0 10000  NaN
    -avg_gt_400[10]         0       0  0 10000  NaN
    -ability_gt_400[1]      0       0  0 10000  NaN
    -ability_gt_400[5]      0       0  0 10000  NaN
    -ability_gt_400[10]     0       0  0 10000  NaN
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:56:04 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_no_pool, pars=pars_to_print, probs=c());
    -
    Inference for Stan model: no-pool.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -                    mean se_mean   sd n_eff Rhat
    -some_ability_gt_350 0.99       0 0.07  8851    1
    -avg_gt_400[1]       0.51       0 0.50 10000    1
    -avg_gt_400[5]       0.11       0 0.31  7867    1
    -avg_gt_400[10]      0.01       0 0.09  9135    1
    -ability_gt_400[1]   0.51       0 0.50 10000    1
    -ability_gt_400[5]   0.12       0 0.33  7336    1
    -ability_gt_400[10]  0.02       0 0.13  7856    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:56:34 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_hier, pars=pars_to_print, probs=c());
    -
    Inference for Stan model: hier.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -                    mean se_mean   sd n_eff Rhat
    -some_ability_gt_350 0.60    0.01 0.49  2093    1
    -avg_gt_400[1]       0.09    0.00 0.29  7244    1
    -avg_gt_400[5]       0.01    0.00 0.12  6833    1
    -avg_gt_400[10]      0.00    0.00 0.03 10000    1
    -ability_gt_400[1]   0.08    0.00 0.27  6154    1
    -ability_gt_400[5]   0.01    0.00 0.12  6455    1
    -ability_gt_400[10]  0.00    0.00 0.05  9167    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:57:10 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_hier_logit, pars=pars_to_print, probs=c());
    -
    Inference for Stan model: hier-logit.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -                    mean se_mean   sd n_eff Rhat
    -some_ability_gt_350 0.25       0 0.43  7546    1
    -avg_gt_400[1]       0.03       0 0.17 10000    1
    -avg_gt_400[5]       0.00       0 0.07 10000    1
    -avg_gt_400[10]      0.00       0 0.01 10000    1
    -ability_gt_400[1]   0.02       0 0.15  9244    1
    -ability_gt_400[5]   0.00       0 0.07 10000    1
    -ability_gt_400[10]  0.00       0 0.03 10000    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:58:17 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -

    The standard deviation and quantiles are not useful here and the quantiles are suppressed through an empty probs argument to print(); we do want to see a reasonable effective sample size estimate and no evidence of non-convergence in the form for \(\hat{R}\) values much greater than 1. The NaN values in the \(\hat{R}\) column arise when every posterior draw is the same; there is zero sample variance, and so \(\hat{R}\) is not defined.

    + +message("complete pooling (note: all players have estimated ability 0.27)");
    +
    complete pooling (note: all players have estimated ability 0.27)
    +
    fit_pool$print(pars_to_print);
    +
                variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    + some_ability_gt_350 0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    + avg_gt_400[1]       0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    + avg_gt_400[5]       0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    + avg_gt_400[10]      0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    + ability_gt_400[1]   0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    + ability_gt_400[5]   0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    + ability_gt_400[10]  0.00   0.00 0.00 0.00 0.00 0.00   NA       NA       NA
    +
    message("no pooling");
    +
    no pooling
    +
    fit_no_pool$print(pars_to_print);
    +
                variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    + some_ability_gt_350 1.00   1.00 0.06 0.00 1.00 1.00 1.00     9458       NA
    + avg_gt_400[1]       0.52   1.00 0.50 0.00 0.00 1.00 1.00    16113       NA
    + avg_gt_400[5]       0.10   0.00 0.30 0.00 0.00 1.00 1.00     8547       NA
    + avg_gt_400[10]      0.01   0.00 0.09 0.00 0.00 0.00 1.00     9121     9121
    + ability_gt_400[1]   0.51   1.00 0.50 0.00 0.00 1.00 1.00    19086       NA
    + ability_gt_400[5]   0.12   0.00 0.32 0.00 0.00 1.00 1.00     7417       NA
    + ability_gt_400[10]  0.01   0.00 0.12 0.00 0.00 0.00 1.00     8356     8356
    +
    message("partial pooling");
    +
    partial pooling
    +
    fit_hier$print(pars_to_print);
    +
                variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    + some_ability_gt_350 0.58   1.00 0.49 0.00 0.00 1.00 1.00     2173       NA
    + avg_gt_400[1]       0.09   0.00 0.29 0.00 0.00 1.00 1.00     6475       NA
    + avg_gt_400[5]       0.01   0.00 0.11 0.00 0.00 0.00 1.00     8169     8169
    + avg_gt_400[10]      0.00   0.00 0.02 0.00 0.00 0.00 1.00    10020    10020
    + ability_gt_400[1]   0.08   0.00 0.27 0.00 0.00 1.00 1.00     6064       NA
    + ability_gt_400[5]   0.01   0.00 0.12 0.00 0.00 0.00 1.00     7027     7027
    + ability_gt_400[10]  0.00   0.00 0.03 0.00 0.00 0.00 1.00    10037    10037
    +
    message("partial pooling, log odds");
    +
    partial pooling, log odds
    +
    fit_hier_logit$print(pars_to_print);
    +
                variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    + some_ability_gt_350 0.25   0.00 0.44 0.00 0.00 1.00 1.00     7105       NA
    + avg_gt_400[1]       0.03   0.00 0.18 0.00 0.00 0.00 1.00     8871     8871
    + avg_gt_400[5]       0.00   0.00 0.06 0.00 0.00 0.00 1.00     8778     8778
    + avg_gt_400[10]      0.00   0.00 0.02 0.00 0.00 0.00 1.00    10022    10022
    + ability_gt_400[1]   0.02   0.00 0.15 0.00 0.00 0.00 1.00     9064     9064
    + ability_gt_400[5]   0.00   0.00 0.07 0.00 0.00 0.00 1.00     9635     9635
    + ability_gt_400[10]  0.00   0.00 0.02 0.00 0.00 0.00 1.00    10024    10024

    These results show that the probability of batting 0.400 or better for the season is a different question than asking if the player’s ability is 0.400 or better; for example, with respect to the basic partial pooling model, there is roughly an estimated 10% chance of Roberto Clemente (\(n = 1\)) batting 0.400 or better for the season, but only an estimated 8% chance that he has ability greater than 0.400. This is again due to there being two sources of uncertainty, that from the estimate of the chance of success in the posterior and that from the remaining binary trials.

    @@ -1220,7 +1362,7 @@

    Multiple Comparisons

    For example, suppose we have our 18 players with ability parameters \(\theta_n\) and we have \(N\) null hypotheses of the form \(H_0^n: \theta_n < 0.350\). Now suppose we evaluate each of these 18 hypotheses independently at the conventional \(p = 0.05\) significance level, giving each a 5% chance of rejecting the null hypothesis in error. When we run all 18 hypothesis tests, the overall chance of falsely rejecting at least one of the null hypotheses is a whopping \(1 - (1 - 0.05)^{18} = 0.60\).

    The traditional solution to this problem is to apply a Bonferroni adjustment to control the false rejection rate; the typical adjustment is to divide the \(p\)-value by the number of hypothesis tests in the “family” (that is, the collective test being done). Here that sets the rate to \(p = 0.05/18\), or approximately \(p = 0.003\), and results in a slightly less than 5% chance of falsely rejecting a null hypothesis in error.
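Both numbers are straightforward to check in R:

1 - (1 - 0.05)^18;  # about 0.60, chance of at least one false rejection
0.05 / 18;          # about 0.003, the Bonferroni-adjusted threshold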

    Although the Bonferroni correction does reduce the overall chance of falsely rejecting a null hypothesis, it also reduces the statistical power of the test to the same degree. This means that many null hypotheses will fail to be rejected in error.

    -

    Rather than doing classical multiple comparison adjustments to adjust for false-discovery rate, such as a Bonferroni correction, Gelman et al. (2012) suggest using a hierarchical model to perform partial pooling instead. As already shown, hierarchical models partially pool the data, which pulls estimates toward the population mean with a strength determined by the amount of observed variation in the population (see also Figure 2 of (Gelman et al. 2012)). This automatically reduces the false-discovery rate, though not in a way that is intrinsically calibrated to false discovery, which is good, because reducing the overall false discovery rate in and of itself reduces the true discovery rate at the same time.

    +

Rather than applying classical multiple-comparison adjustments, such as a Bonferroni correction, to control the false-discovery rate, Gelman et al. (2012) suggest using a hierarchical model to perform partial pooling instead. As already shown, hierarchical models partially pool the data, which pulls estimates toward the population mean with a strength determined by the amount of observed variation in the population (see also Figure 2 of (Gelman et al. 2012)). This automatically reduces the false-discovery rate, though not in a way that is intrinsically calibrated to false discovery, which is good, because reducing the overall false discovery rate in and of itself reduces the true discovery rate at the same time.

    The generated quantity some_ability_gt_350 will be set to 1 if the maximum ability estimate in \(\theta\) is greater than 0.35. And thus the posterior mean of this generated quantity will be the event probability

\[ \mathrm{Pr}[\mathrm{max}(\theta) > 0.350] \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[\mathrm{max}(\theta^{(m)}) > 0.350]. \]

@@ -1234,60 +1376,53 @@

    Multiple Comparisons

    Ranking

    In addition to multiple comparisons, we can use the simultaneous estimation of the ability parameters to rank the items. In this section, we rank ballplayers by (estimated) chance of success (i.e., batting ability).

To implement ranking in Stan, we define a generated quantity rnk to hold the ranks; that is, rnk[2] is the rank (a value ranging from \(1\) to \(N\)) of the second player. First, we define a local variable dsc and fill it with the indices of theta sorted into descending order.

    -
    generated quantities {
    -  ...
    -  int<lower=1, upper=N> rnk[N];   // rnk[n] is rank of player n
    -  ...
    +

    We encapsulate this logic into its own program file “gq-ranking.stan” and include it in the model’s generated quantities block.

    +
      array[N] int<lower=1, upper=N> rnk; // rank of player n
       {
    -    int dsc[N];
    -    dsc <- sort_indices_desc(theta);
    -    for (n in 1:N)
    -      rnk[dsc[n]] <- n;
    +    array[N] int dsc;
    +    dsc = sort_indices_desc(theta);
    +    for (n in 1 : N) {
    +      rnk[dsc[n]] = n;
    +    }
       }
    -  ...
    -}
+  array[N] int<lower=0, upper=1> is_best; // Pr[player n highest chance of success]
+  for (n in 1 : N) {
+    is_best[n] = rnk[n] == 1;
+  }

    After the call to sort_indices_desc, dsc[n] holds the index of the \(n\)-th best item. For example, if rnk[2] == 5, so that player with identifier 2 is ranked 5th, then dsc[5] == 2, because the fifth rank player is the player with identifier 2. The final loop just puts the ranks into their proper place in the rnk array.
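The same index inversion can be illustrated in R, where order() plays the role of sort_indices_desc() (a toy example, not part of the case study code):

theta <- c(0.31, 0.27, 0.35);            # toy abilities
dsc <- order(theta, decreasing = TRUE);  # dsc[n] is the index of the n-th best
rnk <- integer(length(theta));
rnk[dsc] <- seq_along(theta);            # invert the permutation
rnk;                                     # 2 3 1: the third item ranks first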

    In this example, the nested brackets produce a scope in which the integer array variable dsc can be declared as a local variable to hold the sorted indices. Unlike variables declared at the top of the generated quantities block, local variables are not saved every iteration.

    Of course, ranking players by ability makes no sense for the complete pooling model, where every player is assumed to have the same ability.

We can print just the ranks and the 80% central interval for the basic hierarchical model.

    -
    print(fit_hier, "rnk", probs=c(0.1, 0.5, 0.9));
    -
    Inference for Stan model: hier.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -         mean se_mean   sd 10% 50% 90% n_eff Rhat
    -rnk[1]   4.40    0.04 3.67   1   3  10 10000    1
    -rnk[2]   4.99    0.04 3.81   1   4  11 10000    1
    -rnk[3]   5.84    0.04 4.18   1   5  12 10000    1
    -rnk[4]   6.73    0.04 4.38   2   6  13 10000    1
    -rnk[5]   7.49    0.04 4.45   2   7  14 10000    1
    -rnk[6]   7.44    0.04 4.44   2   7  14 10000    1
    -rnk[7]   8.32    0.05 4.61   2   8  15 10000    1
    -rnk[8]   9.31    0.05 4.66   3   9  16 10000    1
    -rnk[9]  10.33    0.05 4.66   4  11  17 10000    1
    -rnk[10] 10.29    0.05 4.62   4  11  16 10000    1
    -rnk[11] 11.28    0.05 4.55   5  12  17 10000    1
    -rnk[12] 11.21    0.05 4.61   4  12  17 10000    1
    -rnk[13] 11.25    0.05 4.53   5  12  17 10000    1
    -rnk[14] 11.33    0.05 4.59   5  12  17 10000    1
    -rnk[15] 11.30    0.05 4.56   5  12  17 10000    1
    -rnk[16] 12.30    0.04 4.44   6  13  18 10000    1
    -rnk[17] 13.17    0.04 4.32   7  14  18 10000    1
    -rnk[18] 14.01    0.04 3.97   8  15  18 10000    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:57:10 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    +
    fit_hier$print("rnk", "mean", "median", ~quantile(.x, probs = c(0.1, 0.9)), max_rows=18);
    +
     variable  mean median  10%   90%
    +  rnk[1]   4.45   3.00 1.00 10.00
    +  rnk[2]   5.09   4.00 1.00 11.00
    +  rnk[3]   5.85   5.00 1.00 12.00
    +  rnk[4]   6.63   6.00 2.00 13.00
    +  rnk[5]   7.48   7.00 2.00 14.00
    +  rnk[6]   7.47   7.00 2.00 14.00
    +  rnk[7]   8.45   8.00 2.00 15.00
    +  rnk[8]   9.34   9.00 3.00 16.00
    +  rnk[9]  10.30  11.00 4.00 16.00
    +  rnk[10] 10.38  11.00 4.00 17.00
    +  rnk[11] 11.21  12.00 5.00 17.00
    +  rnk[12] 11.34  12.00 5.00 17.00
    +  rnk[13] 11.20  12.00 4.00 17.00
    +  rnk[14] 11.28  12.00 4.00 17.00
    +  rnk[15] 11.20  12.00 5.00 17.00
    +  rnk[16] 12.22  13.00 6.00 18.00
    +  rnk[17] 13.12  14.00 7.00 18.00
    +  rnk[18] 13.98  15.00 8.00 18.00

    It is again abundantly clear from the posterior intervals that our uncertainty is very great after only 45 at bats.

    In the original Volume I BUGS example (see OpenBUGS: Surgical example) of surgical mortality, the posterior distribution over ranks was plotted for each hospital. It is now straightforward to reproduce that figure here for the baseball data.

    -
    library(ggplot2);
    +
    rnk_hier <- matrix(fit_hier$draws('rnk'), M, N);
     df_rank <- data.frame(list(name = rep(as.character(df[[1,2]]), M),
    -                           rank = ss_hier$rnk[, 1]));
    +                           rank = rnk_hier[, 1]));
    +
     for (n in 2:N) {
       df_rank <- rbind(df_rank,
                        data.frame(list(name = rep(as.character(df[[n,2]]), M),
    -                              rank = ss_hier$rnk[, n])));
    +                              rank = rnk_hier[, n])));
     }
     rank_plot <-
       ggplot(df_rank, aes(rank)) +
    @@ -1296,9 +1431,11 @@ 

    Ranking

   scale_x_discrete(limits=c(1, 5, 10, 15)) +
   scale_y_discrete(name="posterior probability",
                    breaks=c(0, 0.1 * M, 0.2 * M),
                    labels=c("0.0", "0.1", "0.2")) +
-  ggtitle("Rankings for Partial Pooling Model");
+  ggtitle("Rankings for Partial Pooling Model") +
+  theme_jupyter();
+#options(repr.plot.width=20, repr.plot.height=16)
 rank_plot;
    -

    +

    We have followed the original BUGS presentation where the normalized posterior frequency (i.e., the frequency divided by total count) is reported as a probability on the y axis. This plot will look very different for the four different models (see exercises).

    Who has the Highest Chance of Success?

    @@ -1313,29 +1450,29 @@

    Who has the Highest Chance of Success?

    \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[\theta^{(m)}_n = \mathrm{max}(\theta^{(m)})]. \]

As with our other models, the partial pooling mitigates the implicit multiple comparisons being done to calculate the probabilities of rankings. Contrast this with an approach that does a pairwise significance test and then applies a false-discovery correction.

    -

    We can compute this straightforwardly using the rank data we have already computed or we could compute it directly as above. Because \(\mathrm{Pr}[\theta_n = \theta_{n'}] = 0\) for \(n \neq n'\), we don’t have to worry about ties.

    -

    We define a new generated quantity for the value of the indicator and define it using the already-computed ranks.

    -
    generated quantities {
    -  ...
    -  int<lower=0, upper=1> is_best[N];  // Pr[player n highest chance of success]
    -  ...
    -  for (n in 1:N)
    -    is_best[n] <- (rnk[n] == 1);
    -  ...
    -}
    +

    We can compute this straightforwardly using the rank data we have already computed or we could compute it directly as above. Because \(\mathrm{Pr}[\theta_n = \theta_{n'}] = 0\) for \(n \neq n'\), we don’t have to worry about ties.

    +

    The generated quantities variable is_best is defined using the already-computed ranks.

    +
      array[N] int<lower=0, upper=1> is_best; // Pr[player n highest chance of success]
    +  for (n in 1 : N) {
    +    is_best[n] = rnk[n] == 1;
    +  }

    This means that is_best[n] will be 1 if player n is the best player. This calculation, which is expressed with ordinary arithmetic operations and assignment, makes clear the way in which variables in Stan behave like random variables. These statements are executed and their expressions evaluated each iteration of the sampler. By saving the results for each draw \(m \in 1{:}M\) in the posterior simulation, we are able to use MCMC to estimate expectations, variances, and quantiles of these random variables. In this case, the expectation of is_best is an event probability.
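Because the draws behave like random variables, the same probabilities could also be computed directly in R from the saved ability draws, without the rnk variable; a sketch reusing the draws-extraction pattern from above:

thetas_hier <- matrix(fit_hier$draws('theta'), M, N);  # M x N ability draws
row_max <- apply(thetas_hier, 1, max);                 # per-draw maximum
colMeans(thetas_hier == row_max);                      # Pr[player n is best]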

    We can then plot the results for the four models.

    -
    df_is_best_for <- function(name, ss) {
    +
    df_is_best_for <- function(name, draws) {
       is_best <- rep(NA, N);
       for (n in 1:N) {
    -    is_best[n] <- mean(ss$is_best[,n]);
    +    is_best[n] <- mean(draws[,n]);
       }
       return(data.frame(list(item=1:N, is_best=is_best, model=name)));
     }
     
    -df_is_best <-  rbind(df_is_best_for("no pool", ss_no_pool),
    -                     df_is_best_for("hier", ss_hier),
    -                     df_is_best_for("hier (log odds)", ss_hier_logit));
    +best_no_pool <- matrix(fit_no_pool$draws('is_best'), M, N);
    +best_hier <- matrix(fit_hier$draws('is_best'), M, N);
    +best_hier_logit <- matrix(fit_hier_logit$draws('is_best'), M, N);
    +
    +df_is_best <-  rbind(df_is_best_for("no pooling", best_no_pool),
    +                     df_is_best_for("partial pooling", best_hier),
    +                     df_is_best_for("partial pooling, log odds", best_hier_logit));
     
     is_best_plot <-
       ggplot(df_is_best, aes(x=item, y=is_best)) +
    @@ -1343,9 +1480,11 @@ 

    Who has the Highest Chance of Success?

   facet_wrap(~ model) +
   scale_y_continuous(name = "Pr[player is best]") +
   scale_x_discrete(name="player", breaks=c(1, 5, 10, 15)) +
-  ggtitle("Who is the Best Player?");
+  ggtitle("Who is the Best Player?") +
+  theme_jupyter();
+#options(repr.plot.width=12, repr.plot.height=4)
 is_best_plot;
    -

    +

    The hierarchical plot is simply the first column of each of the entries in the previous plot in the order they appear; the other plots repeat the process for the no pooling and log-odds hierarchical model.

    This question of which player has the highest chance of success (batting ability) doesn’t even make sense in the complete pooling model, because the chance of success parameters are all the same by definition. In the remaining three models, the amount of pooling directly determines the probabilities of being the best player. For example, Roberto Clemente, the top performing player in the first 45 at bats, has an estimated 35%, 25%, and 18% chance of being the best player according to the no-pooling, partial pooling, and partial pooling of log odds models; the tenth best performing player, Ron Swaboda, has an estimated 1%, 2% and 3% chance according to the same models. That is, the probability of being best goes down for high performing players with more pooling, whereas it goes up for below-average players.

    @@ -1356,14 +1495,14 @@

    Posterior p-Values

    In some cases, we are willing to work with models that are wrong in some measurable aspects, but accurately capture quantities of interest for an application. That is, it’s possible for a model to capture some, but not all, aspects of a data set, and still be useful.

    Test Statistics and Bayesian \(p\)-Values

    -

    A test statistic \(T\) is a function from data to a real value. Following (Gelman et al. 2013), we will concentrate on four specific test statistics for repeated binary trial data (though these choices are fairly general): minimum value, maximum value, sample mean, and sample standard deviation.

    +

    A test statistic \(T\) is a function from data to a real value. Following (Gelman et al. 2013), we will concentrate on four specific test statistics for repeated binary trial data (though these choices are fairly general): minimum value, maximum value, sample mean, and sample standard deviation.

    Given a test statistic \(T\) and data \(y\), the Bayesian \(p\)-value has a direct definition as a probability,

    \[ p_B = \mathrm{Pr}[T(y^{\mathrm{rep}}) \geq T(y) \, | \, y]. \]

    Bayesian \(p\)-values, like their traditional counterparts, are probabilities, but not probabilities that a model is true. They simply measure discrepancies between the observed data and what we would expect if the model is true.

    Values of Bayesian \(p\)-values near 0 or 1 indicate that the data \(y\) used to estimate the model is unlikely to have been generated by the estimated model. As with other forms of full Bayesian inference, our estimate is the full posterior, not just a point estimate.
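Given draws of a test statistic for replicated data, the estimate is a one-liner in R; for example, for the standard deviation statistic in the basic hierarchical model (a sketch assuming the fit object and data vector y from above):

sd_y_rep_draws <- as.numeric(fit_hier$draws('sd_y_rep'));
mean(sd_y_rep_draws >= sd(y));  # matches the posterior mean of p_sd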

    -

    As with other Bayesain inferences, we average over the posterior rather than working from a point estimate of the parameters. Expanding this as an expectation of an indicator function,

    +

    As with other Bayesian inferences, we average over the posterior rather than working from a point estimate of the parameters. Expanding this as an expectation of an indicator function,

\[ p_B \ = \ \int_{\Theta} \int_{Y^{\mathrm{rep}}} \mathrm{I}[T(y^{\mathrm{rep}}) \geq T(y)] \ p(y^{\mathrm{rep}} \, | \, \theta) \ p(\theta \, | \, y) \ \mathrm{d}y^{\mathrm{rep}} \ \mathrm{d}\theta. \]

@@ -1374,7 +1513,7 @@

    Test Statistics and Bayesian \(p\)-Values

    We treat \(y^{\mathrm{rep}}\) as a parameter in parallel with \(\theta\), integrating over possible values \(y^{\mathrm{rep}} \in Y^{\mathrm{rep}}\). As usual, we use the integration sign in a general way intended to include summation, as with the discrete variable \(y^{\mathrm{rep}}\).

    -

    The formualation as an expectation leads to the obvious MCMC calculation based on posterior draws \(y^{\mathrm{rep} (m)}\) for \(m \in 1{:}M\),

    +

    The formulation as an expectation leads to the obvious MCMC calculation based on posterior draws \(y^{\mathrm{rep} (m)}\) for \(m \in 1{:}M\),

\[ p_B \ \approx \ \frac{1}{M} \, \sum_{m=1}^M \mathrm{I}[T(y^{\mathrm{rep} (m)}) \geq T(y)]. \]

@@ -1385,116 +1524,68 @@

    Test Statistics and Bayesian \(p\)-Values

    In Stan, the test statistics are first defined for the observed data. Because they are functions purely of variables defined in the data block, they can be defined in the transformed data block so that they will be computed only once when the data is read in. In general, it is much more efficient to define variables as transformed data if at all possible.

    transformed data {
    -  real min_y;   // minimum successes
    -  real max_y;   // maximum successes
    -  real mean_y;  // sample mean successes
    -  real sd_y;    // sample std dev successes
    -
    -  min_y <- min(y);
    -  max_y <- max(y);
    -  mean_y <- mean(to_vector(y));
    -  sd_y <- sd(to_vector(y));
    -}
    -

    The code to generate the replicated data is in the generated quantities block.

    -
    generated quantities {
    -  ...
    -  int<lower=0> y_rep[N];      // replications for existing items
    -  ...
    -  for (n in 1:N)
    -    y_rep[n] <- binomial_rng(K[n], theta[n]);
    -  ...
    -}
    -

    Then the test statistics are defined in the generated quantities block.

    -
    generated quantities {
    -  ...
    -  real<lower=0> min_y_rep;   // posterior predictive min replicated successes
    -  real<lower=0> max_y_rep;   // posterior predictive max replicated successes
    -  real<lower=0> mean_y_rep;  // posterior predictive sample mean replicated successes
    -  real<lower=0> sd_y_rep;    // posterior predictive sample std dev replicated successes
    -  ...
    -  min_y_rep <- min(y_rep);
    -  max_y_rep <- max(y_rep);
    -  mean_y_rep <- mean(to_vector(y_rep));
    -  sd_y_rep <- sd(to_vector(y_rep));
    -  ...
    -}
    -

    Finally, the actual \(p\)-value indicators are computed and assigned to integer values. The calls to to_vector() convert an array of integers to an array of real values so that they are appropriately typed to be the input to the standard deviation calculation.

    -
    generated quantities {
    -  ...
    -  int<lower=0, upper=1> p_min;  // posterior predictive p-values
    -  int<lower=0, upper=1> p_max;
    -  int<lower=0, upper=1> p_mean;
    -  int<lower=0, upper=1> p_sd;
    -  ...
    -  p_min <- (min_y_rep >= min_y);
    -  p_max <- (max_y_rep >= max_y);
    -  p_mean <- (mean_y_rep >= mean_y);
    -  p_sd <- (sd_y_rep >= sd_y);
    -  ...
    +  real min_y = min(y);   // minimum successes
    +  real max_y = max(y);   // maximum successes
    +  real mean_y = mean(to_vector(y));  // sample mean successes
    +  real sd_y = sd(to_vector(y));    // sample std dev successes
     }
    +

The calls to to_vector() convert an array of integers to a vector of real values so that they are appropriately typed as input to the standard deviation calculation.

    +

Then the posterior predictive test statistics are defined in the generated quantities block and the actual \(p\)-value indicators are computed and assigned to integer values. The last part of the included file “gq-postpred.stan” computes these values.

    +
      // replications for existing items  
    +  array[N] int<lower=0> y_rep = binomial_rng(K, theta);
    +  
    +  // posterior predictive min replicated successes, test stat p_val
    +  real<lower=0> min_y_rep = min(y_rep);
    +  int<lower=0, upper=1> p_min = min_y_rep >= min_y;
    +
    +  // posterior predictive max replicated successes, test stat p_val
    +  real<lower=0> max_y_rep = max(y_rep); 
    +  int<lower=0, upper=1> p_max = max_y_rep >= max_y;
    +
    +  // posterior predictive sample mean replicated successes, test stat p_val
    +  real<lower=0> mean_y_rep = mean(to_vector(y_rep));
    +  int<lower=0, upper=1> p_mean = mean_y_rep >= mean_y;
    +
    +  // posterior predictive sample std dev replicated successes, test stat p_val
    +  real<lower=0> sd_y_rep = sd(to_vector(y_rep));
    +  int<lower=0, upper=1> p_sd = sd_y_rep >= sd_y;

    The fit from the Stan program can then be used to display the Bayesian \(p\)-values for each of the models.

    -
    print(fit_pool, c("p_min", "p_max", "p_mean", "p_sd"), probs=c());
    -
    Inference for Stan model: pool.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -       mean se_mean   sd n_eff Rhat
    -p_min  0.60    0.01 0.49  7887    1
    -p_max  0.47    0.01 0.50  8482    1
    -p_mean 0.52    0.01 0.50  6101    1
    -p_sd   0.33    0.00 0.47 10000    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:56:04 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_no_pool, c("p_min", "p_max", "p_mean", "p_sd"), probs=c());
    -
    Inference for Stan model: no-pool.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -       mean se_mean   sd n_eff Rhat
    -p_min  0.09       0 0.29  9548    1
    -p_max  0.97       0 0.17 10000    1
    -p_mean 0.69       0 0.46 10000    1
    -p_sd   0.99       0 0.08  9025    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:56:34 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_hier, c("p_min", "p_max", "p_mean", "p_sd"), probs=c());
    -
    Inference for Stan model: hier.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -       mean se_mean   sd n_eff Rhat
    -p_min  0.33    0.01 0.47  6013    1
    -p_max  0.75    0.01 0.43  5627    1
    -p_mean 0.53    0.01 0.50  6004    1
    -p_sd   0.77    0.01 0.42  4326    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:57:10 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_hier_logit, c("p_min", "p_max", "p_mean", "p_sd"), probs=c());
    -
    Inference for Stan model: hier-logit.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -       mean se_mean   sd n_eff Rhat
    -p_min  0.46    0.00 0.50 10000    1
    -p_max  0.60    0.00 0.49 10000    1
    -p_mean 0.50    0.01 0.50 10000    1
    -p_sd   0.55    0.01 0.50  9180    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:58:17 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    +
    pars_to_print <- c("p_min", "p_max", "p_mean", "p_sd");
    +
    +message("complete pooling");
    +
    complete pooling
    +
    fit_pool$print(pars_to_print);
    +
     variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    +   p_min  0.60   1.00 0.49 0.00 0.00 1.00 1.00     8891       NA
    +   p_max  0.48   0.00 0.50 0.00 0.00 1.00 1.00     8546       NA
    +   p_mean 0.52   1.00 0.50 0.00 0.00 1.00 1.00     6354       NA
    +   p_sd   0.32   0.00 0.47 0.00 0.00 1.00 1.00    10239       NA
    +
    message("no pooling");
    +
    no pooling
    +
    fit_no_pool$print(pars_to_print);
    +
     variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    +   p_min  0.10   0.00 0.30 0.00 0.00 1.00 1.00     9160       NA
    +   p_max  0.97   1.00 0.17 0.00 1.00 1.00 1.00     9683       NA
    +   p_mean 0.68   1.00 0.46 0.00 0.00 1.00 1.00    12192       NA
    +   p_sd   0.99   1.00 0.09 0.00 1.00 1.00 1.00     9356       NA
    +
    message("partial pooling");
    +
    partial pooling
    +
    fit_hier$print(pars_to_print);
    +
     variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    +   p_min  0.34   0.00 0.47 0.00 0.00 1.00 1.00     6387       NA
    +   p_max  0.74   1.00 0.44 0.00 0.00 1.00 1.00     6984       NA
    +   p_mean 0.53   1.00 0.50 0.00 0.00 1.00 1.00     6518       NA
    +   p_sd   0.76   1.00 0.42 0.00 0.00 1.00 1.00     4905       NA
    +
    message("partial pooling, log odds");
    +
    partial pooling, log odds
    +
    fit_hier_logit$print(pars_to_print);
    +
     variable mean median   sd  mad   q5  q95 rhat ess_bulk ess_tail
    +   p_min  0.46   0.00 0.50 0.00 0.00 1.00 1.00     9242       NA
    +   p_max  0.61   1.00 0.49 0.00 0.00 1.00 1.00     9923       NA
    +   p_mean 0.51   1.00 0.50 0.00 0.00 1.00 1.00    11860       NA
    +   p_sd   0.56   1.00 0.50 0.00 0.00 1.00 1.00     9187       NA

The only worrisomely extreme value is the \(p\)-value for the standard deviation in the no-pooling model, where 99% of the simulated data sets under the model had a standard deviation of the number of hits greater than that of the actual data. Note that we could have constructed our posterior \(p\)-values to run in the opposite direction, or formulated two-sided tests.
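As a concrete sketch of the two-sided variant (not part of the original code), we can compute it in R directly from the posterior draws; this assumes the CmdStanR fit fit_no_pool and the observed success counts y are in scope.

# sketch: two-sided Bayesian p-value for the sd test statistic
sd_y <- sd(y);                                         # observed test statistic
sd_y_rep <- as.vector(fit_no_pool$draws("sd_y_rep"));  # one value per draw
p_right <- mean(sd_y_rep >= sd_y);                     # one-sided, as in the model
p_two_sided <- 2 * min(p_right, 1 - p_right);          # fold the two tails together
p_two_sided;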

We can easily reproduce Figure 6.12 from (Gelman et al. 2013), which shows the posterior predictive distribution for the test statistic, the observed value as a vertical line, and the p-value for each of the tests. Here is the plot for the basic hierarchical model.

    y_min <- min(y);
     y_max <- max(y);
     y_mean <- mean(y);

Test Statistics and Bayesian \(p\)-Values


    Graphical Posterior Predictive Checks

In this section, we will compare the simulated data from the two different hierarchical models, one with the Beta prior on the chance-of-success scale and one with a normal prior on the log-odds scale. Following the advice of Gelman et al. (2013), we will take the fitted parameters of the data set and generate replicated data sets, then compare the replicated data sets visually to the actual data set.

As discussed by Gelman et al. (2013, p. 143), there is a choice of which parameters to fix and which to simulate. We can generate new trials using fitted chance-of-success parameters (\(\alpha\)), or we can generate new items using the population parameters (\(\mu\) and \(\sigma\)). We’ll provide both in the model below for comparison. In both cases, we simulate from posterior draws, not from the posterior mean or other point estimates; Stan makes this easy with its generated quantities block. Then we’ll plot several of those versus the real data.

    The Stan code to generate replicated data for existing items (y_rep) and for new items drawn from the population distribution (y_pop_rep) is in the generated quantities block.

    generated quantities {  
       ...


    In this case, the Beta variate is being generated inside the binomial pseudorandom number generator.
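For intuition, here is a small R analogue of that nested call (a sketch, not the case study's code; new_item_rep is a hypothetical helper): a new item's ability is drawn from the population beta distribution, and its success count from the binomial given that ability.

# sketch: nested RNG for a new item under the beta population model
new_item_rep <- function(K, phi, kappa) {
  theta_new <- rbeta(1, phi * kappa, (1 - phi) * kappa);  # ability from population
  rbinom(1, K, theta_new);                                # successes given ability
}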

    We can now print the replicated values y_rep in both the hierarchical and hierarchical logit models.

    -
    print(fit_hier, c("y_rep", "y_pop_rep[1]"), probs=c(0.1, 0.5, 0.9));
    -
    Inference for Stan model: hier.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -              mean se_mean   sd 10% 50% 90% n_eff Rhat
    -y_rep[1]     14.51    0.05 3.90  10  14  20  5705    1
    -y_rep[2]     14.12    0.05 3.84   9  14  19  5432    1
    -y_rep[3]     13.69    0.04 3.80   9  14  19  7938    1
    -y_rep[4]     13.21    0.04 3.66   9  13  18  8316    1
    -y_rep[5]     12.87    0.04 3.65   8  13  18 10000    1
    -y_rep[6]     12.85    0.04 3.67   8  13  18 10000    1
    -y_rep[7]     12.44    0.04 3.61   8  12  17 10000    1
    -y_rep[8]     12.03    0.04 3.53   8  12  17 10000    1
    -y_rep[9]     11.60    0.04 3.50   7  11  16  9593    1
    -y_rep[10]    11.61    0.04 3.52   7  12  16  8953    1
    -y_rep[11]    11.20    0.03 3.46   7  11  16 10000    1
    -y_rep[12]    11.25    0.04 3.50   7  11  16 10000    1
    -y_rep[13]    11.20    0.03 3.48   7  11  16 10000    1
    -y_rep[14]    11.22    0.04 3.51   7  11  16 10000    1
    -y_rep[15]    11.23    0.03 3.46   7  11  16 10000    1
    -y_rep[16]    10.78    0.04 3.40   7  11  15  8451    1
    -y_rep[17]    10.37    0.04 3.42   6  10  15  7630    1
    -y_rep[18]     9.94    0.04 3.40   6  10  14  7187    1
    -y_pop_rep[1] 12.04    0.04 4.14   7  12  17  9664    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:57:10 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    -
    print(fit_hier_logit, c("y_rep", "y_pop_rep[1]"), probs=c(0.1, 0.5, 0.9));
    -
    Inference for Stan model: hier-logit.
    -4 chains, each with iter=5000; warmup=2500; thin=1; 
    -post-warmup draws per chain=2500, total post-warmup draws=10000.
    -
    -              mean se_mean   sd 10% 50% 90% n_eff Rhat
    -y_rep[1]     13.24    0.04 3.57   9  13  18 10000    1
    -y_rep[2]     12.94    0.04 3.52   9  13  18 10000    1
    -y_rep[3]     12.79    0.03 3.45   9  13  17 10000    1
    -y_rep[4]     12.55    0.03 3.42   8  12  17 10000    1
    -y_rep[5]     12.34    0.03 3.34   8  12  17 10000    1
    -y_rep[6]     12.36    0.03 3.36   8  12  17 10000    1
    -y_rep[7]     12.12    0.03 3.29   8  12  16 10000    1
    -y_rep[8]     11.92    0.03 3.33   8  12  16 10000    1
    -y_rep[9]     11.78    0.03 3.28   8  12  16 10000    1
    -y_rep[10]    11.79    0.03 3.28   8  12  16 10000    1
    -y_rep[11]    11.54    0.03 3.29   7  11  16 10000    1
    -y_rep[12]    11.51    0.03 3.24   7  11  16 10000    1
    -y_rep[13]    11.54    0.03 3.25   7  11  16 10000    1
    -y_rep[14]    11.50    0.03 3.23   7  11  16 10000    1
    -y_rep[15]    11.53    0.03 3.22   7  11  16 10000    1
    -y_rep[16]    11.30    0.03 3.27   7  11  16 10000    1
    -y_rep[17]    11.10    0.03 3.26   7  11  15 10000    1
    -y_rep[18]    10.91    0.03 3.30   7  11  15 10000    1
    -y_pop_rep[1] 11.93    0.04 3.51   8  12  17 10000    1
    -
    -Samples were drawn using NUTS(diag_e) at Sat Jan 30 20:58:17 2016.
    -For each parameter, n_eff is a crude measure of effective sample size,
    -and Rhat is the potential scale reduction factor on split chains (at 
    -convergence, Rhat=1).
    +
    message("partial pooling");
    +
    partial pooling
    +
    fit_hier$print(c("y_rep", "y_pop_rep[1]"), max_rows=18);
    +
      variable  mean median   sd  mad   q5   q95 rhat ess_bulk ess_tail
    + y_rep[1]  14.47  14.00 3.86 4.45 8.00 21.00 1.00     6982     9073
    + y_rep[2]  14.04  14.00 3.80 4.45 8.00 21.00 1.00     8664     9228
    + y_rep[3]  13.66  13.00 3.76 4.45 8.00 20.00 1.00     8754     9005
    + y_rep[4]  13.23  13.00 3.68 2.97 7.00 20.00 1.00     9174     9912
    + y_rep[5]  12.86  13.00 3.63 2.97 7.00 19.00 1.00     9120     9442
    + y_rep[6]  12.84  13.00 3.64 2.97 7.00 19.00 1.00     9264     8838
    + y_rep[7]  12.44  12.00 3.54 2.97 7.00 19.00 1.00    10410     9684
    + y_rep[8]  12.06  12.00 3.52 2.97 7.00 18.00 1.00     9608     9562
    + y_rep[9]  11.64  12.00 3.50 2.97 6.00 18.00 1.00    10304     9835
    + y_rep[10] 11.63  12.00 3.49 2.97 6.00 18.00 1.00     9544     8615
    + y_rep[11] 11.21  11.00 3.44 2.97 6.00 17.00 1.00     9778     9235
    + y_rep[12] 11.23  11.00 3.46 2.97 6.00 17.00 1.00     9337     9286
    + y_rep[13] 11.28  11.00 3.47 2.97 6.00 17.00 1.00     9885     9006
    + y_rep[14] 11.20  11.00 3.51 2.97 6.00 17.00 1.00     8784     9138
    + y_rep[15] 11.22  11.00 3.45 2.97 6.00 17.00 1.00    10199     8636
    + y_rep[16] 10.87  11.00 3.46 2.97 5.00 17.00 1.00     9397     9207
    + y_rep[17] 10.44  10.00 3.43 2.97 5.00 16.00 1.00     8604     8622
    + y_rep[18] 10.05  10.00 3.40 2.97 5.00 16.00 1.00     6948     8705
    +
    + # showing 18 of 19 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)
    +
    message("partial pooling, log odds");
    +
    partial pooling, log odds
    +
    fit_hier_logit$print(c("y_rep", "y_pop_rep[1]"), max_rows=18);
    +
      variable  mean median   sd  mad   q5   q95 rhat ess_bulk ess_tail
    + y_rep[1]  13.21  13.00 3.57 2.97 8.00 19.00 1.00     9486     8596
    + y_rep[2]  12.96  13.00 3.52 2.97 7.00 19.00 1.00     9237     9295
    + y_rep[3]  12.82  13.00 3.45 2.97 7.00 19.00 1.00     9279     9628
    + y_rep[4]  12.58  12.00 3.38 2.97 7.00 18.00 1.00    10006     9350
    + y_rep[5]  12.37  12.00 3.40 2.97 7.00 18.00 1.00    10237     9789
    + y_rep[6]  12.38  12.00 3.32 2.97 7.00 18.00 1.00    10232     9704
    + y_rep[7]  12.14  12.00 3.32 2.97 7.00 18.00 1.00    10644     9716
    + y_rep[8]  11.98  12.00 3.26 2.97 7.00 18.00 1.00    10854    10078
    + y_rep[9]  11.74  12.00 3.28 2.97 7.00 17.00 1.00    10425     9918
    + y_rep[10] 11.74  12.00 3.29 2.97 7.00 17.00 1.00    10393     9968
    + y_rep[11] 11.55  11.00 3.28 2.97 6.00 17.00 1.00    10667     9732
    + y_rep[12] 11.56  11.00 3.26 2.97 6.00 17.00 1.00    10284     9875
    + y_rep[13] 11.51  11.00 3.22 2.97 6.00 17.00 1.00    10307     9979
    + y_rep[14] 11.48  11.00 3.30 2.97 6.00 17.00 1.00     9819     9858
    + y_rep[15] 11.59  11.00 3.31 2.97 6.00 17.00 1.00    10749     9586
    + y_rep[16] 11.34  11.00 3.26 2.97 6.00 17.00 1.00     9718     9655
    + y_rep[17] 11.19  11.00 3.24 2.97 6.00 17.00 1.00    10399     9736
    + y_rep[18] 10.93  11.00 3.32 2.97 6.00 17.00 1.00     9773     9658
    +
    + # showing 18 of 19 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

    The variable y_rep is ordered because it’s based on the actual items, which were sorted by success count in the original data. On the other hand, the y_pop_rep values are fully exchangeable, so indistinguishable in their posterior (other than MCMC error). Because the population replicates are exchangeable, we only printed the first for each model.
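We can check the exchangeability numerically with a quick sketch (assuming the CmdStanR fit fit_hier is in scope): the per-item posterior means of y_pop_rep should agree up to MCMC error.

# sketch: columns of y_pop_rep are exchangeable, so their posterior
# means should agree up to MCMC error
y_pop_rep_draws <- posterior::as_draws_matrix(fit_hier$draws("y_pop_rep"));
round(colMeans(y_pop_rep_draws), 2);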

We can plot some of the simulated data sets along with the original data set to do a visual inspection as suggested by Gelman et al. (2013). We convert the posterior draws of y_rep into a matrix with one row per draw, then take the first fifteen draws for display.

    df_post <- data.frame(list(dataset = rep("REAL", N),
                                y = y));
+y_rep_hier_logit <- matrix(fit_hier_logit$draws('y_rep'), M, N);  # M posterior draws by N items
    +
     for (n in 1:15) {
       df_post <- rbind(df_post,
                        data.frame(list(dataset = rep(paste("repl ", n), N),
    -                                   y = ss_hier_logit$y_rep[n,])));
    +                                   y = y_rep_hier_logit[n,])));
     }
post_plot <-
   ggplot(df_post, aes(y)) +
   facet_wrap(~ dataset) +
   stat_count(width=0.8) +
   scale_x_discrete(limits=c(5, 10, 15, 20)) +
   scale_y_discrete(name="count", limits=c(0, 2, 4)) +
   ggtitle("Existing Item Replication (Normal Prior on Log Odds)");
post_plot;

    And now we can do the same thing for the population-level replication; because the code’s the same, we do not echo it to the output.

    df_pop_post <- data.frame(list(dataset = rep("REAL", N),
    +                               y = y));
    +
    +y_pop_rep_hier_logit <- matrix(fit_hier_logit$draws('y_pop_rep'), M, N);
    +
    +for (n in 1:15) {
    +  df_pop_post <- rbind(df_pop_post,
    +                       data.frame(list(dataset = rep(paste("repl ", n), N),
    +                                        y = y_pop_rep_hier_logit[n,])));
    +}
    +post_pop_plot <-
    +  ggplot(df_pop_post, aes(y)) +
    +  facet_wrap(~ dataset) +
    +  stat_count(width=0.8) +
    +  scale_x_discrete(limits=c(5, 10, 15, 20)) +
    +  scale_y_discrete(name="count", limits=c(0, 2, 4)) +
    +  ggtitle("New Item Replication (Normal Prior on Log Odds)");
    +post_pop_plot;
    +

The posterior simulations are not unreasonable for a binomial likelihood, but they are noticeably more spread out than the actual data. This may have more to do with how the data were selected out of all the major league baseball players than with the actual data distribution. Efron and Morris (1975, p. 312) write, “This sample was chosen because we wanted between 30 and 50 at bats to assure a satisfactory approximation of the binomial by the normal distribution while leaving the bulk of at bats to be estimated. We also wanted to include an unusually good hitter (Clemente) to test the method with at least one extreme parameter, a situation expected to be less favorable to Stein’s estimator. Stein’s estimator requires equal variances, or in this situation, equal at bats, so the remaining 17 players are all whom either the April 26 or May 3 New York Times reported with 45 at bats.”

    Here are the same plots for the basic hierarchical model. This time we suppress printing the ggplot code, which is the same as before, and just show the graphs.


The plots for replications of existing items are radically different with the beta prior: the replications are much closer to the originally observed values. The replications for new items look similar. By eye, it looks like the beta prior is controlling the posterior variance of the individual item chance-of-success estimates much better, despite producing wider posterior 80% intervals for chance of success.
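That visual impression can be checked numerically with a rough sketch (assuming both CmdStanR fits are in scope): compare the average posterior standard deviation of the per-item chance-of-success parameters under the two priors.

# sketch: average posterior sd of theta under each hierarchical prior
theta_hier <- posterior::as_draws_matrix(fit_hier$draws("theta"));
theta_logit <- posterior::as_draws_matrix(fit_hier_logit$draws("theta"));
c(beta_prior = mean(apply(theta_hier, 2, sd)),
  logit_prior = mean(apply(theta_logit, 2, sd)));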

    Discussion


A hierarchical model introduces an estimation bias toward the population mean: the stronger the bias, the less variance there is in the estimates for the items. Exactly how much bias and variance is warranted can be estimated by further calibrating the model and testing where its predictions do not bear out. In this case, both models yield similar estimates, with the log-odds model trading a bit more bias for variance in its estimates (the models did not have matched hyperpriors; see the exercises). With very little data, there is very little we can do to gain sharp inferences other than provide more informative priors, which is well worth doing when prior information is available. On the other hand, with more data, the models provide similar results (see the exercises), and in the limit, all of the models (other than complete pooling) converge to posteriors that are delta functions around the empirical chance of success (i.e., the maximum likelihood estimate). Meanwhile, Bayesian inference allows us to make more accurate predictions with the data available before we hit that asymptotic regime.
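To make the shrinkage concrete, here is a sketch (assuming fit_hier and the data vectors y and K are in scope) comparing the no-pooling maximum likelihood estimates to the partially pooled posterior means, which are pulled toward the population mean.

# sketch: shrinkage of partially pooled estimates toward the population mean
mle <- y / K;  # per-item maximum likelihood estimates (no pooling)
theta_hat <- colMeans(posterior::as_draws_matrix(fit_hier$draws("theta")));
round(cbind(mle, partial_pool = theta_hat), 3);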

    Exercises


  • How sensitive is the hierarchical model to the priors on the hyperparameters \(\kappa, \phi\)? Consider a weakly informative prior on \(\phi\) and alternative distributional families for \(\kappa\) (e.g., exponential).

  • Write a Stan model to predict \(z_n\) based on the no pooling and complete pooling models and compare those to the predictions based on the hierarchical model. Which is better calibrated in the sense of having roughly the right number of actual values fall within the prediction intervals? Then, compare the prediction made from a maximum likelihood point estimate of performance using a binomial predictive likelihood.

  • Why do the plots normalize properly (sum to 1) in both the Rankings and Who’s the Best Player plots?

• If a model is well calibrated, what is the expected distribution of observed items in the 50% posterior predictive interval? How would this support a hypothesis test for model calibration? Could such a test be formulated in terms of Bayesian posterior \(p\)-values?

• If the inverse CDFs of the marginal posteriors were available (or approximated via sorting MCMC draws), what is the expected distribution of the observed points in a well-calibrated model? How is this like testing all of the intervals? What problems might arise? Hint: see (Gneiting et al. 2007).

• Lunn et al. (2013) contains a running analysis of pediatric cardiac surgery mortality for 12 hospitals. Reanalyze their data with Stan using the models in this note. The models will have to be generalized to allow \(K\) to vary by item \(n\). The data is available from Stan’s example model repository as BUGS Vol 1 examples: Surgical and the WinBUGS project hosts the original example description (which has different counts from the data analyzed by Lunn et al.); Lunn et al. analyze slightly different data in the BUGS book than in their on-line example.

• How sensitive is the logistic model to the hyperprior? If we made it much less informative or much more informative, how much do the fitted values differ? How do the predictions differ? Evaluate the prior used by Lunn et al. (2013), which is \(\mathsf{Uniform}(-100,100)\) for \(\mu\) and \(\mathsf{Uniform}(0, 100)\) for \(\sigma\). Be sure to add constraints on the declarations of \(\mu\) and \(\sigma\) to match the constraints. What happens if the width of the uniform grows much wider or much narrower?

• For the no pooling and complete pooling models (fixed priors), derive the analytic posterior and code it using a random-number generator in the generated quantities block. (A conjugate-posterior starting point for complete pooling is sketched after this list.)

  • Show that the posterior predictive distribution \(p(\tilde{y} \, | \, y)\) is a properly normalized density.

  • For the partial pooling models (hierarchical priors), marginalize out the chance-of-success parameters (\(\theta\) or \(\alpha\)) and sample the higher-level parameters for the prior directly.

• Generate the rankings plots for the other three models; compare and contrast them to the basic partial pooling model.

• What are some other test statistics that might be used to evaluate our model fit to data using Bayesian \(p\)-values? Code some up and evaluate them on one of the data sets given here or on simulated data.

• Discuss why the lack of an alternative hypothesis makes it impossible to perform power calculations with Bayesian \(p\)-values. What are the implications for hypothesis testing for model fit?

• Show why posterior \(p\)-values are expected to have a \(\mathsf{Uniform}(0,1)\) distribution if a model is perfectly calibrated. Can you perform simulations to show that this is true in a particular case? (A simulation sketch appears after this list.)

  • Figure out how to take all the hard-coded output analysis in here and make it dynamic by making better use of knitr. And take all the cut-and-paste duplicated code and wrap it into functions.

  • Discuss the difference between batting average and on-base percentage as random variables. Consider particularly the denominator (at-bat versus plate appearance). Is the denominator in these kinds of problems always a random variable itself? Why might this be important in inference?
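For the analytic-posterior exercise, here is a minimal starting point for the complete pooling model (a sketch assuming the data vectors y and K are in scope): with the implicit uniform prior on \(\phi\), the binomial likelihood is conjugate, so the posterior can be sampled directly without MCMC.

# sketch: a uniform prior on phi is conjugate to the binomial, so the
# posterior is Beta(1 + sum(y), 1 + sum(K) - sum(y))
phi_draws <- rbeta(10000, 1 + sum(y), 1 + sum(K) - sum(y));
quantile(phi_draws, probs = c(0.1, 0.5, 0.9));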
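For the calibration exercise, one simulation-based sketch uses the randomized probability integral transform for discrete outcomes: if the data really are generated from the model, the randomized PIT values are exactly uniform. The constants below (prior shape, number of trials, seed) are illustrative choices, not values from the case study.

# sketch: randomized PIT for binomial data simulated from the model;
# u should look Uniform(0,1) when the model is correctly specified
set.seed(1234);
S <- 1e5; a <- 2; b <- 5; K_sim <- 45;            # illustrative constants
theta_sim <- rbeta(S, a, b);                      # abilities from the prior
y_sim <- rbinom(S, K_sim, theta_sim);             # data given each ability
u <- pbinom(y_sim - 1, K_sim, theta_sim) +
     runif(S) * dbinom(y_sim, K_sim, theta_sim);  # randomized PIT
hist(u, breaks = 50);                             # approximately uniform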


    Acknowledgements

This presentation derives directly from four sources: Bayesian Data Analysis (Gelman et al. 2013), BUGS 0.5 Examples (Spiegelhalter et al. 1996), The BUGS Book (Lunn et al. 2013), and Daniel Lee’s hands-on Stan tutorial for a short course we taught (with Andrew Gelman) in 2015.

    I’d like to thank Jiqiang Guo, Aki Vehtari, Michael Betancourt, and Jonah Gabry for comments on everything from the math to the R code. Andrew Gelman objected to my using batting-average data as being not very useful for baseball (compared to, say, on-base percentage); I hope the exercise addressing his concerns and the additional data sets partially make up for my choice.

    References

    • Betancourt, M. and Girolami, M. (2015) Hamiltonian Monte Carlo for hierarchical models. Current Trends in Bayesian Methodology with Applications 79.

• Efron, B. and Morris, C. (1975) Data analysis using Stein’s estimator and its generalizations. Journal of the American Statistical Association 70(350), 311–319. [ pdf ]

    • Gelman, A., Carlin, J. B., Stern, H. S., Dunson, D. B., Vehtari, A., and Rubin, D. B. (2013) Bayesian Data Analysis, 3rd Edition. Chapman & Hall/CRC Press, London.

    • Gelman, A. and Hill, J. (2007) Data Analysis Using Regression and Multilevel-Hierarchical Models. Cambridge University Press, Cambridge, United Kingdom.

• Gelman, A., Hill, J., and Yajima, M. (2012) Why we (usually) don’t have to worry about multiple comparisons. Journal of Research on Educational Effectiveness 5, 189–211. [ pdf ]

    • Gneiting, T., Balabdaoui, F., and Raftery, A. E. (2007) Probabilistic forecasts, calibration and sharpness. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 69(2), 243–268.

    • Lunn, D., Jackson, C., Best, N., Thomas, A., and Spiegelhalter, D. (2013) The BUGS Book: A Practical Introduction to Bayesian Analysis. Chapman & Hall/CRC Press.

    • Neal, R. M. (2003) Slice sampling. Annals of Statistics 31(3):705–767.


      Data Sets Included

      Rat tumors (N = 71)

Tarone (1982) provides a data set of tumor incidence in historical control groups of rats; specifically, endometrial stromal polyps in female lab rats of type F344. The data set is taken from the book site for (Gelman et al. 2013):

      Surgical mortality (N = 12)

Spiegelhalter et al. (1996) provide a data set of mortality rates in 12 hospitals performing cardiac surgery in babies. We just manually entered the data from the paper; it is also available in the Stan example models repository in R format.

      • Data: surgical-mortality.csv
      • R script: surgical-mortality.R

        Baseball hits 1996 AL (N = 308)

        Appendix: Full Stan Programs

        pool.stan

        -
        data { 
        -  int<lower=0> N;           // items 
        -  int<lower=0> K[N];        // initial trials 
        -  int<lower=0> y[N];        // initial successes 
        - 
        -  int<lower=0> K_new[N];    // new trials 
        -  int<lower=0> y_new[N];    // new successes 
        -} 
        -transformed data { 
        -  real min_y;   // minimum successes 
        -  real max_y;   // maximum successes 
        -  real mean_y;  // sample mean successes 
        -  real sd_y;    // sample std dev successes 
        - 
        -  min_y <- min(y); 
        -  max_y <- max(y); 
        -  mean_y <- mean(to_vector(y)); 
        -  sd_y <- sd(to_vector(y)); 
        -} 
        +
        #include data-blocks.stan 
         parameters { 
        -  real<lower=0, upper=1> phi;  // chance of success (pooled) 
        +  real<lower=0, upper=1> phi; // chance of success (pooled) 
         } 
         model { 
        -  y ~ binomial(K, phi);  // likelihood 
        +  y ~ binomial(K, phi); // likelihood 
         } 
         generated quantities { 
        -  vector<lower=0, upper=1>[N] theta;  // chance-of-success 
        - 
        -  real log_p_new;      // posterior predictive log density remaining trials 
        - 
        -  int<lower=0> z[N];  // posterior prediction remaining trials 
        - 
        -  int<lower=0, upper=1> some_ability_gt_350;  // Pr[some theta > 0.35] 
        -  int<lower=0, upper=1> avg_gt_400[N];        // Pr[season avg of n] >= 0.400 
        -  int<lower=0, upper=1> ability_gt_400[N];    // Pr[chance-of-success of n] >= 0.400 
        - 
        -  int<lower=0> y_rep[N];      // replications for existing items 
        - 
        -  real<lower=0> min_y_rep;   // posterior predictive min replicated successes 
        -  real<lower=0> max_y_rep;   // posterior predictive max replicated successes 
        -  real<lower=0> mean_y_rep;  // posterior predictive sample mean replicated successes 
        -  real<lower=0> sd_y_rep;    // posterior predictive sample std dev replicated successes 
        - 
        -  int<lower=0, upper=1> p_min;  // posterior predictive p-values 
        -  int<lower=0, upper=1> p_max; 
        -  int<lower=0, upper=1> p_mean; 
        -  int<lower=0, upper=1> p_sd; 
        - 
        -  theta <- rep_vector(phi, N); 
        - 
        -  log_p_new <- 0; 
        -  for (n in 1:N) 
        -    log_p_new <- log_p_new + binomial_log(y_new[n], K_new[n], theta[n]); 
        - 
        -  for (n in 1:N) 
        -    z[n] <- binomial_rng(K_new[n], theta[n]); 
        - 
        -  some_ability_gt_350 <- (max(theta) > 0.35); 
        -  for (n in 1:N) 
        -    avg_gt_400[n] <- (((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400); 
        -  for (n in 1:N) 
        -    ability_gt_400[n] <- (theta[n] > 0.400); 
        +  // define theta, per-player chance-of-success 
        +  vector<lower=0, upper=1>[N] theta = rep_vector(phi, N); 
          
        -  for (n in 1:N) 
        -    y_rep[n] <- binomial_rng(K[n], theta[n]); 
        - 
        -  min_y_rep <- min(y_rep); 
        -  max_y_rep <- max(y_rep); 
        -  mean_y_rep <- mean(to_vector(y_rep)); 
        -  sd_y_rep <- sd(to_vector(y_rep)); 
        - 
        -  p_min <- (min_y_rep >= min_y); 
        -  p_max <- (max_y_rep >= max_y); 
        -  p_mean <- (mean_y_rep >= mean_y); 
        -  p_sd <- (sd_y_rep >= sd_y); 
        +  #include gq-postpred.stan 
         } 

        no-pool.stan

        -
        data { 
        -  int<lower=0> N;          // items 
        -  int<lower=0> K[N];       // initial trials 
        -  int<lower=0> y[N];       // initial successes 
        - 
        -  int<lower=0> K_new[N];   // new trials 
        -  int<lower=0> y_new[N];   // new successes 
        -} 
        -transformed data { 
        -  real min_y;   // minimum successes 
        -  real max_y;   // maximum successes 
        -  real mean_y;  // sample mean successes 
        -  real sd_y;    // sample std dev successes 
        - 
        -  min_y <- min(y); 
        -  max_y <- max(y); 
        -  mean_y <- mean(to_vector(y)); 
        -  sd_y <- sd(to_vector(y)); 
        -} 
        +
        #include data-blocks.stan 
         parameters { 
           vector<lower=0, upper=1>[N] theta; // chance of success 
         } 
         model { 
        -  y ~ binomial(K, theta);  // likelihood 
        +  y ~ binomial(K, theta); // likelihood 
         } 
         generated quantities { 
        -  real log_p_new;     // posterior predictive log density remaining trials 
        - 
        -  int<lower=0> z[N];  // posterior prediction remaining trials 
        - 
        -  int<lower=0, upper=1> some_ability_gt_350;  // Pr[some theta > 0.35] 
        -  int<lower=0, upper=1> avg_gt_400[N];        // Pr[season avg of n] >= 0.400 
        -  int<lower=0, upper=1> ability_gt_400[N];    // Pr[chance-of-success of n] >= 0.400 
        - 
        -  int<lower=1, upper=N> rnk[N];      // rank of player n 
        -  int<lower=0, upper=1> is_best[N];  // Pr[player n highest chance of success] 
        - 
        -  int<lower=0> y_rep[N];   // replications for existing items 
        - 
        -  real min_y_rep;   // posterior predictive min replicated successes 
        -  real max_y_rep;   // posterior predictive max replicated successes 
        -  real mean_y_rep;  // posterior predictive sample mean replicated successes 
        -  real sd_y_rep;    // posterior predictive sample std dev replicated successes 
        - 
        -  int p_min;    // posterior p-val for min test stat 
        -  int p_max;    // posterior p-val for max test stat 
        -  int p_mean;   // posterior p-val for sample mean test stat 
        -  int p_sd;     // posterior p-val for smaple std dev test stat 
        - 
        -  log_p_new <- 0; 
        -  for (n in 1:N) 
        -    log_p_new <- log_p_new + binomial_log(y_new[n], K_new[n], theta[n]); 
        - 
        -  for (n in 1:N) 
        -    z[n] <- binomial_rng(K_new[n], theta[n]); 
        - 
        -  some_ability_gt_350 <- (max(theta) > 0.35); 
        -  for (n in 1:N) 
        -    avg_gt_400[n] <- (((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400); 
        -  for (n in 1:N) 
        -    ability_gt_400[n] <- (theta[n] > 0.400); 
        - 
        -  { 
        -    int dsc[N]; 
        -    dsc <- sort_indices_desc(theta); 
        -    for (n in 1:N) 
        -      rnk[dsc[n]] <- n; 
        -  } 
        -  for (n in 1:N) 
        -    is_best[n] <- (rnk[n] == 1); 
        - 
        -  for (n in 1:N) 
        -    y_rep[n] <- binomial_rng(K[n], theta[n]); 
        - 
        -  min_y_rep <- min(y_rep); 
        -  max_y_rep <- max(y_rep); 
        -  mean_y_rep <- mean(to_vector(y_rep)); 
        -  sd_y_rep <- sd(to_vector(y_rep)); 
        - 
        -  p_min <- (min_y_rep >= min_y); 
        -  p_max <- (max_y_rep >= max_y); 
        -  p_mean <- (mean_y_rep >= mean_y); 
        -  p_sd <- (sd_y_rep >= sd_y); 
        +  #include gq-postpred.stan 
        +  #include gq-ranking.stan 
         } 

        hier.stan

        -
        data { 
        -  int<lower=0> N;              // items 
        -  int<lower=0> K[N];           // initial trials 
        -  int<lower=0> y[N];           // initial successes 
        - 
        -  int<lower=0> K_new[N];       // new trials 
        -  int<lower=0> y_new[N];       // new successes 
        -} 
        -transformed data { 
        -  real min_y;   // minimum successes 
        -  real max_y;   // maximum successes 
        -  real mean_y;  // sample mean successes 
        -  real sd_y;    // sample std dev successes 
        - 
        -  min_y <- min(y); 
        -  max_y <- max(y); 
        -  mean_y <- mean(to_vector(y)); 
        -  sd_y <- sd(to_vector(y)); 
        -} 
        +
        #include data-blocks.stan 
         parameters { 
        -  real<lower=0, upper=1> phi;         // population chance of success 
        -  real<lower=1> kappa;                // population concentration 
        -  vector<lower=0, upper=1>[N] theta;  // chance of success  
        +  real<lower=0, upper=1> phi; // population chance of success 
        +  real<lower=1> kappa; // population concentration 
        +  vector<lower=0, upper=1>[N] theta; // chance of success  
         } 
         model { 
        -  kappa ~ pareto(1, 1.5);                        // hyperprior 
        -  theta ~ beta(phi * kappa, (1 - phi) * kappa);  // prior 
        -  y ~ binomial(K, theta);                        // likelihood 
        +  kappa ~ pareto(1, 1.5); // hyperprior 
        +  theta ~ beta(phi * kappa, (1 - phi) * kappa); // prior 
        +  y ~ binomial(K, theta); // likelihood 
         } 
         generated quantities { 
        -  real log_p_new;     // posterior predictive log density remaining trials 
        - 
        -  int<lower=0> z[N];  // posterior prediction remaining trials 
        - 
        -  int<lower=0, upper=1> some_ability_gt_350;  // Pr[some theta > 0.35] 
        -  int<lower=0, upper=1> avg_gt_400[N];        // Pr[season avg of n] >= 0.400 
        -  int<lower=0, upper=1> ability_gt_400[N];    // Pr[chance-of-success of n] >= 0.400 
        - 
        -  int<lower=1, upper=N> rnk[N];      // rank of player n 
        -  int<lower=0, upper=1> is_best[N];  // Pr[player n highest chance of success] 
        - 
        -  int<lower=0> y_rep[N];      // replications for existing items 
        -  int<lower=0> y_pop_rep[N];  // replications for simulated items 
        - 
        -  real min_y_rep;   // posterior predictive min replicated successes 
        -  real max_y_rep;   // posterior predictive max replicated successes 
        -  real mean_y_rep;  // posterior predictive sample mean replicated successes 
        -  real sd_y_rep;    // posterior predictive sample std dev replicated successes 
        - 
        -  int p_min;                                     // posterior predictive p-values 
        -  int p_max; 
        -  int p_mean; 
        -  int p_sd; 
        +  #include gq-postpred.stan 
        +  #include gq-ranking.stan 
          
        -  log_p_new <- 0; 
        -  for (n in 1:N) 
        -    log_p_new <- log_p_new + binomial_log(y_new[n], K_new[n], theta[n]); 
        -   
        -  for (n in 1:N) 
        -    z[n] <- binomial_rng(K_new[n], theta[n]); 
        - 
        -  some_ability_gt_350 <- (max(theta) > 0.35); 
        -  for (n in 1:N) 
        -    avg_gt_400[n] <- (((y[n] + z[n]) / (0.0 + K [n] + K_new[n])) > 0.400); 
        -  for (n in 1:N) 
        -    ability_gt_400[n] <- (theta[n] > 0.400); 
        - 
        -  { 
        -    int dsc[N]; 
        -    dsc <- sort_indices_desc(theta); 
        -    for (n in 1:N) 
        -      rnk[dsc[n]] <- n; 
        +  // replications for simulated items   
        +  array[N] int<lower=0> y_pop_rep;  
        +  for (n in 1 : N) { 
        +    y_pop_rep[n] = binomial_rng(K[n], 
        +                                beta_rng(phi * kappa, (1 - phi) * kappa)); 
           } 
        -  for (n in 1:N) 
        -    is_best[n] <- (rnk[n] == 1); 
        - 
        -  for (n in 1:N) 
        -    y_rep[n] <- binomial_rng(K[n], theta[n]); 
        -  for (n in 1:N) 
        -    y_pop_rep[n] <- binomial_rng(K[n],  
        -                                 beta_rng(phi * kappa, 
        -                                          (1 - phi) * kappa)); 
        - 
        -  min_y_rep <- min(y_rep); 
        -  max_y_rep <- max(y_rep); 
        -  mean_y_rep <- mean(to_vector(y_rep)); 
        -  sd_y_rep <- sd(to_vector(y_rep)); 
        - 
        -  p_min <- (min_y_rep >= min_y); 
        -  p_max <- (max_y_rep >= max_y); 
        -  p_mean <- (mean_y_rep >= mean_y); 
        -  p_sd <- (sd_y_rep >= sd_y); 
         } 

        hier-logit.stan

        -
        data { 
        -  int<lower=0> N;           // items 
        -  int<lower=0> K[N];        // initial trials 
        -  int<lower=0> y[N];        // initial successes 
        - 
        -  int<lower=0> K_new[N];    // new trials 
        -  int<lower=0> y_new[N];    // new successes 
        -} 
        -transformed data { 
        -  real min_y;   // minimum successes 
        -  real max_y;   // maximum successes 
        -  real mean_y;  // sample mean successes 
        -  real sd_y;    // sample std dev successes 
        - 
        -  min_y <- min(y); 
        -  max_y <- max(y); 
        -  mean_y <- mean(to_vector(y)); 
        -  sd_y <- sd(to_vector(y)); 
        -} 
        +
        #include data-blocks.stan 
         parameters { 
        -  real mu;                       // population mean of success log-odds 
        -  real<lower=0> sigma;           // population sd of success log-odds 
        -  vector[N] alpha_std;           // success log-odds (standardized) 
        +  real mu; // population mean of success log-odds 
        +  real<lower=0> sigma; // population sd of success log-odds 
        +  vector<offset=mu, multiplier=sigma>[N] alpha_std; // success log-odds (standardized) 
         } 
         model { 
        -  mu ~ normal(-1, 1);                             // hyperprior 
        -  sigma ~ normal(0, 1);                           // hyperprior 
        -  alpha_std ~ normal(0, 1);                       // prior (hierarchical) 
        -  y ~ binomial_logit(K, mu + sigma * alpha_std);  // likelihood 
        +  mu ~ normal(-1, 1); // hyperprior 
        +  sigma ~ normal(0, 1); // hyperprior 
        +  alpha_std ~ normal(mu, sigma); // prior (hierarchical) 
        +  y ~ binomial_logit(K, alpha_std); // likelihood 
         } 
         generated quantities { 
        -  vector[N] theta;    // chance of success 
        - 
        -  real log_p_new;     // posterior predictive log density remaining trials 
        - 
        -  int<lower=0> z[N];  // posterior prediction remaining trials 
        - 
        -  int<lower=0, upper=1> some_ability_gt_350;  // Pr[some theta > 0.35] 
        -  int<lower=0, upper=1> avg_gt_400[N];        // Pr[season avg of n] >= 0.400 
        -  int<lower=0, upper=1> ability_gt_400[N];    // Pr[chance-of-success of n] >= 0.400 
        - 
        -  int<lower=1, upper=N> rnk[N];      // rank of player n 
        -  int<lower=0, upper=1> is_best[N];  // Pr[player n highest chance of success] 
        +  vector[N] theta = inv_logit(alpha_std); 
        +  #include gq-postpred.stan 
        +  #include gq-ranking.stan 
          
        -  int<lower=0> y_rep[N];      // replications for existing items 
        -  int<lower=0> y_pop_rep[N];  // replications for simulated items 
        +  array[N] int<lower=0> y_pop_rep; // replications for simulated items 
        +  for (n in 1 : N) { 
        +    y_pop_rep[n] = binomial_rng(K[n], inv_logit(normal_rng(mu, sigma))); 
        +  } 
        +} 

        data-blocks.stan

        +
        /** observed outcome y for N items in K, K_new binary trials **/ 
        +data { 
        +  int<lower=0> N; // items 
        +  array[N] int<lower=0> K; // initial trials 
        +  array[N] int<lower=0> y; // initial successes 
        +   
        +  array[N] int<lower=0> K_new; // new trials 
        +  array[N] int<lower=0> y_new; // new successes 
        +} 
        +transformed data { 
        +  // summary stats for observed data 
        +  real min_y = min(y); 
        +  real max_y = max(y); 
        +  real mean_y = mean(to_vector(y)); 
        +  real sd_y = sd(to_vector(y)); 
        +} 

gq-postpred.stan

        +
        /** include in generated quantities block of model with parameter 
        +    vector<lower=0, upper=1>[N] theta; // chance of success  
        +    and data: N, K, y, K_new, y_new, min_y, max_y, mean_y, sd_y 
        +*/ 
          
        -  real min_y_rep;   // posterior predictive min replicated successes 
        -  real max_y_rep;   // posterior predictive max replicated successes 
        -  real mean_y_rep;  // posterior predictive sample mean replicated successes 
        -  real sd_y_rep;    // posterior predictive sample std dev replicated successes 
        +  // posterior predictive log density remaining trials 
        +  real log_p_new = 0; 
        +  for (n in 1 : N) { 
        +    log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]); 
        +  } 
          
        -  int p_min;                                     // posterior predictive p-values 
        -  int p_max; 
        -  int p_mean; 
        -  int p_sd; 
        +  // posterior predictions on new data  
        +  array[N] int<lower=0> z = binomial_rng(K_new, theta);  
          
        -  for (n in 1:N) 
        -    theta[n] <- inv_logit(mu + sigma * alpha_std[n]); 
        +  // Pr[some theta > 0.35]   
        +  int<lower=0, upper=1> some_ability_gt_350 = max(theta) > 0.35; 
+  // Pr[ability of player n > 0.400] 
        +  array[N] int<lower=0, upper=1> ability_gt_400; 
        +  for (n in 1 : N) { 
        +    ability_gt_400[n] = theta[n] > 0.400; 
        +  } 
+  // Pr[season average of player n > 0.400] 
        +  array[N] int<lower=0, upper=1> avg_gt_400; 
        +  for (n in 1 : N) { 
+    // y[n]: observed hits; z[n]: simulated hits in rest of season 
        +    avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400; 
        +  } 
          
        -  log_p_new <- 0; 
        -  for (n in 1:N) 
        -    log_p_new <- log_p_new + binomial_log(y_new[n], K_new[n], theta[n]); 
        +  // replications for existing items   
        +  array[N] int<lower=0> y_rep = binomial_rng(K, theta); 
        +   
        +  // posterior predictive min replicated successes, test stat p_val 
        +  real<lower=0> min_y_rep = min(y_rep); 
        +  int<lower=0, upper=1> p_min = min_y_rep >= min_y; 
          
        -  for (n in 1:N) 
        -    z[n] <- binomial_rng(K_new[n], theta[n]); 
        +  // posterior predictive max replicated successes, test stat p_val 
        +  real<lower=0> max_y_rep = max(y_rep);  
        +  int<lower=0, upper=1> p_max = max_y_rep >= max_y; 
          
        -  some_ability_gt_350 <- (max(theta) > 0.35); 
        -  for (n in 1:N) 
        -    avg_gt_400[n] <- (((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400); 
        -  for (n in 1:N) 
        -    ability_gt_400[n] <- (theta[n] > 0.400); 
        +  // posterior predictive sample mean replicated successes, test stat p_val 
        +  real<lower=0> mean_y_rep = mean(to_vector(y_rep)); 
        +  int<lower=0, upper=1> p_mean = mean_y_rep >= mean_y; 
          
        +  // posterior predictive sample std dev replicated successes, test stat p_val 
        +  real<lower=0> sd_y_rep = sd(to_vector(y_rep)); 
        +  int<lower=0, upper=1> p_sd = sd_y_rep >= sd_y; 

gq-ranking.stan

        /** include in generated quantities block of model with parameter 
        +    vector<lower=0, upper=1>[N] theta; // chance of success 
        +    and data N (number of observations) 
        +*/ 
        +  array[N] int<lower=1, upper=N> rnk; // rank of player n 
           { 
        -    int dsc[N]; 
        -    dsc <- sort_indices_desc(theta); 
        -    for (n in 1:N) 
        -      rnk[dsc[n]] <- n; 
        +    array[N] int dsc; 
        +    dsc = sort_indices_desc(theta); 
        +    for (n in 1 : N) { 
        +      rnk[dsc[n]] = n; 
        +    } 
           } 
        -  for (n in 1:N) 
        -    is_best[n] <- (rnk[n] == 1); 
        - 
        -  for (n in 1:N) 
        -    y_rep[n] <- binomial_rng(K[n], theta[n]); 
        -  for (n in 1:N) 
        -    y_pop_rep[n] <- binomial_rng(K[n], inv_logit(normal_rng(mu, sigma))); 
        - 
        -  min_y_rep <- min(y_rep); 
        -  max_y_rep <- max(y_rep); 
        -  mean_y_rep <- mean(to_vector(y_rep)); 
        -  sd_y_rep <- sd(to_vector(y_rep)); 
        - 
        -  p_min <- (min_y_rep >= min_y); 
        -  p_max <- (max_y_rep >= max_y); 
        -  p_mean <- (mean_y_rep >= mean_y); 
        -  p_sd <- (sd_y_rep >= sd_y); 
        -} 
+  array[N] int<lower=0, upper=1> is_best; // Pr[player n highest chance of success] 
+  for (n in 1 : N) { 
+    is_best[n] = rnk[n] == 1; 
+  }
diff --git a/knitr/pool-binary-trials/pool.stan b/knitr/pool-binary-trials/pool.stan
index bea0cb1d..651be1e9 100644
--- a/knitr/pool-binary-trials/pool.stan
+++ b/knitr/pool-binary-trials/pool.stan
@@ -1,22 +1,4 @@
-data {
-  int N; // items
-  array[N] int K; // initial trials
-  array[N] int y; // initial successes
-
-  array[N] int K_new; // new trials
-  array[N] int y_new; // new successes
-}
-transformed data {
-  real min_y; // minimum successes
-  real max_y; // maximum successes
-  real mean_y; // sample mean successes
-  real sd_y; // sample std dev successes
-
-  min_y = min(y);
-  max_y = max(y);
-  mean_y = mean(to_vector(y));
-  sd_y = sd(to_vector(y));
-}
+#include data-blocks.stan
 parameters {
   real phi; // chance of success (pooled)
 }
@@ -24,58 +6,8 @@ model {
   y ~ binomial(K, phi); // likelihood
 }
 generated quantities {
-  vector[N] theta; // chance-of-success
-
-  real log_p_new; // posterior predictive log density remaining trials
-
-  array[N] int z; // posterior prediction remaining trials
-
-  int some_ability_gt_350; // Pr[some theta > 0.35]
-  array[N] int avg_gt_400; // Pr[season avg of n] >= 0.400
-  array[N] int ability_gt_400; // Pr[chance-of-success of n] >= 0.400
-
-  array[N] int y_rep; // replications for existing items
-
-  real min_y_rep; // posterior predictive min replicated successes
-  real max_y_rep; // posterior predictive max replicated successes
-  real mean_y_rep; // posterior predictive sample mean replicated successes
-  real sd_y_rep; // posterior predictive sample std dev replicated successes
-
-  int p_min; // posterior predictive p-values
-  int p_max;
-  int p_mean;
-  int p_sd;
-
-  theta = rep_vector(phi, N);
-
-  log_p_new = 0;
-  for (n in 1 : N) {
-    log_p_new = log_p_new + binomial_lpmf(y_new[n] | K_new[n], theta[n]);
-  }
-
-  for (n in 1 : N) {
-    z[n] = binomial_rng(K_new[n], theta[n]);
-  }
-
-  some_ability_gt_350 = max(theta) > 0.35;
-  for (n in 1 : N) {
-    avg_gt_400[n] = ((y[n] + z[n]) / (0.0 + K[n] + K_new[n])) > 0.400;
-  }
-  for (n in 1 : N) {
-    ability_gt_400[n] = theta[n] > 0.400;
-  }
-
-  for (n in 1 : N) {
-    y_rep[n] = binomial_rng(K[n], theta[n]);
-  }
-
-  min_y_rep = min(y_rep);
-  max_y_rep = max(y_rep);
-  mean_y_rep = mean(to_vector(y_rep));
-  sd_y_rep = sd(to_vector(y_rep));
-
-  p_min = min_y_rep >= min_y;
-  p_max = max_y_rep >= max_y;
-  p_mean = mean_y_rep >= mean_y;
-  p_sd = sd_y_rep >= sd_y;
+  // define theta, per-player chance-of-success
+  vector[N] theta = rep_vector(phi, N);
+
+  #include gq-postpred.stan
 }