Bayesian Modeling

Jesse Cambon 30 April, 2020

References:

* http://appliedpredictivemodeling.com/data
* http://faculty.marshall.usc.edu/gareth-james/ISL/data.html

Todo:

* HDI
* Sigma term
* References

Setup

#library(AppliedPredictiveModeling) # datasets
library(ISLR) # datasets
library(skimr)
library(tidyverse)
library(wesanderson)
library(rstanarm)
library(bayestestR)
library(insight)
library(bayesplot)
library(broom)
library(rsample)
library(jcolors)
library(patchwork)
library(ggrepel)
library(knitr)

# Run the MCMC chains in parallel across the available cores
num_cores <- parallel::detectCores()
options(mc.cores = num_cores)

set.seed(42) # for reproducibility

Set input data and formula

Datasets and formulas:

* ISLR::Carseats: Sales ~ Advertising + Price
* ISLR::Credit: Limit ~ Income + Rating
* chickwts: weight ~ feed
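To run one of the other dataset/formula pairs, only the two marked lines in the next chunk need to change. A commented sketch for the Credit data (not run here), using the formula listed above:

### Set input dataset here ################
# split <- initial_split(ISLR::Credit, prop = 0.9)
############################################

### Set model equation here ##########################
# model_formula = as.formula(Limit ~ Income + Rating)
######################################################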

### Set input dataset here ################
split <- initial_split(chickwts, prop = 0.9)
############################################

### Set model equation here ##########################
model_formula = as.formula(weight ~ feed)
######################################################
chickwts %>% group_by(feed) %>%
  summarize(n=n(),
            min=min(weight),
            median=median(weight),
            mean=mean(weight),
            max=max(weight)) %>%
            ungroup() %>%
  kable()
| feed      |  n | min | median |     mean | max |
|:----------|---:|----:|-------:|---------:|----:|
| casein    | 12 | 216 |  342.0 | 323.5833 | 404 |
| horsebean | 10 | 108 |  151.5 | 160.2000 | 227 |
| linseed   | 12 | 141 |  221.0 | 218.7500 | 309 |
| meatmeal  | 11 | 153 |  263.0 | 276.9091 | 380 |
| soybean   | 14 | 158 |  248.0 | 246.4286 | 329 |
| sunflower | 12 | 226 |  328.0 | 328.9167 | 423 |
ggplot(data=chickwts,aes(x=weight,fill=feed)) + 
  facet_wrap( ~ feed) +
  theme_minimal() +
  theme(legend.position='none') +
  geom_density(alpha=0.7)+
  scale_color_jcolors('default') +
  xlab('Weight') + ylab('')

Train/test split

train <- training(split) %>% as_tibble()
test  <- testing(split) %>% as_tibble()

train_small <- train %>% sample_n(30)
train_tiny <- train %>% sample_n(15)

Fit models

# Frequentist fit for comparison (glm with the default gaussian family)
lm_model <- glm(model_formula, data = train)

# Bayesian fits via rstanarm at three training-set sizes
stan_model <- stan_glm(model_formula, data = train)
stan_model_small <- stan_glm(model_formula, data = train_small)
stan_model_tiny <- stan_glm(model_formula, data = train_tiny)
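stan_glm() was run here with rstanarm's default sampler settings. A commented sketch of the same call with those settings spelled out (4 chains of 2,000 iterations, half used as warmup; refresh = 0 only silences the per-chain progress output):

# stan_model <- stan_glm(model_formula, data = train,
#                        chains = 4, iter = 2000, warmup = 1000,
#                        refresh = 0)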

Extract posterior

http://mc-stan.org/rstanarm/reference/as.matrix.stanreg.html

post1 <- as.data.frame(stan_model) %>% as_tibble()
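Each row of post1 is one MCMC draw and each column is a parameter (the coefficients plus sigma). A quick sketch (assuming dplyr >= 1.0 for across()) that checks the draws against the model's reported point estimates, which rstanarm takes to be posterior medians:

# one row per draw, one column per parameter
dim(post1)

# column medians should match the reported coefficient estimates
post1 %>% summarize(across(everything(), median))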

Posterior Intervals

https://mc-stan.org/rstanarm/reference/posterior_interval.stanreg.html https://mc-stan.org/rstanarm/articles/rstanarm.html

rstanarm::posterior_interval(stan_model) %>% as.data.frame() %>%
  rownames_to_column('parameter')
##       parameter         5%         95%
## 1   (Intercept)  290.53537  350.194061
## 2 feedhorsebean -200.23743 -116.511315
## 3   feedlinseed -138.06695  -55.356663
## 4  feedmeatmeal  -91.48396   -4.827427
## 5   feedsoybean -113.34100  -30.701755
## 6 feedsunflower  -30.73630   52.775160
## 7         sigma   50.01772   67.437854
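posterior_interval() returns 90% intervals (5% to 95%) by default; the prob argument changes the width and pars restricts the output to specific parameters. A sketch:

# 95% posterior interval for a single coefficient
posterior_interval(stan_model, prob = 0.95, pars = 'feedhorsebean')

For comparison, the frequentist confidence intervals from the glm fit: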
tidy(lm_model,conf.int=T) %>% select(-std.error)
## # A tibble: 6 x 6
##   term          estimate statistic  p.value conf.low conf.high
##   <chr>            <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)     323.      17.9   2.57e-25    288.     359.  
## 2 feedhorsebean  -163.      -6.39  3.05e- 8   -213.    -113.  
## 3 feedlinseed    -100.      -4.01  1.74e- 4   -149.     -51.2 
## 4 feedmeatmeal    -51.2     -2.01  4.95e- 2   -101.      -1.19
## 5 feedsoybean     -75.6     -3.10  3.02e- 3   -124.     -27.8 
## 6 feedsunflower     8.52     0.342 7.34e- 1    -40.3     57.4
# this could take a bit to run
post_descr <- describe_posterior(stan_model, test = c("p_direction","rope","bayesfactor"))
## Computation of Bayes factors: sampling priors, please wait...

## Loading required namespace: logspline
kable(post_descr)
| Parameter     |     Median | CI |     CI_low |      CI_high |      pd | ROPE_CI |  ROPE_low | ROPE_high | ROPE_Percentage |           BF |     Rhat |  ESS |
|:--------------|-----------:|---:|-----------:|-------------:|--------:|--------:|----------:|----------:|----------------:|-------------:|---------:|-----:|
| (Intercept)   |  320.24350 | 89 |  290.08448 |   347.803012 | 1.00000 |      89 | -8.001383 |  8.001383 |       0.0000000 | 1.862527e+24 | 1.008134 | 1252 |
| feedhorsebean | -158.87134 | 89 | -200.56483 | -120.058178  | 1.00000 |      89 | -8.001383 |  8.001383 |       0.0000000 | 3.911899e+03 | 1.004595 | 1767 |
| feedlinseed   |  -95.51556 | 89 | -135.69007 |  -55.959280  | 1.00000 |      89 | -8.001383 |  8.001383 |       0.0000000 | 4.199747e+01 | 1.006326 | 1665 |
| feedmeatmeal  |  -47.54054 | 89 |  -92.05273 |   -8.744213  | 0.96725 |      89 | -8.001383 |  8.001383 |       0.0000000 | 6.826946e-01 | 1.006196 | 1625 |
| feedsoybean   |  -72.21472 | 89 | -113.19881 |  -33.269377  | 0.99750 |      89 | -8.001383 |  8.001383 |       0.0000000 | 8.051012e+00 | 1.006223 | 1636 |
| feedsunflower |   11.16637 | 89 |  -27.96346 |   52.923724  | 0.67150 |      89 | -8.001383 |  8.001383 |       0.2580736 | 1.391958e-01 | 1.003641 | 1768 |

ROPE (Region of Practical Equivalence)

rope(stan_model)
## # Proportion of samples inside the ROPE [-8.00, 8.00]:
## 
## Parameter     | inside ROPE
## ---------------------------
## (Intercept)   |      0.00 %
## feedhorsebean |      0.00 %
## feedlinseed   |      0.00 %
## feedmeatmeal  |      0.00 %
## feedsoybean   |      0.00 %
## feedsunflower |     25.81 %
rope(stan_model_small)
## # Proportion of samples inside the ROPE [-8.07, 8.07]:
## 
## Parameter     | inside ROPE
## ---------------------------
## (Intercept)   |      0.00 %
## feedhorsebean |      0.00 %
## feedlinseed   |      8.57 %
## feedmeatmeal  |     15.25 %
## feedsoybean   |      0.00 %
## feedsunflower |      9.77 %
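The default ROPE above (roughly ±8) comes from bayestestR's rule of thumb of ±0.1 standard deviations of the outcome. A domain-specific range and a different credible interval can be passed explicitly; a sketch with illustrative values:

# ROPE of +/- 10 grams, summarized over a 95% credible interval
rope(stan_model, range = c(-10, 10), ci = 0.95)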

Markov Chain Diagnostics

mcmc_trace(stan_model)

mcmc_trace(stan_model_small)

#mcmc_trace(stan_model_tiny)
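Beyond eyeballing the trace plots, convergence can be checked numerically. summary() reports Rhat (which should sit near 1) and effective sample sizes, and plot() with plotfun = 'rhat' should give a quick visual check; a sketch:

# numerical diagnostics: Rhat and n_eff per parameter
summary(stan_model)

# visual Rhat check via bayesplot
plot(stan_model, plotfun = 'rhat')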

Highest Density Intervals

hdi(stan_model)
## # Highest Density Interval
## 
## Parameter     |            89% HDI
## ----------------------------------
## (Intercept)   | [ 290.08,  347.80]
## feedhorsebean | [-200.56, -120.06]
## feedlinseed   | [-135.69,  -55.96]
## feedmeatmeal  | [ -92.05,   -8.74]
## feedsoybean   | [-113.20,  -33.27]
## feedsunflower | [ -27.96,   52.92]
stan_model$coefficients
##   (Intercept) feedhorsebean   feedlinseed  feedmeatmeal   feedsoybean 
##     320.24350    -158.87134     -95.51556     -47.54054     -72.21472 
## feedsunflower 
##      11.16637
hdi(stan_model_small)
## # Highest Density Interval
## 
## Parameter     |           89% HDI
## ---------------------------------
## (Intercept)   | [ 269.04, 351.91]
## feedhorsebean | [-201.99, -83.31]
## feedlinseed   | [-127.78,  17.69]
## feedmeatmeal  | [ -89.46,  36.34]
## feedsoybean   | [-116.26, -12.99]
## feedsunflower | [ -13.58, 105.97]
hdi(stan_model_tiny)
## # Highest Density Interval
## 
## Parameter     |           89% HDI
## ---------------------------------
## (Intercept)   | [ 304.94, 426.25]
## feedhorsebean | [-225.23, -53.81]
## feedlinseed   | [-245.32, -32.72]
## feedmeatmeal  | [-211.73, -54.28]
## feedsoybean   | [-154.78, -12.64]
## feedsunflower | [ -76.13,  96.25]
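bayestestR defaults to an 89% HDI. Other credible levels (or several at once) can be requested with the ci argument; a sketch:

# 50%, 89%, and 95% highest density intervals
hdi(stan_model, ci = c(0.5, 0.89, 0.95))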

What percentage of each posterior distribution is greater than a certain cutoff value?

cutoff_value <- -100 # define the cutoff

posterior <- get_parameters(stan_model,iterations=10000) %>% 
  pivot_longer(everything(),names_to='Parameter')

post_pct <- 
  posterior %>% filter(Parameter != '(Intercept)') %>%
  mutate(above=case_when(value > cutoff_value ~ 1, TRUE ~ 0)) %>%
  group_by(Parameter) %>%
  summarize(above_pct=mean(above)) %>%
  ungroup()

post_pct %>% kable()
| Parameter     | above_pct |
|:--------------|----------:|
| feedhorsebean |   0.01075 |
| feedlinseed   |   0.56675 |
| feedmeatmeal  |   0.97650 |
| feedsoybean   |   0.86925 |
| feedsunflower |   1.00000 |
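For the special case of a zero cutoff, bayestestR's p_direction() reports the share of each posterior that has the same sign as its median (the same pd column shown by describe_posterior() above):

p_direction(stan_model)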

Posterior Distributions

# Function that plots posterior areas (mcmc_areas) for the feed coefficients
# and titles the plot with the size of the model's training dataset
mcmc_areas_info <- function(model) {
  # build coefficient names from the model's own factor levels
  predictor_vars <- str_c('feed', unlist(model$xlevels, use.names = F)[-1])
  
  mcmc_areas(model, pars = predictor_vars) + ggtitle(str_c('n = ', as.character(nrow(model$data)))) +
      theme_minimal() +
      theme(plot.title = element_text(hjust = 0.5))
}

mcmc_areas_info(stan_model)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

mcmc_areas_info(stan_model_small)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

mcmc_areas_info(stan_model_tiny)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

#mcmc_intervals(stan_model,pars=predictors) + theme_bw()
#posterior_vs_prior(stan_model)
prior_summary(stan_model)
## Priors for model 'stan_model' 
## ------
## Intercept (after predictors centered)
##   Specified prior:
##     ~ normal(location = 0, scale = 10)
##   Adjusted prior:
##     ~ normal(location = 0, scale = 800)
## 
## Coefficients
##   Specified prior:
##     ~ normal(location = [0,0,0,...], scale = [2.5,2.5,2.5,...])
##   Adjusted prior:
##     ~ normal(location = [0,0,0,...], scale = [200.03,200.03,200.03,...])
## 
## Auxiliary (sigma)
##   Specified prior:
##     ~ exponential(rate = 1)
##   Adjusted prior:
##     ~ exponential(rate = 0.012)
## ------
## See help('prior_summary.stanreg') for more details
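The "adjusted" scales above come from rstanarm's default autoscaling. Priors can also be set explicitly when fitting; a commented sketch with illustrative (not default) values and a hypothetical stan_model_informative object:

# stan_model_informative <- stan_glm(model_formula, data = train,
#                                    prior = normal(0, 50),
#                                    prior_intercept = normal(300, 100),
#                                    prior_aux = exponential(0.02))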

Draw from the prior and posterior distributions

# Function for simulating prior and posterior parameter distributions from a stan model
sim_post_prior <- function(model) {
  # Simulate draws from the prior with the bayestestR package
  prior <- simulate_prior(model) %>%
    pivot_longer(everything(), names_to = 'Parameter')

  # Extract posterior draws with the insight package
  posterior <- get_parameters(model, iterations = 10000) %>%
    pivot_longer(everything(), names_to = 'Parameter')

  # Combine into one dataset
  combined <- prior %>% mutate(Distribution = 'Prior') %>%
    bind_rows(posterior %>% mutate(Distribution = 'Posterior'))
  
  return(combined)
}

prior_posterior <- sim_post_prior(stan_model)
prior_posterior_small <- sim_post_prior(stan_model_small)
prior_posterior_tiny <- sim_post_prior(stan_model_tiny)

Plot our parameter prior and posterior distributions

# Find the x,y coordinates for peak density in a sample
find_peak_density <- function(x_sample) {
  density_x <- density(x_sample)
  # Find coordinates for peak density
  x_max <- density_x$x[which.max(density_x$y)]
  y_max <- max(density_x$y)
  
  return(tibble(x=x_max,y=y_max))
}

# Function for plotting prior and posterior parameter distributions
plot_parameters <- function(distribution_sample, train_data, plot_peaks = FALSE) {
    
  # data to plot - exclude intercept term
  plot_data <- distribution_sample %>% filter(!str_detect(Parameter, 'Intercept'))
    
  # Points for labeling peak density
  # based loosely on: https://stackoverflow.com/questions/56520287/how-to-add-label-to-each-geom-density-line
  density_coordinates <- plot_data %>% 
    group_by(Distribution, Parameter) %>%
    do(find_peak_density(.$value))
    
  base_plot <- ggplot(data=plot_data,
         aes(x=value,fill=Parameter)) +
    facet_wrap(~fct_rev(Distribution),scales='free') +
    theme_minimal() +
    scale_y_continuous(expand =c(0,0,0.15,0)) + # add spacing for labels
    geom_vline(xintercept=0,color='red',size=0.25,linetype='dashed') +
    theme(legend.position='top',
          legend.title=element_blank(),
          plot.title = element_text(hjust = 0.5)) +
    geom_density(alpha=0.4,size=0.05) + ggtitle(str_c('n = ',as.character(nrow(train_data)))) +
    xlab('') + ylab('') + scale_fill_jcolors('pal6') + 
    guides(fill = guide_legend(reverse = TRUE))
  
  if (plot_peaks == TRUE) {
    return(base_plot +
    geom_point(data=density_coordinates, aes(x=x, y=y),show.legend = F) +
    geom_text_repel(data=density_coordinates, aes(label=round(x,2),x=x, y=y),
                     force=1.5,size=4,show.legend = F))
  }
  else {
    return(base_plot)
  }
}

Compare parameter distributions by sample size of training dataset

plot_parameters(prior_posterior,train) 

plot_parameters(prior_posterior_small,train_small) 

plot_parameters(prior_posterior_tiny,train_tiny)

Posterior Predictive Distribution

# Function that adds size of training dataset to pp_check
pp_check_info <- function(model) {
  pp_check(model) + ggtitle(str_c('n = ',as.character(nrow(model$data)))) +
      theme_minimal() +
      theme(plot.title = element_text(hjust = 0.5))
}

pp_check_info(stan_model)

pp_check_info(stan_model_small)

pp_check_info(stan_model_tiny)
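pp_check() defaults to a density overlay of the observed outcome against simulated replications. Other diagnostics are available through the plotfun argument; a sketch:

# compare the observed mean against the distribution of replicated means
pp_check(stan_model, plotfun = 'stat', stat = 'mean')

# a handful of replicated histograms next to the observed data
pp_check(stan_model, plotfun = 'hist', nreps = 5)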

Manually plot the outcome distribution to compare with the posterior predictive check plots above

# Extract variables from formula
all_model_vars <- all.vars(model_formula)
outcome_var <- sym(all_model_vars[1])
predictors <- all_model_vars[-1]

ggplot(aes(x=!!outcome_var),data=train) + geom_density() + theme_minimal()

Make predictions using the posterior distribution

# Draw from the posterior predictive distribution for each row of the test set
post_pred <- posterior_predict(stan_model, newdata = test, draws = 1000) %>%
  as_tibble()
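Each column of post_pred holds the 1,000 predictive draws for one row of the test set. Collapsing the draws to their means gives point predictions that can be scored against the held-out outcomes; a sketch (the error metrics are an addition, not part of the original analysis):

# posterior-mean point prediction for each test observation
pred_mean <- colMeans(as.matrix(post_pred))

# simple held-out error metrics
tibble(actual = test %>% pull(!!outcome_var), predicted = pred_mean) %>%
  summarize(MAE = mean(abs(actual - predicted)),
            RMSE = sqrt(mean((actual - predicted)^2)))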

Look at the posterior prediction distribution for a single observation

# Pick a test-set row; post_pred column names are test row indices
row_num <- quo(`2`)

true_value <- test %>% slice(as.numeric(as_label(row_num))) %>%
  pull(outcome_var)

ggplot(aes(x=!!row_num),data=post_pred) + geom_density() + theme_minimal() +
  geom_vline(xintercept=true_value,color='steelblue')
## Don't know how to automatically pick scale for object of type ppd/matrix. Defaulting to continuous.

# Take a look at that same row number
print(test %>% slice(as.numeric(as_label(row_num))))
## # A tibble: 1 x 2
##   weight feed   
##    <dbl> <fct>  
## 1    230 soybean