diff --git a/.gitignore b/.gitignore index 45e0f08..e3c484a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.DS_Store drafts/* +final_exam/* homework/*.pdf homework_keys/* lab/*.pdf @@ -14,4 +15,5 @@ lecture_drafts/* lecture_recordings/* miscellaneous/* paperwork/* +readings/* scripts/* \ No newline at end of file diff --git a/homework/hw_week_14.Rmd b/homework/hw_week_14.Rmd new file mode 100644 index 0000000..bb40570 --- /dev/null +++ b/homework/hw_week_14.Rmd @@ -0,0 +1,59 @@ +--- +title: "EEEB UN3005/GR5005 \nHomework - Week 14 - Due 05 May 2020" +author: "USE THE NUMERIC PORTION OF YOUR UNI HERE" +output: pdf_document +fontsize: 12pt +--- + +```{r setup, include = FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(rethinking) +library(dplyr) +``` + + +**Homework Instructions:** Complete this assignment by writing code in the code chunks provided. If required, provide written explanations **below** the relevant code chunks. Replace "USE THE NUMERIC PORTION OF YOUR UNI HERE" in the document header with the numbers appearing in your UNI. When complete, knit this document within RStudio to generate a PDF file. Please review the resulting PDF to ensure that all content relevant for grading (i.e., code, code output, and written explanations) appears in the document. Rename your PDF document according to the following format: hw_week_14_UNInumbers.pdf. Upload this final homework document to CourseWorks by 5 pm on the due date. + + +This week's homework problems will use a historical human ecology/demography dataset from 1988 on nearly 2,000 Bangladeshi women. Access this dataset, which includes information on the women's households and behavior, from the `rethinking` package using `data("bangladesh")`. + + +## Problem 1 (2 points) + +Import the `bangladesh` data and **filter the data to only contain information on women from administrative districts 1 through 50 (the `district` variable).** Use this filtered dataset for all problems in this assignment. If you don't, you're sure to run into some modeling trouble. + +First, fit a binomial generalized linear model using `district` to predict a woman's decision to use contraception (the `use.contraception` variable). Note, treating `district` as an index variable to generate a vector of intercept parameters (one corresponding to each district) will be the most effective approach here. Use a prior of `dnorm(0, 5)` for your intercept parameters. + +After fitting the model, use `precis()` to report the 97% PIs of the fit model parameters. + +```{r} + +``` + + +## Problem 2 (4 points) + +Now, formulate and fit this same model as a multilevel binomial generalized linear model. Use a prior of `dnorm(0, 5)` for the mean of your cluster of intercepts and a prior of `dcauchy(0, 1)` for the standard deviation of your cluster of intercepts. Don't worry about any warning messages you may receive regarding divergent iterations during sampling. + +After fitting the model, use `precis()` to report the 97% HPDIs of the fit model parameters. + +Using posterior samples from the model, visualize the implied probability of contraception use for a woman occupying an *average* district. + +```{r} + +``` + + +## Problem 3 (4 points) + +Adapt the model you constructed in Problem 2 to also consider the impact of whether or not a woman lives in an urban area as a predictor for contraception use. Note, `urban` is already a dummy variable, with a value of 1 indicating a woman who lives in a city and a value of 0 indicating a woman who lives in a rural area. Use a prior of `dnorm(0, 5)` for your beta coefficient for contraception use. + +After fitting the new model, use `precis()` to report the 97% HPDIs of the fit model parameters. + +Interpret the urban living effect. Based on this dataset and model, are urban women more or less likely to use contraception? + +```{r} + +``` diff --git a/lab/lab_week_14.Rmd b/lab/lab_week_14.Rmd new file mode 100644 index 0000000..00e6249 --- /dev/null +++ b/lab/lab_week_14.Rmd @@ -0,0 +1,53 @@ +--- +title: "EEEB UN3005/GR5005 \nLab - Week 14 - 27 and 29 April 2020" +author: "USE YOUR NAME HERE" +output: pdf_document +fontsize: 12pt +--- + +```{r setup, include = FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(rethinking) +``` + + +**Updated Lab Instructions Due to Remote Teaching:** Complete this assignment by writing code in the code chunks provided. If required, provide written explanations below the relevant code chunks. When complete, knit this document within RStudio to generate a PDF, and upload that document to CourseWorks by 5 pm on 30 April. + + +This week's lab exercises will use the snow goose color morph dataset that was introduced at the end of last week's lecture. The following code will load this dataset into your R environment as a data frame called `geese`: + +```{r load_snow_goose_data} + +# Load in snow goose data in aggregated binomial format +geese <- data.frame( + blue_geese = as.integer(c(215, 84, 7)), + total_geese = as.integer(c(500, 300, 25)), + study_site = as.factor(c("Site A", "Site B", "Site C")) +) +``` + + +## Exercise 1 + +Use `map()` to fit a binomial generalized linear model, with `study_site` as a predictor variable and `blue_geese` as the outcome variable. Rather than constructing dummy variables to represent site affiliation, use the existing `study_site` variable as an index variable to create a vector of intercept parameters. Use a prior of `dnorm(0, 5)` for your intercept parameters. This model should be analogous to the model that was demonstrated during the week 13 lecture. However, with this model formulation, you'll generate an intercept value that directly corresponds to each study site's log-odds of a goose belonging to the blue morph. + +After fitting the model, use `precis()` to report the 97% PIs of the fit model parameters. Use posterior samples from the model and `dens()` to visualize the implied probability of success (i.e., implied probability of a goose being blue) values for each of the three study sites. + +```{r} + +``` + + +## Exercise 2 + +Now, formulate and fit this same model as a multilevel binomial generalized linear model. Remember, you'll have to use `map2stan()` in order to successfully fit a multilevel model. Don't worry about any warning messages you may receive regarding divergent iterations during sampling. Use a prior of `dnorm(0, 5)` for the mean of your cluster of intercepts and a prior of `dcauchy(0, 1)` for the standard deviation of your cluster of intercepts. + +After fitting the model, visualize the MCMC chain(s) using a trace plot function, and use `precis()` to report the 97% HPDIs of the fit model parameters. Use posterior samples from the model and `dens()` to visualize the implied probability of success (i.e., implied probability of a goose being blue) values for each of the three study sites. + +What differences, if any, do you see between the posterior probability of success values for each site with this model compared to the model fit in Exercise 1? Referencing your `precis()` output for both models (which show raw model estimates on the log-odds scale) may help with interpretation. + +```{r} + +``` diff --git a/lecture/lecture_week_14.R b/lecture/lecture_week_14.R new file mode 100644 index 0000000..261dcdf --- /dev/null +++ b/lecture/lecture_week_14.R @@ -0,0 +1,164 @@ + + +library(rethinking) + +#========================================================== + + +# Load the reed frog data +data(reedfrogs) +d <- reedfrogs + +# Make the tank cluster variable +d$tank <- 1:nrow(d) + +str(d) + +# Plot the observed tank-level survival proportions +plot(propsurv ~ tank, data = d, pch = 19, ylim = c(0, 1)) + +#========================================================== + + +# Fit a binomial GLM predicting frog survival using a +# tank-level index variable +m12.1 <- map2stan( + data = d, + alist( + surv ~ dbinom(density, p), + logit(p) <- a_tank[tank], + a_tank[tank] ~ dnorm(0, 5) + ), + chains = 4, + iter = 5000 +) + +precis(m12.1, prob = 0.97) + +# Now need to use the "depth" argument to show all fit +# model parameters... +precis(m12.1, depth = 2, prob = 0.97) + +# And when working with posterior samples, we have to +# deal with new data structures... +post <- extract.samples(m12.1) +str(post) +is.list(post) +is.matrix(post$a_tank) + +# To get the posterior samples for the first tank: +post$a_tank[, 1] +# To get the posterior samples for the second tank: +post$a_tank[, 2] + +# Plot the implied probability of survival for tanks +# 9, 32, and 44 +# Note that these all have the same observed proportion +# of tadpole survival... +dens( + logistic(post$a_tank[, 9]), + xlim = c(0, 1), ylim = c(0, 10), + xlab = "Implied probability of survival", + ylab = "Posterior density" +) + +dens( + logistic(post$a_tank[, 32]), + xlim = c(0, 1), ylim = c(0, 10), + xlab = "Implied probability of survival", + ylab = "Posterior density" +) + +dens( + logistic(post$a_tank[, 44]), + xlim = c(0, 1), ylim = c(0, 10), + xlab = "Implied probability of survival", + ylab = "Posterior density" +) + +#========================================================== + + +# Fit a binomial multilevel model predicting frog survival +# using a tank-level index variable +m12.2 <- map2stan( + data = d, + alist( + surv ~ dbinom(density, p), + logit(p) <- a_tank[tank], + a_tank[tank] ~ dnorm(a, sigma), + a ~ dnorm(0, 1), + sigma ~ dcauchy(0, 1) + ), + chains = 4, + iter = 5000 +) + +precis(m12.2, prob = 0.97) +precis(m12.2, depth = 2, prob = 0.97) + +#========================================================== + + +# Visualization to demonstrate that the tank-level +# intercepts are being drawn from (i.e., constrained by) +# a Gaussian distribution + +# Extract posterior samples +post <- extract.samples(m12.2) + +# Generate an x-axis sequence for plotting +x.seq <- seq(from = -5, to = 7, by = 0.01) + +# Plot the Gaussian distribution implied by the posterior +# means for the "a" and "sigma" parameters +plot( + x.seq, + dnorm( + x.seq, + mean = coef(m12.2)["a"], + sd = coef(m12.2)["sigma"] + ), + xlab = "log-odds of survival", ylab = "Density", + type = "n" +) + +lines( + x.seq, + dnorm( + x.seq, + mean = coef(m12.2)["a"], + sd = coef(m12.2)["sigma"] + ) +) + +# Plot as vertical lines, with some time lag, the posterior +# means for the tank-level intercepts +for (i in 1:48) { + + Sys.sleep(time = 0.5) + + points( + x = coef(m12.2)[i], + y = dnorm( + coef(m12.2)[i], + mean = coef(m12.2)["a"], + sd = coef(m12.2)["sigma"] + ), + col = "red", + pch = "|" + ) +} + +#========================================================== + + +# How do the tank-level intercept estimates from the GLM +# and the multilevel model compare? +plot( + coef(m12.1), coef(m12.2)[1:48], pch = 19, + xlim = c(-7, 7), ylim = c(-7, 7), + xlab = "Tank-level intercept mean est. from GLM", + ylab = "Tank-level intercept mean est. from multilevel model" +) +abline(a = 0, b = 1, lty = 2) diff --git a/lecture/lecture_week_14.pptx b/lecture/lecture_week_14.pptx new file mode 100644 index 0000000..7a6b8f2 Binary files /dev/null and b/lecture/lecture_week_14.pptx differ