diff --git a/data/simulated_trees.csv b/data/simulated_trees.csv new file mode 100644 index 0000000..1ed6478 --- /dev/null +++ b/data/simulated_trees.csv @@ -0,0 +1,101 @@ +"height","age" +685,82 +1202,232 +1072,147 +667,132 +1441,190 +1439,167 +628,86 +1328,223 +964,173 +1045,199 +1047,112 +736,161 +1252,170 +678,106 +900,148 +1341,181 +1461,169 +722,96 +937,143 +573,64 +1149,149 +879,140 +1319,194 +647,108 +839,125 +977,124 +645,129 +847,146 +1436,242 +1494,189 +510,101 +659,56 +1285,179 +1485,188 +997,94 +1105,211 +1314,181 +774,109 +1142,162 +644,106 +1443,256 +785,160 +610,62 +656,64 +1403,173 +1259,158 +1430,263 +833,125 +978,126 +1271,176 +506,103 +513,83 +1148,164 +1381,188 +760,92 +1268,241 +1242,210 +1433,265 +1078,151 +1168,166 +1224,158 +1333,194 +1086,175 +744,146 +1304,210 +909,148 +862,160 +931,168 +704,108 +561,65 +756,144 +788,122 +539,124 +671,90 +669,74 +1199,193 +766,98 +1301,211 +871,88 +1028,111 +822,141 +1118,176 +523,100 +868,80 +683,133 +1284,223 +1388,234 +795,139 +1169,228 +810,85 +1389,194 +861,139 +845,107 +1008,153 +920,157 +1487,207 +886,149 +584,101 +604,70 +896,109 diff --git a/lab/lab_week_06.Rmd b/lab/lab_week_06.Rmd new file mode 100644 index 0000000..78455b7 --- /dev/null +++ b/lab/lab_week_06.Rmd @@ -0,0 +1,78 @@ +--- +title: "EEEB UN3005/GR5005 \nLab - Week 06 - 02 and 04 March 2020" +author: "USE YOUR NAME HERE" +output: pdf_document +fontsize: 12pt +--- + +```{r setup, include = FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(rethinking) +library(dplyr) +library(ggplot2) +``` + + +# Gaussian Regression Models + +This week we'll be working with simulated data that's based on a real ecological research scenario. Tree age (in years) can be evaluated using tree core data, while tree height (in centimeters) can be remotely sensed using LIDAR technology. Given some initial data relating tree age and tree height, a researcher interested in tree population demography might reasonably want to estimate tree age based on tree height, avoiding the time, expense, and labor associated with the field work needed to collect tree core data. + + +## Exercise 1: Importing and Visualizing Tree Age Data + +First, import the `simulated_trees.csv` dataset into R. This data contains tree height and age variables. Visualize the tree age variable using a plot type of your choice. + +```{r} + +``` + + +## Exercise 2: Fitting a Gaussian Model of Tree Age + +Now, construct a Gaussian model of tree age, and use `map()` to fit the model. Assume priors of `dnorm(0, 50)` for the mean parameter and `dcauchy(0, 5)` for the standard deviation parameter. Visualize these priors before you fit your model. To ensure a good model fit, you will likely need to explicitly define starting values: 0 for the mean parameter and 40 for the standard deviation parameter. + +After fitting the model, use `precis()` to display the 99% PIs for all model parameters. How do you interpret the fit model parameters? + +```{r} + +``` + + +## Exercise 3: Fitting a Linear Regression of Tree Age + +Now, use tree `height` as a predictor of tree `age` in a linear regression model. Assume priors of `dnorm(0, 50)` for the intercept parameter, `dnorm(0, 50)` for the slope parameter, and `dcauchy(0, 5)` for the standard deviation parameter. To ensure a good model fit, you will likely need to explicitly define starting values: 0 for the intercept and slope parameters and 40 for the standard deviation parameter. Again, use `map()` to fit the model. + +After fitting the model, use `precis()` to summarize the 99% PIs for all model parameters. What does the estimated value of the slope parameter suggest about the relationship between tree height and tree age? What does the intercept parameter correspond to in this model? + +```{r} + +``` + + +## Exercise 4: Plotting Raw Data and the *Maximum a Posterior* Trend Line + +Using either base R or `ggplot()`, plot tree age versus tree height, and add the *maximum a posterior* line for the estimated mean age at each tree height value. In other words, this line is the relationship between tree height and mean age implied by the *maximum a posterior* intercept and slope estimates from your fit model. + +```{r} + +``` + + +## Exercise 5: Generating Tree Age Predictions + +Generate 10,000 posterior samples from your fit model from Exercise 3 with `extract.samples()`. Using these samples, make 10,000 age predictions for a tree with a height of 1,200 centimeters. What is the mean value of these age predictions? What is the 50% HPDI of these age predictions? Plot the density of these age predictions. + +```{r} + +``` + + +## Bonus Exercise: Visualizing Uncertainty in the Posterior + +Recreate the plot from Exercise 4, but instead of overlaying the *maximum a posterior* trend line, overlay the trend lines implied by the first 100 posterior samples. + +```{r} + +``` diff --git a/lecture/lecture_week_06.R b/lecture/lecture_week_06.R new file mode 100644 index 0000000..750c70a --- /dev/null +++ b/lecture/lecture_week_06.R @@ -0,0 +1,301 @@ + + +library(rethinking) +library(dplyr) + +#========================================================== + + +# Gaussian model of human height data collected by +# Nancy Howell + +# Prepare Howell data for analysis +data(Howell1) +d <- Howell1 +d2 <- filter(d, age >= 18) # filter to 18+ years of age + +# Visualize priors for the mean and standard deviation +# parameters before fitting the statistical model +curve( + dnorm(x, 178, 20), + from = 50, to = 300, + main = "Prior for mu", + xlab = "mu", + ylab = "Density" +) +curve( + dunif(x, 0, 50), + from = -10, to = 60, + main = "Prior for sigma", + xlab = "sigma", + ylab = "Density" +) + +# Fit Gaussian model using map() +m4.1 <- map( + data = d2, # specify data to fit + alist( + # define statistical model + height ~ dnorm(mu, sigma), # likelihood + mu ~ dnorm(178, 20) , # prior for the mean parameter + sigma ~ dunif(0, 50) # prior for the sd parameter + ) +) + +# Summarize model output +precis(m4.1) # shows PIs by default for map() fits + +# Extract posterior samples for all parameters +post <- extract.samples(m4.1, n = 10000) +head(post, 10) + +# Plot posterior samples for both parameters +dens( + post$mu, + main = "Posterior for mu", + xlab = "mu" +) +dens( + post$sigma, + main = "Posterior for sigma", + xlab = "sigma" +) +# These are called marginal posterior density plots + +plot( + post$mu, post$sigma, + xlab = "mu", ylab = "sigma", + pch = 19 +) + +#========================================================== + + +# Fit linear model with predictor (weight) using map() + +# Visualize the height/weight relationship in the raw data +plot( + height ~ weight, data = d2, + xlim = c(20, 70), ylim = c(120, 190), + pch = 19 +) + +# Visualize priors for the intercept and slope parameters +curve( + dnorm(x, 178, 100), + from = -300, to = 700, + main = "Prior for a (intercept)", + xlab = "a", + ylab = "Density" +) +curve( + dnorm(x, 0, 10), + from = -50, to = 50, + main = "Prior for b (slope)", + xlab = "b", + ylab = "Density" +) + +# Fit linear regression model using map() +m4.3 <- map( + data = d2, # specify data to fit + alist( + # define statistical model + height ~ dnorm(mu, sigma), # likelihood + mu <- a + b*weight, # model mu using a linear formula + a ~ dnorm(178, 100), # prior for the intercept parameter + b ~ dnorm(0, 10), # prior for the slope parameter + sigma ~ dunif(0, 50) # prior for the sd parameter + ) +) + +# Summarize model output +precis(m4.3) + +# Extract posterior samples for all parameters +post <- extract.samples(m4.3, n = 10000) +head(post, 10) + +# Plot posterior samples of a (intercept) and b (slope) +# parameters +dens( + post$a, + main = "Posterior for a (intercept)", + xlab = "a" +) +dens( + post$b, + main = "Posterior for b (slope)", + xlab = "b" +) + +plot( + post$a, post$b, + xlab = "a (intercept)", ylab = "b (slope)", + pch = 19 +) +# We can see in this model these two parameters covary +# strongly with each other + +# Why this relationship? +plot( + height ~ weight, data = d2, + xlim = c(0, 70), ylim = c(100, 200), + pch = 19 +) +for (i in 1:50) { + abline( + a = post$a[i], b = post$b[i], + col = alpha("seagreen", 0.5) + ) +} + +#========================================================== + + +# Using a centered predictor variable + +# Center the weight predictor variable and compare to the +# raw variable +d2$weight.c <- d2$weight - mean(d2$weight) +plot(weight.c ~ weight, data = d2, pch = 19) +cor(d2$weight, d2$weight.c) # perfect correlation + +m4.4 <- map( + data = d2, + alist( + height ~ dnorm(mu, sigma), + mu <- a + b*weight.c, + a ~ dnorm(178, 100), + b ~ dnorm(0, 10), + sigma ~ dunif(0, 50) + ) +) + +precis(m4.4) + +# Remember: the intercept is the value of the linear +# portion of the model when all predictors equal 0. +# That's still what the "a" parameter in model m4.4 +# is telling us. It's just that our predictor has +# changed meaning slightly, so our interpretation is +# different. The value of "a" now corresponds to the +# expected mean height of someone with average weight. +plot( + height ~ weight.c, data = d2, + xlim = c(-40, 40), ylim = c(100, 200), + pch = 19 +) + +# Can also explicitly define parameter start values +# Not necessary in this case, but sometimes needed to +# help map() accurately describe the posterior shape + +m4.4 <- map( + data = d2, + alist( + height ~ dnorm(mu, sigma), + mu <- a + b*weight.c, + a ~ dnorm(178, 100), + b ~ dnorm(0, 10), + sigma ~ dunif(0, 50) + ), + start = list(a = 100, b = 0, sigma = 2) +) + +precis(m4.4) + + +# To standardize a predictor variable: + +d2$weight.s <- + (d2$weight - mean(d2$weight)) / sd(d2$weight) +plot(weight.s ~ weight, data = d2, pch = 19) +cor(d2$weight, d2$weight.s) # perfect correlation + +# Or: + +d2$weight.s <- scale(d2$weight) +plot(weight.s ~ weight, data = d2, pch = 19) +cor(d2$weight, d2$weight.s) # perfect correlation + +#========================================================== + + +# Visualizing posterior parameter estimates + +# Show the MAP trend line + +# First recognize the information that "coef()" gives you +precis(m4.3) +coef(m4.3) + +plot( + height ~ weight, data = d2, + xlim = c(0, 70), ylim = c(100, 200), + pch = 19 +) + +abline(a = coef(m4.3)["a"], b = coef(m4.3)["b"]) + +# Show uncertainty in the regression trend line +# (first 100 estimates) + +head(post, 10) + +plot( + height ~ weight, data = d2, + xlim = c(0, 70), ylim = c(100, 200), + pch = 19 +) + +for (i in 1:100) { + abline( + a = post$a[i], b = post$b[i], + col = alpha("seagreen", 0.1) + ) +} + +#========================================================== + + +# Generating model-based predictions + +# Generate 10,000 predicted heights for an individual +# of 50 kilograms + +preds.50 <- rnorm( + 10000, + mean = post$a + post$b*50, + sd = post$sigma +) + +plot( + preds.50, + ylab = "Predicted Height (cm) for an Individual of 50 kg" +) +dens( + preds.50, + xlab = "Predicted Height (cm) for an Individual of 50 kg" +) + +# Or rethinking has built-in convenience functions to +# generate predicted mu values (link) or full +# predictions (sim) for you + +mu.50.rethinking <- + link(m4.3, data = list(weight = 50), n = 10000) + +dens( + mu.50.rethinking, + xlab = "Predicted Mean Height (cm) for an Individual of 50 kg" +) + +preds.50.rethinking <- + sim(m4.3, data = list(weight = 50), n = 10000) + +dens( + preds.50.rethinking, col = "darkgreen", + xlab = "Predicted Height (cm) for an Individual of 50 kg" +) +dens(preds.50, col = "darkred", add = TRUE) diff --git a/lecture/lecture_week_06.pptx b/lecture/lecture_week_06.pptx new file mode 100644 index 0000000..eb5216d Binary files /dev/null and b/lecture/lecture_week_06.pptx differ