Modeling Workflow

Jesse Cambon 24 November, 2019

Demonstrate model workflows with tidyverse, modelr, and broom. This notebook includes both a group_by and a nested approach which offer similar results. However, the nested model workflow embeds the data into the dataframe along with objects such as models.

References

http://r4ds.had.co.nz/many-models.html

Setup

library(tidyverse)
library(gapminder)
library(broom)
#library(modelr)
library(knitr)
library(kableExtra)

Exploration

These graphs show why log transforming GDP per Capita makes it correlate more linearly to our response variable, life expectancy. Log transformations are often useful for highly skewed variables in regression.

ggplot(data=gapminder,
          aes(x = gdpPercap, y = lifeExp, color = continent,group=1)) +
geom_point(alpha=0.7) +
theme_bw() +
geom_smooth() +
theme(legend.position='top',
  plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
guides(color=guide_legend(override.aes = list(size=2.5)))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data=gapminder,
          aes(x = log10(gdpPercap), y = lifeExp, color = continent,group=1)) +
geom_point(alpha=0.7) +
theme_bw() +
geom_smooth() +
theme(legend.position='top',
  plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
guides(color=guide_legend(override.aes = list(size=2.5)))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data=gapminder,
          aes(x = log10(pop), y = lifeExp, color = continent,group=1)) +
geom_point(alpha=0.7) +
#facet_grid(~continent) +
theme_bw() +
geom_smooth() +
theme(legend.position='top',
  plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
guides(color=guide_legend(override.aes = list(size=2.5)))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Grouped Models

# One model per continent
models <- gapminder %>%
  group_by(continent) %>%
  do(fit=lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data=.)) 

stats <- glance(models,fit) %>%
  arrange(desc(r.squared))

coefficients <- tidy(models,fit) %>%
  filter(term != '(Intercept)') %>%
  arrange(continent,p.value)

model_fit <- augment(models,fit)

ggplot(data=model_fit,
          aes(x = .fitted, y = .resid, color = continent,group=1)) +
geom_point(alpha=0.8) +
facet_grid(~continent) +
ggtitle('Fitted vs. Residual Check') +
theme_bw() +
geom_hline(yintercept=0,color='blue') + # horizontal line at 0 residual
theme(legend.position='none',
  plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
guides(color=guide_legend(override.aes = list(size=2.5))) +
xlab('Fitted') +
ylab('Residual')

ggplot(data=model_fit,
          aes(.resid)) +
geom_histogram(aes(fill=continent)) +
facet_grid(~continent) +
ggtitle('Residual Distribution') +
theme_bw() +
scale_y_continuous(expand = c(0,0,0.05,0)) + 
theme(legend.position='none',
  plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
guides(color=guide_legend(override.aes = list(size=2.5))) +
xlab('Residual') +
ylab('Count')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

kable(stats,format='markdown',digits=2) %>%
  kable_styling(bootstrap_options = c("striped",'border'))

continent	r.squared	adj.r.squared	sigma	statistic	df	logLik	AIC	BIC	deviance	df.residual
Oceania	0.96	0.96	0.78	172.97	4	-26.03	62.06	67.95	12.30	20
Europe	0.80	0.80	2.41	487.82	4	-825.98	1661.96	1681.39	2073.54	356
Americas	0.72	0.72	4.96	255.52	4	-903.93	1817.85	1836.37	7274.08	296
Asia	0.70	0.70	6.50	308.12	4	-1301.08	2612.15	2632.06	16558.14	392
Africa	0.50	0.50	6.48	207.77	4	-2049.22	4108.45	4130.63	26011.51	620

kable(coefficients,format='markdown',digits=4) %>%
  kable_styling(bootstrap_options = c("striped",'border'))

continent	term	estimate	std.error	statistic	p.value
Africa	year	0.2551	0.0160	15.8991	0.0000
Africa	log10(gdpPercap)	11.0142	0.7141	15.4237	0.0000
Africa	log10(pop)	-0.5390	0.4192	-1.2857	0.1990
Americas	log10(gdpPercap)	18.5492	1.1513	16.1118	0.0000
Americas	year	0.2690	0.0179	15.0519	0.0000
Americas	log10(pop)	-1.9190	0.5545	-3.4607	0.0006
Asia	log10(gdpPercap)	12.6233	0.7074	17.8454	0.0000
Asia	year	0.2974	0.0219	13.5703	0.0000
Asia	log10(pop)	2.0425	0.4854	4.2077	0.0000
Europe	log10(gdpPercap)	11.5695	0.4930	23.4667	0.0000
Europe	year	0.1005	0.0091	11.0939	0.0000
Europe	log10(pop)	-1.0054	0.2244	-4.4804	0.0000
Oceania	year	0.1737	0.0384	4.5299	0.0002
Oceania	log10(pop)	0.6644	0.5984	1.1102	0.2801
Oceania	log10(gdpPercap)	4.1229	4.9721	0.8292	0.4168

Nested Models

Now we create a similar model with nesting

my_model <- function(df) {
  lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data= df)
}

# Nest models by continent 
nested_models <- gapminder %>% 
  group_by(continent,country) %>% 
  nest() %>%
  # fit models
  mutate(fit = map(data, my_model)) %>%
  # calculate residuals
  mutate(augment = map(fit, augment),
    stats = map(fit,glance),
    terms = map(fit,tidy)) %>%
  ungroup()

# Dataset with predictions and residuals
nest_fit <- nested_models %>% unnest(augment)

nest_stats <- nested_models %>%
  unnest(stats,.drop=TRUE) %>%
  arrange(desc(r.squared))

## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
## All list-columns are now preserved.
## This warning is displayed once per session.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

nest_coefficients <- nested_models %>%
  unnest(terms,.drop=TRUE) %>%
  filter(term != '(Intercept)') %>%
  arrange(continent,country,desc(p.value))

most_important_vars <- nest_coefficients %>%
  group_by(country) %>% 
  slice(1)

summ_imp_vars <- most_important_vars %>%
  group_by(continent) %>%
  count(term) %>%
  arrange(continent,desc(n))

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Modeling_Workflow.md

Modeling_Workflow.md

Modeling Workflow

References

Setup

Exploration

Grouped Models

Nested Models

Files

Modeling_Workflow.md

Latest commit

History

Modeling_Workflow.md

File metadata and controls

Modeling Workflow

References

Setup

Exploration

Grouped Models

Nested Models