Skip to content

Latest commit

 

History

History
1228 lines (1133 loc) · 26.5 KB

Titanic.md

File metadata and controls

1228 lines (1133 loc) · 26.5 KB

Modeling Fundamentals - The Titanic

Jesse Cambon 01 August, 2021

This page is meant to serve as a primer for modeling in R. A linear and logistic regression are demonstrated on the Titanic dataset to model fare prices and survival rates using gender, age, and passenger class as covariates. Data imputation is performed on age in order to avoid biasing results. The broom package is used to display the results of the models and graphical methods are used throughout to explore the dataset and evaluate the fit of each model.

References

Setup

library(PASWR) # titanic3 dataset
library(MASS) # confint for glm
library(wesanderson) # color palettes
library(formattable) # percent format
library(caret) # regression utilities
# library(Hmisc) # capitalize function
library(broom) # model display capabilities
# library(xtable) # pretty table
library(kableExtra)
library(tidyverse)
library(skimr) # summary stats
library(knitr)
# library(rms)
library(mice) # imputation

titanic <- titanic3 %>%
  as_tibble() %>%
  mutate(sex = str_to_title(sex))

titanic_summ <- titanic %>%
  count(survived, pclass, sex) %>%
  group_by(pclass, sex) %>%
  mutate(
    perc_surv_num = n / sum(n),
    perc_surv_char = as.character(percent(n / sum(n), 0))
  ) %>%
  ungroup() %>%
  mutate(survived = factor(survived, labels = c("Died", "Survived")))

# Set default ggplot theme
theme_set(theme_bw() +
  theme(
    legend.position = "top",
    plot.subtitle = element_text(face = "bold", hjust = 0.5),
    plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)
  ))

Exploratory Analysis

skim(titanic)
Data summary
Name titanic
Number of rows 1309
Number of columns 14
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
Column type frequency:
character 1
factor 7
numeric 6
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
Group variables None

Variable type: character

skim\_variable n\_missing complete\_rate min max empty n\_unique whitespace
sex 0 1 4 6 0 2 0

Variable type: factor

skim\_variable n\_missing complete\_rate ordered n\_unique top\_counts
pclass 0 1 FALSE 3 3rd: 709, 1st: 323, 2nd: 277
name 0 1 FALSE 1307 Con: 2, Kel: 2, Abb: 1, Abb: 1
ticket 0 1 FALSE 929 CA.: 11, 160: 8, CA : 8, 310: 7
cabin 0 1 FALSE 187 emp: 1014, B57: 5, C23: 5, G6: 5
embarked 0 1 FALSE 4 Sou: 914, Che: 270, Que: 123, emp: 2
boat 0 1 FALSE 28 emp: 823, 13: 39, C: 38, 15: 37
home.dest 0 1 FALSE 369 emp: 564, New: 64, Lon: 14, Mon: 10

Variable type: numeric

skim\_variable n\_missing complete\_rate mean sd p0 p25 p50 p75 p100 hist
survived 0 1.00 0.38 0.49 0.00 0.0 0.00 1.00 1.00 ▇▁▁▁▅
age 263 0.80 29.88 14.41 0.17 21.0 28.00 39.00 80.00 ▂▇▅▂▁
sibsp 0 1.00 0.50 1.04 0.00 0.0 0.00 1.00 8.00 ▇▁▁▁▁
parch 0 1.00 0.39 0.87 0.00 0.0 0.00 0.00 9.00 ▇▁▁▁▁
fare 1 1.00 33.30 51.76 0.00 7.9 14.45 31.27 512.33 ▇▁▁▁▁
body 1188 0.09 160.81 97.70 1.00 72.0 155.00 256.00 328.00 ▇▇▇▅▇
gender_summ <- titanic %>% count(sex)
pclass_summ <- titanic %>% count(pclass)
surv_summ <- titanic %>% count(survived)
kable(gender_summ)
sex n
Female 466
Male 843
kable(pclass_summ)
pclass n
1st 323
2nd 277
3rd 709
kable(surv_summ)
survived n
0 809
1 500
titanic_summ_dot <- titanic_summ %>%
  filter(survived == "Survived") %>%
  mutate(group = str_c(sex, "-", pclass))

# Analyze where age is missing
titanic_miss <- titanic %>%
  select(pclass, sex, age, survived) %>%
  mutate(missing_age = is.na(age)) %>%
  group_by(pclass, sex) %>%
  dplyr::summarize(perc_missing = mean(missing_age)) %>%
  mutate(group = str_c(sex, "-", pclass))
## `summarise()` has grouped output by 'pclass'. You can override using the `.groups` argument.
ggplot(titanic_summ_dot, aes(x = reorder(group, -perc_surv_num), y = perc_surv_num)) +
  geom_point(size = 3, color = "black") + # Draw points
  theme(
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  scale_y_continuous(expand = c(0, 0, 0.015, 0), labels = scales::percent) +
  geom_segment(aes(
    x = group,
    xend = group,
    y = min(0),
    yend = max(1)
  ),
  linetype = "dashed", color = "black",
  size = 0.1
  ) + # Draw dashed lines
  labs(title = "Titanic Passenger Survival Rates") +
  coord_flip() +
  xlab("Group") +
  ylab("Survival Rate")

ggplot(
  data = titanic_summ,
  aes(x = fct_rev(pclass), y = perc_surv_num, fill = survived)
) +
  facet_grid(~ factor(sex)) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  geom_text(
    data = titanic_summ, aes(label = ifelse(perc_surv_num > 0.07, perc_surv_char, NA)),
    size = 3, position = position_stack(vjust = 0.5)
  ) +
  scale_fill_manual(values = wes_palette("FantasticFox1")[c(3, 4)]) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid = element_blank()
  ) +
  labs(title = "Passenger Survival Rates by Gender and Class") +
  xlab("Passenger Class") +
  ylab("") +
  guides(fill = guide_legend(title = "", reverse = T)) # reverse legend order

ggplot(titanic_miss, aes(x = reorder(group, -perc_missing), y = perc_missing)) +
  geom_point(size = 3, color = "black") + # Draw points
  theme(
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  scale_y_continuous(labels = scales::percent) +
  geom_segment(aes(
    x = group,
    xend = group,
    y = min(perc_missing),
    yend = max(perc_missing)
  ),
  linetype = "dashed", color = "black",
  size = 0.1
  ) + # Draw dashed lines
  labs(title = "Which Passengers Have Missing Age") +
  coord_flip() +
  xlab("Group") +
  ylab("Percent Missing Age")

Imputation

Discarding rows with missing age has the potential to bias our data so we will impute.

https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/ https://www.andrew.cmu.edu/user/aurorat/MIA_r.html http://www.gerkovink.com/miceVignettes/Convergence_pooling/Convergence_and_pooling.html

# Use mice to impute data - takes a few seconds on my laptop
# Increasing maxit from the default gives better results
# Making sure to not sure the outcome variable to impute.
titanic_imputed <- mice(titanic %>% select(sex, pclass, age), method = "pmm", maxit = 80, seed = 3530, printFlag = F)
## Warning: Number of logged events: 1
# Add imputed Data
titanic_imp <- complete(titanic_imputed, 5) %>%
  bind_cols(titanic %>% select(survived, age, fare) %>% rename(age_orig = age)) %>%
  mutate(imputed = case_when(is.na(age_orig) ~ "Imputed", TRUE ~ "Original"))

# Plot
ggplot(data = titanic_imp) +
  geom_density(aes(x = age, color = imputed), alpha = 0.8) +
  # facet_grid(vars(sex),vars(pclass)) +
  theme(legend.position = "top") +
  labs(title = "Distributions of Original v. Imputed Age") +
  guides(color = guide_legend(title = ""))

ggplot(data = titanic_imp) +
  geom_density(aes(x = age, color = imputed), alpha = 0.8) +
  facet_grid(sex ~ pclass) +
  theme(legend.position = "top") +
  labs(title = "Distributions of Original v. Imputed Age") +
  guides(color = guide_legend(title = ""))

Logistic Regression Model

We will use the brier score as one measurement of accuracy for our model: https://en.wikipedia.org/wiki/Brier_score The book ‘Superforecasting’ by Philip Tetlock has a good discussion of the use of brier scores.

log_fit <- glm(survived ~ sex + pclass + age, family = binomial(link = "logit"), data = titanic_imp)

predictions <- titanic_imp %>%
  select(sex, pclass, age, survived) %>%
  mutate(prediction = predict(log_fit, newdata = titanic_imp, type = "response")) %>%
  mutate(
    prediction_binary = case_when(prediction > 0.5 ~ 1, TRUE ~ 0),
    brier_score = (prediction - survived)^2
  )

# summary(fit)

log_confint <- augment(log_fit, se_fit = TRUE, type.predict = 'response')
#colnames(log_confint) <- c("Term", "LCLM", "UCLM")

log_info <- glance(log_fit) %>%
  mutate(meanBrierScore = mean(predictions$brier_score, na.rm = T)) %>%
  dplyr::select(meanBrierScore, everything())

log_terms <- tidy(log_fit, conf.int = TRUE) %>%
  rename(Coefficient = estimate, Term = term) %>%
  # Order by largest coefficient but put intercept term on bottom
#  left_join(log_confint, by = "Term") %>%
  mutate(OR = exp(Coefficient)) %>%
  select(Term, Coefficient, OR, everything()) %>%
  arrange(Term == "(Intercept)", desc(abs(Coefficient)))

# An analysis of our model's classification accuracy
confusionMatrix(factor(predictions$prediction_binary), factor(predictions$survived))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 686 159
##          1 123 341
##                                           
##                Accuracy : 0.7846          
##                  95% CI : (0.7613, 0.8066)
##     No Information Rate : 0.618           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5373          
##                                           
##  Mcnemar's Test P-Value : 0.03714         
##                                           
##             Sensitivity : 0.8480          
##             Specificity : 0.6820          
##          Pos Pred Value : 0.8118          
##          Neg Pred Value : 0.7349          
##              Prevalence : 0.6180          
##          Detection Rate : 0.5241          
##    Detection Prevalence : 0.6455          
##       Balanced Accuracy : 0.7650          
##                                           
##        'Positive' Class : 0               
## 
ggplot(
  data = predictions,
  aes(x = age, y = prediction, color = pclass)
) +
  geom_point() +
  facet_grid(~sex) +
  scale_y_continuous(labels = scales::percent) +
  theme(legend.margin = margin(0, 0, 0, 0)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Probability of Survival - Logistic Regression") +
  xlab("Age") +
  ylab("Survival Probability") +
  guides(color = guide_legend(title = "Passenger Class", reverse = F, override.aes = list(size = 2.5)))

ggplot(predictions, aes(prediction)) +
  geom_histogram(
    binwidth = 0.02, aes(fill = factor(survived, labels = c("Died", "Survived"))),
    col = "black"
  ) +
  # prevent right label on axis from being clipped
  theme(legend.pos = "top", plot.margin = margin(r = 20, unit = "pt")) +
  scale_fill_manual(values = wes_palette("Moonrise3")) +
  scale_x_continuous(
    labels = scales::percent,
    limits = c(0, 1),
    expand = c(0, 0, 0, 0)
  ) + # eliminate left/right margin
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  labs(title = "Logistic Regression Probability Distribution") +
  xlab("Survival Probability") +
  ylab("Count") +
  guides(fill = guide_legend(title = ""))
## Warning: Removed 4 rows containing missing values (geom_bar).

# Same graph as prior but faceted on class

ggplot(predictions, aes(prediction)) +
  geom_histogram(
    binwidth = 0.05, aes(fill = factor(survived, labels = c("Died", "Survived"))),
    col = "black"
  ) +
  facet_wrap(~sex, scales = "free_y") +
  theme(
    legend.pos = "top",
    # prevent right label on axis from being clipped
    plot.margin = margin(r = 20, unit = "pt")
  ) +
  scale_fill_manual(values = wes_palette("Moonrise3")) +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  labs(title = "Logistic Regression Probability Distribution by Gender") +
  xlab("Survival Probability") +
  ylab("Count") +
  guides(fill = guide_legend(title = ""))

ggplot(predictions, aes(brier_score)) +
  geom_histogram(
    binwidth = 0.02, aes(fill = factor(survived, labels = c("Died", "Survived"))),
    col = "black"
  ) +
  labs(title = "Brier Score Distribution") +
  scale_fill_manual(values = wes_palette("Moonrise3")) +
  # Use expand to make sure right axis label isn't clipped
  scale_x_continuous(expand = c(0, 0, 0.01, 0), limits = c(0, 1)) +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  xlab("Brier Score") +
  ylab("Count") +
  guides(fill = guide_legend(title = ""))
## Warning: Removed 4 rows containing missing values (geom_bar).

OR = Odds Ratio. LCLM and UCLM are lower and upper 95% confidence limits.

kable(log_info %>%
  select(-df.residual, -df.null, -deviance), format = "markdown", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "border"))
meanBrierScore null.deviance logLik AIC BIC nobs
0.15 1741.02 -616.16 1242.32 1268.21 1309
kable(log_terms,format='markdown',digits = 2) %>%
  kable_styling(bootstrap_options = c("striped",'border'))
Term Coefficient OR std.error statistic p.value conf.low conf.high
sexMale -2.49 0.08 0.15 -16.78 0 -2.78 -2.20
pclass3rd -2.18 0.11 0.20 -10.87 0 -2.58 -1.79
pclass2nd -1.16 0.31 0.21 -5.55 0 -1.58 -0.76
age -0.03 0.97 0.01 -4.89 0 -0.04 -0.02
(Intercept) 3.19 24.40 0.29 11.00 0 2.64 3.78

Linear Regression Model

A linear model of passenger fare cost.

lm_fit <- lm(fare ~ sex + pclass + age + survived, data = titanic_imp)

# Calculate confidence limit
lm_confint <- augment(lm_fit, se_fit = TRUE)

lm_predictions <- titanic_imp %>%
  select(sex, pclass, age, survived, fare) %>%
  mutate(prediction = predict(lm_fit, newdata = titanic_imp)) %>%
  mutate(residual = fare - prediction)

lm_info <- glance(lm_fit)

lm_terms <- tidy(lm_fit, conf.int = TRUE) %>%
  rename(Term = term, Coefficient = estimate) %>%
#  left_join(lm_confint, by = "Term") %>%
  select(Term, Coefficient, everything()) %>%
  arrange(Term == "(Intercept)", desc(abs(Coefficient)))

# summary(lm_fit)

# Histogram of Residuals
ggplot(lm_predictions, aes(residual)) +
  geom_histogram(bins = 30) +
  facet_grid(~pclass, scales = "free_x") +
  geom_vline(xintercept = 0, color = "blue") +
  scale_x_continuous(labels = scales::dollar, expand = c(0, 0, 0, 0)) +
  scale_y_continuous(expand = c(0, 0, 0.07, 0)) +
  labs(title = "Residual Distribution by Passenger Class") +
  xlab("Residual") +
  ylab("Count")
## Warning: Removed 1 rows containing non-finite values (stat_bin).

ggplot(
  data = lm_predictions,
  aes(x = age, y = prediction, color = pclass, group = 1)
) +
  geom_point() +
  facet_grid(~ factor(sex)) +
  scale_y_continuous(labels = scales::dollar) +
  # theme(legend.margin=margin(0,0,0,0)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Cost of Fare - Linear Regression") +
  xlab("Age") +
  ylab("Fare Cost") +
  guides(color = guide_legend(title = "Passenger Class", reverse = F, override.aes = list(size = 2.5)))

ggplot(
  data = lm_predictions,
  aes(x = prediction, y = residual, color = sex)
) +
  geom_point() +
  facet_grid(~pclass, scales = "free_x") +
  geom_hline(yintercept = 0, color = "blue") + # horizontal line at 0 residual
  # geom_smooth(method="lm",show.legend=F,size=0.5) +
  scale_x_continuous(labels = scales::dollar) +
  scale_y_continuous(labels = scales::dollar) +
  # theme(legend.pos='none') +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Residuals vs Predictions by Passenger Class") +
  xlab("Prediction") +
  ylab("Residual") +
  guides(color = guide_legend(title = "Gender", reverse = F, override.aes = list(size = 2.5)))
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(
  data = lm_predictions,
  aes(x = age, y = residual, color = sex)
) +
  geom_point() +
  facet_grid(~pclass) +
  geom_hline(yintercept = 0, color = "blue") + # horizontal line at 0 residual
  scale_y_continuous(labels = scales::dollar) +
  theme(legend.margin = margin(0, 0, 0, 0)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Residuals By Passenger Class") +
  xlab("Age") +
  ylab("Residual") +
  guides(color = guide_legend(title = "Gender", reverse = F, override.aes = list(size = 2.5)))
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(
  data = lm_terms,
  aes(x = reorder(Term, -Coefficient), y = Coefficient)
) +
  geom_point() +
  coord_flip() +
  geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Linear Model Coefficients with Confidence Intervals") +
  xlab("Term")

kable((lm_info %>% dplyr::select(-df.residual, -logLik, -deviance)), format = "markdown", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "border"))
r.squared adj.r.squared sigma statistic p.value df AIC BIC nobs
0.38 0.38 40.83 159.63 0 5 13423.87 13460.11 1308
kable(lm_terms, format = "markdown", digits = c(1, 1, 1, 1, 2, 2, 2)) %>%
  kable_styling(bootstrap_options = c("striped", "border"))
Term Coefficient std.error statistic p.value conf.low conf.high
pclass3rd -75.2 3.2 -23.2 0.00 -81.50 -68.80
pclass2nd -67.2 3.5 -19.2 0.00 -74.05 -60.35
sexMale -11.9 2.8 -4.3 0.00 -17.35 -6.44
survived 0.7 2.9 0.3 0.80 -4.93 6.41
age -0.2 0.1 -2.2 0.03 -0.37 -0.02
(Intercept) 101.3 5.2 19.5 0.00 91.07 111.45