Modeling Fundamentals - The Titanic

Jesse Cambon 01 August, 2021

References
Setup
Exploratory Analysis
Imputation
Logistic Regression Model
Linear Regression Model

This page is meant to serve as a primer for modeling in R. A linear and logistic regression are demonstrated on the Titanic dataset to model fare prices and survival rates using gender, age, and passenger class as covariates. Data imputation is performed on age in order to avoid biasing results. The broom package is used to display the results of the models and graphical methods are used throughout to explore the dataset and evaluate the fit of each model.

References

Intro to Statistical Learning: http://www-bcf.usc.edu/~gareth/ISL/
Logistic Regression: https://stats.idre.ucla.edu/r/dae/logit-regression/

Setup

library(PASWR) # titanic3 dataset
library(MASS) # confint for glm
library(wesanderson) # color palettes
library(formattable) # percent format
library(caret) # regression utilities
# library(Hmisc) # capitalize function
library(broom) # model display capabilities
# library(xtable) # pretty table
library(kableExtra)
library(tidyverse)
library(skimr) # summary stats
library(knitr)
# library(rms)
library(mice) # imputation

titanic <- titanic3 %>%
  as_tibble() %>%
  mutate(sex = str_to_title(sex))

titanic_summ <- titanic %>%
  count(survived, pclass, sex) %>%
  group_by(pclass, sex) %>%
  mutate(
    perc_surv_num = n / sum(n),
    perc_surv_char = as.character(percent(n / sum(n), 0))
  ) %>%
  ungroup() %>%
  mutate(survived = factor(survived, labels = c("Died", "Survived")))

# Set default ggplot theme
theme_set(theme_bw() +
  theme(
    legend.position = "top",
    plot.subtitle = element_text(face = "bold", hjust = 0.5),
    plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)
  ))

Exploratory Analysis

skim(titanic)

Data summary


Name	titanic
Number of rows	1309
Number of columns	14
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
Column type frequency:
character	1
factor	7
numeric	6
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
Group variables	None

Variable type: character

skim\_variable	n\_missing	complete\_rate	min	max	empty	n\_unique	whitespace
sex	0	1	4	6	0	2	0

Variable type: factor

skim\_variable	complete\_rate	ordered	n\_unique	top\_counts
pclass	1	FALSE	3	3rd: 709, 1st: 323, 2nd: 277
name	1	FALSE	1307	Con: 2, Kel: 2, Abb: 1, Abb: 1
ticket	1	FALSE	929	CA.: 11, 160: 8, CA : 8, 310: 7
cabin	1	FALSE	187	emp: 1014, B57: 5, C23: 5, G6: 5
embarked	1	FALSE	4	Sou: 914, Che: 270, Que: 123, emp: 2
boat	1	FALSE	28	emp: 823, 13: 39, C: 38, 15: 37
home.dest	1	FALSE	369	emp: 564, New: 64, Lon: 14, Mon: 10

Variable type: numeric

skim\_variable	n\_missing	complete\_rate	mean	sd	p0	p25	p50	p75	p100	hist
survived	0	1.00	0.38	0.49	0.00	0.0	0.00	1.00	1.00	▇▁▁▁▅
age	263	0.80	29.88	14.41	0.17	21.0	28.00	39.00	80.00	▂▇▅▂▁
sibsp	0	1.00	0.50	1.04	0.00	0.0	0.00	1.00	8.00	▇▁▁▁▁
parch	0	1.00	0.39	0.87	0.00	0.0	0.00	0.00	9.00	▇▁▁▁▁
fare	1	1.00	33.30	51.76	0.00	7.9	14.45	31.27	512.33	▇▁▁▁▁
body	1188	0.09	160.81	97.70	1.00	72.0	155.00	256.00	328.00	▇▇▇▅▇

gender_summ <- titanic %>% count(sex)
pclass_summ <- titanic %>% count(pclass)
surv_summ <- titanic %>% count(survived)

kable(gender_summ)

sex	n
Female	466
Male	843

kable(pclass_summ)

pclass	n
1st	323
2nd	277
3rd	709

kable(surv_summ)

survived	n
0	809
1	500

titanic_summ_dot <- titanic_summ %>%
  filter(survived == "Survived") %>%
  mutate(group = str_c(sex, "-", pclass))

# Analyze where age is missing
titanic_miss <- titanic %>%
  select(pclass, sex, age, survived) %>%
  mutate(missing_age = is.na(age)) %>%
  group_by(pclass, sex) %>%
  dplyr::summarize(perc_missing = mean(missing_age)) %>%
  mutate(group = str_c(sex, "-", pclass))

## `summarise()` has grouped output by 'pclass'. You can override using the `.groups` argument.

ggplot(titanic_summ_dot, aes(x = reorder(group, -perc_surv_num), y = perc_surv_num)) +
  geom_point(size = 3, color = "black") + # Draw points
  theme(
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  scale_y_continuous(expand = c(0, 0, 0.015, 0), labels = scales::percent) +
  geom_segment(aes(
    x = group,
    xend = group,
    y = min(0),
    yend = max(1)
  ),
  linetype = "dashed", color = "black",
  size = 0.1
  ) + # Draw dashed lines
  labs(title = "Titanic Passenger Survival Rates") +
  coord_flip() +
  xlab("Group") +
  ylab("Survival Rate")

ggplot(
  data = titanic_summ,
  aes(x = fct_rev(pclass), y = perc_surv_num, fill = survived)
) +
  facet_grid(~ factor(sex)) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  geom_text(
    data = titanic_summ, aes(label = ifelse(perc_surv_num > 0.07, perc_surv_char, NA)),
    size = 3, position = position_stack(vjust = 0.5)
  ) +
  scale_fill_manual(values = wes_palette("FantasticFox1")[c(3, 4)]) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid = element_blank()
  ) +
  labs(title = "Passenger Survival Rates by Gender and Class") +
  xlab("Passenger Class") +
  ylab("") +
  guides(fill = guide_legend(title = "", reverse = T)) # reverse legend order

ggplot(titanic_miss, aes(x = reorder(group, -perc_missing), y = perc_missing)) +
  geom_point(size = 3, color = "black") + # Draw points
  theme(
    legend.position = "none",
    panel.grid = element_blank()
  ) +
  scale_y_continuous(labels = scales::percent) +
  geom_segment(aes(
    x = group,
    xend = group,
    y = min(perc_missing),
    yend = max(perc_missing)
  ),
  linetype = "dashed", color = "black",
  size = 0.1
  ) + # Draw dashed lines
  labs(title = "Which Passengers Have Missing Age") +
  coord_flip() +
  xlab("Group") +
  ylab("Percent Missing Age")

Imputation

Discarding rows with missing age has the potential to bias our data so we will impute.

https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/ https://www.andrew.cmu.edu/user/aurorat/MIA_r.html http://www.gerkovink.com/miceVignettes/Convergence_pooling/Convergence_and_pooling.html

# Use mice to impute data - takes a few seconds on my laptop
# Increasing maxit from the default gives better results
# Making sure to not sure the outcome variable to impute.
titanic_imputed <- mice(titanic %>% select(sex, pclass, age), method = "pmm", maxit = 80, seed = 3530, printFlag = F)

## Warning: Number of logged events: 1

# Add imputed Data
titanic_imp <- complete(titanic_imputed, 5) %>%
  bind_cols(titanic %>% select(survived, age, fare) %>% rename(age_orig = age)) %>%
  mutate(imputed = case_when(is.na(age_orig) ~ "Imputed", TRUE ~ "Original"))

# Plot
ggplot(data = titanic_imp) +
  geom_density(aes(x = age, color = imputed), alpha = 0.8) +
  # facet_grid(vars(sex),vars(pclass)) +
  theme(legend.position = "top") +
  labs(title = "Distributions of Original v. Imputed Age") +
  guides(color = guide_legend(title = ""))

ggplot(data = titanic_imp) +
  geom_density(aes(x = age, color = imputed), alpha = 0.8) +
  facet_grid(sex ~ pclass) +
  theme(legend.position = "top") +
  labs(title = "Distributions of Original v. Imputed Age") +
  guides(color = guide_legend(title = ""))

Logistic Regression Model

We will use the brier score as one measurement of accuracy for our model: https://en.wikipedia.org/wiki/Brier_score The book ‘Superforecasting’ by Philip Tetlock has a good discussion of the use of brier scores.

log_fit <- glm(survived ~ sex + pclass + age, family = binomial(link = "logit"), data = titanic_imp)

predictions <- titanic_imp %>%
  select(sex, pclass, age, survived) %>%
  mutate(prediction = predict(log_fit, newdata = titanic_imp, type = "response")) %>%
  mutate(
    prediction_binary = case_when(prediction > 0.5 ~ 1, TRUE ~ 0),
    brier_score = (prediction - survived)^2
  )

# summary(fit)

log_confint <- augment(log_fit, se_fit = TRUE, type.predict = 'response')
#colnames(log_confint) <- c("Term", "LCLM", "UCLM")

log_info <- glance(log_fit) %>%
  mutate(meanBrierScore = mean(predictions$brier_score, na.rm = T)) %>%
  dplyr::select(meanBrierScore, everything())

log_terms <- tidy(log_fit, conf.int = TRUE) %>%
  rename(Coefficient = estimate, Term = term) %>%
  # Order by largest coefficient but put intercept term on bottom
#  left_join(log_confint, by = "Term") %>%
  mutate(OR = exp(Coefficient)) %>%
  select(Term, Coefficient, OR, everything()) %>%
  arrange(Term == "(Intercept)", desc(abs(Coefficient)))

# An analysis of our model's classification accuracy
confusionMatrix(factor(predictions$prediction_binary), factor(predictions$survived))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 686 159
##          1 123 341
##                                           
##                Accuracy : 0.7846          
##                  95% CI : (0.7613, 0.8066)
##     No Information Rate : 0.618           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5373          
##                                           
##  Mcnemar's Test P-Value : 0.03714         
##                                           
##             Sensitivity : 0.8480          
##             Specificity : 0.6820          
##          Pos Pred Value : 0.8118          
##          Neg Pred Value : 0.7349          
##              Prevalence : 0.6180          
##          Detection Rate : 0.5241          
##    Detection Prevalence : 0.6455          
##       Balanced Accuracy : 0.7650          
##                                           
##        'Positive' Class : 0               
##

ggplot(
  data = predictions,
  aes(x = age, y = prediction, color = pclass)
) +
  geom_point() +
  facet_grid(~sex) +
  scale_y_continuous(labels = scales::percent) +
  theme(legend.margin = margin(0, 0, 0, 0)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Probability of Survival - Logistic Regression") +
  xlab("Age") +
  ylab("Survival Probability") +
  guides(color = guide_legend(title = "Passenger Class", reverse = F, override.aes = list(size = 2.5)))

ggplot(predictions, aes(prediction)) +
  geom_histogram(
    binwidth = 0.02, aes(fill = factor(survived, labels = c("Died", "Survived"))),
    col = "black"
  ) +
  # prevent right label on axis from being clipped
  theme(legend.pos = "top", plot.margin = margin(r = 20, unit = "pt")) +
  scale_fill_manual(values = wes_palette("Moonrise3")) +
  scale_x_continuous(
    labels = scales::percent,
    limits = c(0, 1),
    expand = c(0, 0, 0, 0)
  ) + # eliminate left/right margin
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  labs(title = "Logistic Regression Probability Distribution") +
  xlab("Survival Probability") +
  ylab("Count") +
  guides(fill = guide_legend(title = ""))

## Warning: Removed 4 rows containing missing values (geom_bar).

# Same graph as prior but faceted on class

ggplot(predictions, aes(prediction)) +
  geom_histogram(
    binwidth = 0.05, aes(fill = factor(survived, labels = c("Died", "Survived"))),
    col = "black"
  ) +
  facet_wrap(~sex, scales = "free_y") +
  theme(
    legend.pos = "top",
    # prevent right label on axis from being clipped
    plot.margin = margin(r = 20, unit = "pt")
  ) +
  scale_fill_manual(values = wes_palette("Moonrise3")) +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  labs(title = "Logistic Regression Probability Distribution by Gender") +
  xlab("Survival Probability") +
  ylab("Count") +
  guides(fill = guide_legend(title = ""))

ggplot(predictions, aes(brier_score)) +
  geom_histogram(
    binwidth = 0.02, aes(fill = factor(survived, labels = c("Died", "Survived"))),
    col = "black"
  ) +
  labs(title = "Brier Score Distribution") +
  scale_fill_manual(values = wes_palette("Moonrise3")) +
  # Use expand to make sure right axis label isn't clipped
  scale_x_continuous(expand = c(0, 0, 0.01, 0), limits = c(0, 1)) +
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  xlab("Brier Score") +
  ylab("Count") +
  guides(fill = guide_legend(title = ""))

## Warning: Removed 4 rows containing missing values (geom_bar).

OR = Odds Ratio. LCLM and UCLM are lower and upper 95% confidence limits.

kable(log_info %>%
  select(-df.residual, -df.null, -deviance), format = "markdown", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "border"))

meanBrierScore	null.deviance	logLik	AIC	BIC	nobs
0.15	1741.02	-616.16	1242.32	1268.21	1309

kable(log_terms,format='markdown',digits = 2) %>%
  kable_styling(bootstrap_options = c("striped",'border'))

Term	Coefficient	OR	std.error	statistic	conf.low	conf.high
sexMale	-2.49	0.08	0.15	-16.78	-2.78	-2.20
pclass3rd	-2.18	0.11	0.20	-10.87	-2.58	-1.79
pclass2nd	-1.16	0.31	0.21	-5.55	-1.58	-0.76
age	-0.03	0.97	0.01	-4.89	-0.04	-0.02
(Intercept)	3.19	24.40	0.29	11.00	2.64	3.78

Linear Regression Model

A linear model of passenger fare cost.

lm_fit <- lm(fare ~ sex + pclass + age + survived, data = titanic_imp)

# Calculate confidence limit
lm_confint <- augment(lm_fit, se_fit = TRUE)

lm_predictions <- titanic_imp %>%
  select(sex, pclass, age, survived, fare) %>%
  mutate(prediction = predict(lm_fit, newdata = titanic_imp)) %>%
  mutate(residual = fare - prediction)

lm_info <- glance(lm_fit)

lm_terms <- tidy(lm_fit, conf.int = TRUE) %>%
  rename(Term = term, Coefficient = estimate) %>%
#  left_join(lm_confint, by = "Term") %>%
  select(Term, Coefficient, everything()) %>%
  arrange(Term == "(Intercept)", desc(abs(Coefficient)))

# summary(lm_fit)

# Histogram of Residuals
ggplot(lm_predictions, aes(residual)) +
  geom_histogram(bins = 30) +
  facet_grid(~pclass, scales = "free_x") +
  geom_vline(xintercept = 0, color = "blue") +
  scale_x_continuous(labels = scales::dollar, expand = c(0, 0, 0, 0)) +
  scale_y_continuous(expand = c(0, 0, 0.07, 0)) +
  labs(title = "Residual Distribution by Passenger Class") +
  xlab("Residual") +
  ylab("Count")

## Warning: Removed 1 rows containing non-finite values (stat_bin).

ggplot(
  data = lm_predictions,
  aes(x = age, y = prediction, color = pclass, group = 1)
) +
  geom_point() +
  facet_grid(~ factor(sex)) +
  scale_y_continuous(labels = scales::dollar) +
  # theme(legend.margin=margin(0,0,0,0)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Cost of Fare - Linear Regression") +
  xlab("Age") +
  ylab("Fare Cost") +
  guides(color = guide_legend(title = "Passenger Class", reverse = F, override.aes = list(size = 2.5)))

ggplot(
  data = lm_predictions,
  aes(x = prediction, y = residual, color = sex)
) +
  geom_point() +
  facet_grid(~pclass, scales = "free_x") +
  geom_hline(yintercept = 0, color = "blue") + # horizontal line at 0 residual
  # geom_smooth(method="lm",show.legend=F,size=0.5) +
  scale_x_continuous(labels = scales::dollar) +
  scale_y_continuous(labels = scales::dollar) +
  # theme(legend.pos='none') +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Residuals vs Predictions by Passenger Class") +
  xlab("Prediction") +
  ylab("Residual") +
  guides(color = guide_legend(title = "Gender", reverse = F, override.aes = list(size = 2.5)))

## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(
  data = lm_predictions,
  aes(x = age, y = residual, color = sex)
) +
  geom_point() +
  facet_grid(~pclass) +
  geom_hline(yintercept = 0, color = "blue") + # horizontal line at 0 residual
  scale_y_continuous(labels = scales::dollar) +
  theme(legend.margin = margin(0, 0, 0, 0)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Residuals By Passenger Class") +
  xlab("Age") +
  ylab("Residual") +
  guides(color = guide_legend(title = "Gender", reverse = F, override.aes = list(size = 2.5)))

## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(
  data = lm_terms,
  aes(x = reorder(Term, -Coefficient), y = Coefficient)
) +
  geom_point() +
  coord_flip() +
  geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
  scale_color_manual(values = wes_palette("Moonrise3")) +
  labs(title = "Linear Model Coefficients with Confidence Intervals") +
  xlab("Term")

kable((lm_info %>% dplyr::select(-df.residual, -logLik, -deviance)), format = "markdown", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "border"))

r.squared	adj.r.squared	sigma	statistic	p.value	df	AIC	BIC	nobs
0.38	0.38	40.83	159.63	0	5	13423.87	13460.11	1308

kable(lm_terms, format = "markdown", digits = c(1, 1, 1, 1, 2, 2, 2)) %>%
  kable_styling(bootstrap_options = c("striped", "border"))

Term	Coefficient	std.error	statistic	p.value	conf.low	conf.high
pclass3rd	-75.2	3.2	-23.2	0.00	-81.50	-68.80
pclass2nd	-67.2	3.5	-19.2	0.00	-74.05	-60.35
sexMale	-11.9	2.8	-4.3	0.00	-17.35	-6.44
survived	0.7	2.9	0.3	0.80	-4.93	6.41
age	-0.2	0.1	-2.2	0.03	-0.37	-0.02
(Intercept)	101.3	5.2	19.5	0.00	91.07	111.45

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Titanic.md

Titanic.md

Modeling Fundamentals - The Titanic

References

Setup

Exploratory Analysis

Imputation

Logistic Regression Model

Linear Regression Model

Files

Titanic.md

Latest commit

History

Titanic.md

File metadata and controls

Modeling Fundamentals - The Titanic

References

Setup

Exploratory Analysis

Imputation

Logistic Regression Model

Linear Regression Model