Modeling Fundamentals - The Titanic
Jesse Cambon
01 August, 2021
This page is meant to serve as a primer for modeling in R. A linear and
logistic regression are demonstrated on the Titanic dataset to model
fare prices and survival rates using gender, age, and passenger class as
covariates. Data imputation is performed on age in order to avoid
biasing results. The broom package is used to display the results of the
models and graphical methods are used throughout to explore the dataset
and evaluate the fit of each model.
library(PASWR ) # titanic3 dataset
library(MASS ) # confint for glm
library(wesanderson ) # color palettes
library(formattable ) # percent format
library(caret ) # regression utilities
# library(Hmisc) # capitalize function
library(broom ) # model display capabilities
# library(xtable) # pretty table
library(kableExtra )
library(tidyverse )
library(skimr ) # summary stats
library(knitr )
# library(rms)
library(mice ) # imputation
titanic <- titanic3 %> %
as_tibble() %> %
mutate(sex = str_to_title(sex ))
titanic_summ <- titanic %> %
count(survived , pclass , sex ) %> %
group_by(pclass , sex ) %> %
mutate(
perc_surv_num = n / sum(n ),
perc_surv_char = as.character(percent(n / sum(n ), 0 ))
) %> %
ungroup() %> %
mutate(survived = factor (survived , labels = c(" Died" , " Survived" )))
# Set default ggplot theme
theme_set(theme_bw() +
theme(
legend.position = " top" ,
plot.subtitle = element_text(face = " bold" , hjust = 0.5 ),
plot.title = element_text(lineheight = 1 , face = " bold" , hjust = 0.5 )
))
Data summary
Name
titanic
Number of rows
1309
Number of columns
14
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
Column type frequency:
character
1
factor
7
numeric
6
\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_
Group variables
None
Variable type: character
skim\_variable
n\_missing
complete\_rate
min
max
empty
n\_unique
whitespace
sex
0
1
4
6
0
2
0
Variable type: factor
skim\_variable
n\_missing
complete\_rate
ordered
n\_unique
top\_counts
pclass
0
1
FALSE
3
3rd: 709, 1st: 323, 2nd: 277
name
0
1
FALSE
1307
Con: 2, Kel: 2, Abb: 1, Abb: 1
ticket
0
1
FALSE
929
CA.: 11, 160: 8, CA : 8, 310: 7
cabin
0
1
FALSE
187
emp: 1014, B57: 5, C23: 5, G6: 5
embarked
0
1
FALSE
4
Sou: 914, Che: 270, Que: 123, emp: 2
boat
0
1
FALSE
28
emp: 823, 13: 39, C: 38, 15: 37
home.dest
0
1
FALSE
369
emp: 564, New: 64, Lon: 14, Mon: 10
Variable type: numeric
skim\_variable
n\_missing
complete\_rate
mean
sd
p0
p25
p50
p75
p100
hist
survived
0
1.00
0.38
0.49
0.00
0.0
0.00
1.00
1.00
▇▁▁▁▅
age
263
0.80
29.88
14.41
0.17
21.0
28.00
39.00
80.00
▂▇▅▂▁
sibsp
0
1.00
0.50
1.04
0.00
0.0
0.00
1.00
8.00
▇▁▁▁▁
parch
0
1.00
0.39
0.87
0.00
0.0
0.00
0.00
9.00
▇▁▁▁▁
fare
1
1.00
33.30
51.76
0.00
7.9
14.45
31.27
512.33
▇▁▁▁▁
body
1188
0.09
160.81
97.70
1.00
72.0
155.00
256.00
328.00
▇▇▇▅▇
gender_summ <- titanic %> % count(sex )
pclass_summ <- titanic %> % count(pclass )
surv_summ <- titanic %> % count(survived )
sex
n
Female
466
Male
843
pclass
n
1st
323
2nd
277
3rd
709
titanic_summ_dot <- titanic_summ %> %
filter(survived == " Survived" ) %> %
mutate(group = str_c(sex , " -" , pclass ))
# Analyze where age is missing
titanic_miss <- titanic %> %
select(pclass , sex , age , survived ) %> %
mutate(missing_age = is.na(age )) %> %
group_by(pclass , sex ) %> %
dplyr :: summarize(perc_missing = mean(missing_age )) %> %
mutate(group = str_c(sex , " -" , pclass ))
## `summarise()` has grouped output by 'pclass'. You can override using the `.groups` argument.
ggplot(titanic_summ_dot , aes(x = reorder(group , - perc_surv_num ), y = perc_surv_num )) +
geom_point(size = 3 , color = " black" ) + # Draw points
theme(
legend.position = " none" ,
panel.grid = element_blank()
) +
scale_y_continuous(expand = c(0 , 0 , 0.015 , 0 ), labels = scales :: percent ) +
geom_segment(aes(
x = group ,
xend = group ,
y = min(0 ),
yend = max(1 )
),
linetype = " dashed" , color = " black" ,
size = 0.1
) + # Draw dashed lines
labs(title = " Titanic Passenger Survival Rates" ) +
coord_flip() +
xlab(" Group" ) +
ylab(" Survival Rate" )
ggplot(
data = titanic_summ ,
aes(x = fct_rev(pclass ), y = perc_surv_num , fill = survived )
) +
facet_grid(~ factor (sex )) +
geom_bar(stat = " identity" , color = " black" ) +
coord_flip() +
geom_text(
data = titanic_summ , aes(label = ifelse(perc_surv_num > 0.07 , perc_surv_char , NA )),
size = 3 , position = position_stack(vjust = 0.5 )
) +
scale_fill_manual(values = wes_palette(" FantasticFox1" )[c(3 , 4 )]) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid = element_blank()
) +
labs(title = " Passenger Survival Rates by Gender and Class" ) +
xlab(" Passenger Class" ) +
ylab(" " ) +
guides(fill = guide_legend(title = " " , reverse = T )) # reverse legend order
ggplot(titanic_miss , aes(x = reorder(group , - perc_missing ), y = perc_missing )) +
geom_point(size = 3 , color = " black" ) + # Draw points
theme(
legend.position = " none" ,
panel.grid = element_blank()
) +
scale_y_continuous(labels = scales :: percent ) +
geom_segment(aes(
x = group ,
xend = group ,
y = min(perc_missing ),
yend = max(perc_missing )
),
linetype = " dashed" , color = " black" ,
size = 0.1
) + # Draw dashed lines
labs(title = " Which Passengers Have Missing Age" ) +
coord_flip() +
xlab(" Group" ) +
ylab(" Percent Missing Age" )
Discarding rows with missing age has the potential to bias our data so
we will impute.
https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
https://www.andrew.cmu.edu/user/aurorat/MIA_r.html
http://www.gerkovink.com/miceVignettes/Convergence_pooling/Convergence_and_pooling.html
# Use mice to impute data - takes a few seconds on my laptop
# Increasing maxit from the default gives better results
# Making sure to not sure the outcome variable to impute.
titanic_imputed <- mice(titanic %> % select(sex , pclass , age ), method = " pmm" , maxit = 80 , seed = 3530 , printFlag = F )
## Warning: Number of logged events: 1
# Add imputed Data
titanic_imp <- complete(titanic_imputed , 5 ) %> %
bind_cols(titanic %> % select(survived , age , fare ) %> % rename(age_orig = age )) %> %
mutate(imputed = case_when(is.na(age_orig ) ~ " Imputed" , TRUE ~ " Original" ))
# Plot
ggplot(data = titanic_imp ) +
geom_density(aes(x = age , color = imputed ), alpha = 0.8 ) +
# facet_grid(vars(sex),vars(pclass)) +
theme(legend.position = " top" ) +
labs(title = " Distributions of Original v. Imputed Age" ) +
guides(color = guide_legend(title = " " ))
ggplot(data = titanic_imp ) +
geom_density(aes(x = age , color = imputed ), alpha = 0.8 ) +
facet_grid(sex ~ pclass ) +
theme(legend.position = " top" ) +
labs(title = " Distributions of Original v. Imputed Age" ) +
guides(color = guide_legend(title = " " ))
Logistic Regression Model
We will use the brier score as one measurement of accuracy for our
model: https://en.wikipedia.org/wiki/Brier_score The book
‘Superforecasting’ by Philip Tetlock has a good discussion of the use of
brier scores.
log_fit <- glm(survived ~ sex + pclass + age , family = binomial(link = " logit" ), data = titanic_imp )
predictions <- titanic_imp %> %
select(sex , pclass , age , survived ) %> %
mutate(prediction = predict(log_fit , newdata = titanic_imp , type = " response" )) %> %
mutate(
prediction_binary = case_when(prediction > 0.5 ~ 1 , TRUE ~ 0 ),
brier_score = (prediction - survived )^ 2
)
# summary(fit)
log_confint <- augment(log_fit , se_fit = TRUE , type.predict = ' response' )
# colnames(log_confint) <- c("Term", "LCLM", "UCLM")
log_info <- glance(log_fit ) %> %
mutate(meanBrierScore = mean(predictions $ brier_score , na.rm = T )) %> %
dplyr :: select(meanBrierScore , everything())
log_terms <- tidy(log_fit , conf.int = TRUE ) %> %
rename(Coefficient = estimate , Term = term ) %> %
# Order by largest coefficient but put intercept term on bottom
# left_join(log_confint, by = "Term") %>%
mutate(OR = exp(Coefficient )) %> %
select(Term , Coefficient , OR , everything()) %> %
arrange(Term == " (Intercept)" , desc(abs(Coefficient )))
# An analysis of our model's classification accuracy
confusionMatrix(factor (predictions $ prediction_binary ), factor (predictions $ survived ))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 686 159
## 1 123 341
##
## Accuracy : 0.7846
## 95% CI : (0.7613, 0.8066)
## No Information Rate : 0.618
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5373
##
## Mcnemar's Test P-Value : 0.03714
##
## Sensitivity : 0.8480
## Specificity : 0.6820
## Pos Pred Value : 0.8118
## Neg Pred Value : 0.7349
## Prevalence : 0.6180
## Detection Rate : 0.5241
## Detection Prevalence : 0.6455
## Balanced Accuracy : 0.7650
##
## 'Positive' Class : 0
##
ggplot(
data = predictions ,
aes(x = age , y = prediction , color = pclass )
) +
geom_point() +
facet_grid(~ sex ) +
scale_y_continuous(labels = scales :: percent ) +
theme(legend.margin = margin(0 , 0 , 0 , 0 )) +
scale_color_manual(values = wes_palette(" Moonrise3" )) +
labs(title = " Probability of Survival - Logistic Regression" ) +
xlab(" Age" ) +
ylab(" Survival Probability" ) +
guides(color = guide_legend(title = " Passenger Class" , reverse = F , override.aes = list (size = 2.5 )))
ggplot(predictions , aes(prediction )) +
geom_histogram(
binwidth = 0.02 , aes(fill = factor (survived , labels = c(" Died" , " Survived" ))),
col = " black"
) +
# prevent right label on axis from being clipped
theme(legend.pos = " top" , plot.margin = margin(r = 20 , unit = " pt" )) +
scale_fill_manual(values = wes_palette(" Moonrise3" )) +
scale_x_continuous(
labels = scales :: percent ,
limits = c(0 , 1 ),
expand = c(0 , 0 , 0 , 0 )
) + # eliminate left/right margin
scale_y_continuous(expand = expansion(mult = c(0 , .1 ))) +
labs(title = " Logistic Regression Probability Distribution" ) +
xlab(" Survival Probability" ) +
ylab(" Count" ) +
guides(fill = guide_legend(title = " " ))
## Warning: Removed 4 rows containing missing values (geom_bar).
# Same graph as prior but faceted on class
ggplot(predictions , aes(prediction )) +
geom_histogram(
binwidth = 0.05 , aes(fill = factor (survived , labels = c(" Died" , " Survived" ))),
col = " black"
) +
facet_wrap(~ sex , scales = " free_y" ) +
theme(
legend.pos = " top" ,
# prevent right label on axis from being clipped
plot.margin = margin(r = 20 , unit = " pt" )
) +
scale_fill_manual(values = wes_palette(" Moonrise3" )) +
scale_x_continuous(labels = scales :: percent ) +
scale_y_continuous(expand = expansion(mult = c(0 , .1 ))) +
labs(title = " Logistic Regression Probability Distribution by Gender" ) +
xlab(" Survival Probability" ) +
ylab(" Count" ) +
guides(fill = guide_legend(title = " " ))
ggplot(predictions , aes(brier_score )) +
geom_histogram(
binwidth = 0.02 , aes(fill = factor (survived , labels = c(" Died" , " Survived" ))),
col = " black"
) +
labs(title = " Brier Score Distribution" ) +
scale_fill_manual(values = wes_palette(" Moonrise3" )) +
# Use expand to make sure right axis label isn't clipped
scale_x_continuous(expand = c(0 , 0 , 0.01 , 0 ), limits = c(0 , 1 )) +
scale_y_continuous(expand = expansion(mult = c(0 , .1 ))) +
xlab(" Brier Score" ) +
ylab(" Count" ) +
guides(fill = guide_legend(title = " " ))
## Warning: Removed 4 rows containing missing values (geom_bar).
OR = Odds Ratio. LCLM and UCLM are lower and upper 95% confidence
limits.
kable(log_info %> %
select(- df.residual , - df.null , - deviance ), format = " markdown" , digits = 2 ) %> %
kable_styling(bootstrap_options = c(" striped" , " border" ))
meanBrierScore
null.deviance
logLik
AIC
BIC
nobs
0.15
1741.02
-616.16
1242.32
1268.21
1309
kable(log_terms ,format = ' markdown' ,digits = 2 ) %> %
kable_styling(bootstrap_options = c(" striped" ,' border' ))
Term
Coefficient
OR
std.error
statistic
p.value
conf.low
conf.high
sexMale
-2.49
0.08
0.15
-16.78
0
-2.78
-2.20
pclass3rd
-2.18
0.11
0.20
-10.87
0
-2.58
-1.79
pclass2nd
-1.16
0.31
0.21
-5.55
0
-1.58
-0.76
age
-0.03
0.97
0.01
-4.89
0
-0.04
-0.02
(Intercept)
3.19
24.40
0.29
11.00
0
2.64
3.78
A linear model of passenger fare cost.
lm_fit <- lm(fare ~ sex + pclass + age + survived , data = titanic_imp )
# Calculate confidence limit
lm_confint <- augment(lm_fit , se_fit = TRUE )
lm_predictions <- titanic_imp %> %
select(sex , pclass , age , survived , fare ) %> %
mutate(prediction = predict(lm_fit , newdata = titanic_imp )) %> %
mutate(residual = fare - prediction )
lm_info <- glance(lm_fit )
lm_terms <- tidy(lm_fit , conf.int = TRUE ) %> %
rename(Term = term , Coefficient = estimate ) %> %
# left_join(lm_confint, by = "Term") %>%
select(Term , Coefficient , everything()) %> %
arrange(Term == " (Intercept)" , desc(abs(Coefficient )))
# summary(lm_fit)
# Histogram of Residuals
ggplot(lm_predictions , aes(residual )) +
geom_histogram(bins = 30 ) +
facet_grid(~ pclass , scales = " free_x" ) +
geom_vline(xintercept = 0 , color = " blue" ) +
scale_x_continuous(labels = scales :: dollar , expand = c(0 , 0 , 0 , 0 )) +
scale_y_continuous(expand = c(0 , 0 , 0.07 , 0 )) +
labs(title = " Residual Distribution by Passenger Class" ) +
xlab(" Residual" ) +
ylab(" Count" )
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(
data = lm_predictions ,
aes(x = age , y = prediction , color = pclass , group = 1 )
) +
geom_point() +
facet_grid(~ factor (sex )) +
scale_y_continuous(labels = scales :: dollar ) +
# theme(legend.margin=margin(0,0,0,0)) +
scale_color_manual(values = wes_palette(" Moonrise3" )) +
labs(title = " Cost of Fare - Linear Regression" ) +
xlab(" Age" ) +
ylab(" Fare Cost" ) +
guides(color = guide_legend(title = " Passenger Class" , reverse = F , override.aes = list (size = 2.5 )))
ggplot(
data = lm_predictions ,
aes(x = prediction , y = residual , color = sex )
) +
geom_point() +
facet_grid(~ pclass , scales = " free_x" ) +
geom_hline(yintercept = 0 , color = " blue" ) + # horizontal line at 0 residual
# geom_smooth(method="lm",show.legend=F,size=0.5) +
scale_x_continuous(labels = scales :: dollar ) +
scale_y_continuous(labels = scales :: dollar ) +
# theme(legend.pos='none') +
scale_color_manual(values = wes_palette(" Moonrise3" )) +
labs(title = " Residuals vs Predictions by Passenger Class" ) +
xlab(" Prediction" ) +
ylab(" Residual" ) +
guides(color = guide_legend(title = " Gender" , reverse = F , override.aes = list (size = 2.5 )))
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(
data = lm_predictions ,
aes(x = age , y = residual , color = sex )
) +
geom_point() +
facet_grid(~ pclass ) +
geom_hline(yintercept = 0 , color = " blue" ) + # horizontal line at 0 residual
scale_y_continuous(labels = scales :: dollar ) +
theme(legend.margin = margin(0 , 0 , 0 , 0 )) +
scale_color_manual(values = wes_palette(" Moonrise3" )) +
labs(title = " Residuals By Passenger Class" ) +
xlab(" Age" ) +
ylab(" Residual" ) +
guides(color = guide_legend(title = " Gender" , reverse = F , override.aes = list (size = 2.5 )))
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(
data = lm_terms ,
aes(x = reorder(Term , - Coefficient ), y = Coefficient )
) +
geom_point() +
coord_flip() +
geom_pointrange(mapping = aes(ymin = conf.low , ymax = conf.high )) +
scale_color_manual(values = wes_palette(" Moonrise3" )) +
labs(title = " Linear Model Coefficients with Confidence Intervals" ) +
xlab(" Term" )
kable((lm_info %> % dplyr :: select(- df.residual , - logLik , - deviance )), format = " markdown" , digits = 2 ) %> %
kable_styling(bootstrap_options = c(" striped" , " border" ))
r.squared
adj.r.squared
sigma
statistic
p.value
df
AIC
BIC
nobs
0.38
0.38
40.83
159.63
0
5
13423.87
13460.11
1308
kable(lm_terms , format = " markdown" , digits = c(1 , 1 , 1 , 1 , 2 , 2 , 2 )) %> %
kable_styling(bootstrap_options = c(" striped" , " border" ))
Term
Coefficient
std.error
statistic
p.value
conf.low
conf.high
pclass3rd
-75.2
3.2
-23.2
0.00
-81.50
-68.80
pclass2nd
-67.2
3.5
-19.2
0.00
-74.05
-60.35
sexMale
-11.9
2.8
-4.3
0.00
-17.35
-6.44
survived
0.7
2.9
0.3
0.80
-4.93
6.41
age
-0.2
0.1
-2.2
0.03
-0.37
-0.02
(Intercept)
101.3
5.2
19.5
0.00
91.07
111.45