-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathchapter3.R
100 lines (78 loc) · 2.77 KB
/
chapter3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
library(faraway)
library(skimr)
library(magrittr)
library(tidyverse)
#### Chpt 3 --------------------------------------------------------------------
## * Prostate ----------------------------------
data(prostate)
summary(prostate)
m <- lm(lpsa ~ ., data = prostate)
confint(m, c("age"), .95)
confint(m, c("age"), .90)
# age is negatively correlated with lpsa with 90% conficence, but not with 95%
library(ellipse)
plot(ellipse(m, 3:4), type = "l")
points(coef(m)[3], coef(m)[4], pch = 1)
# since the origin is outside of the elipse we reject joint hypothesis age + lweight = 0
# since 0 is outside of the x dimension, we can regect lweight = 0
# we can't reject age = 0, since 0 is in the y dimension
ts <- m %>% summary() %>% coef()%>% .[4,]
nreps <- 10000
t_stats <- numeric(nreps)
for ( i in 1:nreps ) {
mod <- lm(sample(lpsa) ~ ., data = prostate)
t_stats[i] <- summary(mod)$coef[4,3]
}
mean(abs(t_stats) > abs(ts[3]))
# the larger nreps, the better the guess
summary(m)
n <- update(m, . ~ lcavol + lweight + svi)
summary(n)
anova(n, m)
# not siginificantly better model
## * Cheddar ------------------------------
data(cheddar)
str(cheddar)
m <- lm(taste ~ ., data = cheddar)
summary(m)
# Hydrogen Sulfide and Lactic Acid
log(100) # natural log
exp(log(100)) # use to revert
delog_m <- update(m, . ~ exp(Acetic) + exp(H2S) + Lactic)
summary(delog_m)
# now only Lactic Acid if signif at 5%
# f-tests are used to access variation (expected / un-expected) in a model
anova(m, delog_m)
# there is less risidual variance in the log model (m), but we don't have any
# degrees of freedom left so we can't estimate the F-statistic
(coef(m)[3] * .01) # 0.03911841 increase in taste
?cheddar
test_df <- data.frame(Acetic = rep(mean(cheddar$Acetic),2),
H2S = mean(cheddar$H2S) + c(0,.01),
Lactic = mean(cheddar$Lactic) )
predict(m, test_df) # confirmed
(coef(m)[3] + .01) / coef(delog_m)[3] # 5100% percent increase to match
exp(.01) # a 1 point increase in the unlogged scale
## * teengam ----------
?teengamb
skim(teengamb)
m <- lm(gamble ~ ., data = teengamb)
summary(m)
# sex and income signif at 5% level
# since the coefficient shows a -22.11 shift in gamble for an increase of 1 in sex
# and females are code as 1, women gamble less than men
# if you recode sex as a factor with the levels as c(female, male)
teengamb$sex %<>% fct_rev()
m <- lm(gamble ~ ., data = teengamb) # and refit
summary(m) # then the coef changes sign
m2 <- lm(gamble ~ income, data = teengamb)
summary(m2)
anova(m, m2) # larger model is significnaly better
anova(m2, m) # order of anova doesn't matter
# repeat case with gala example from booth
data(gala)
m0 <- lm(Species ~ ., gala)
m <- update(m0, . ~ . - Endemics)
m2 <- update(m, . ~ . - Area - Adjacent)
anova(m, m2)
anova(m2, m)