###########################
# #
# Team Assignment 7 #
# #
###########################
# Group 7 | Kennan Grant, Elizabeth Homan, Adrian Mead, Gregory Wert
#################################################################################
## Please submit one set of answers per team. ##
## Your answers should be submitted as a .csv file per the instructions below. ##
## You should also submit your annotated R code per the instructions below. ##
#################################################################################
library(tidyverse)
library(GGally)
library(car)
library(calibrate)
library(leaps)
library(Metrics)
library(boot)
# For this team assignment you will use the file "teamassign07train.csv" to develop
# a linear model using whatever methods you consider appropriate. You will then use
# the model that you have developed to predict the values of the response variable
# corresponding to the explanatory variable values given in the file
# "teamassign07test.csv".
test <- read_csv("teamassign07test.csv")
train <- read_csv("teamassign07train.csv")
# Separate the training set into training and validation components
set.seed(2)
sample <- sample_n(train, 200)
valid <- setdiff(train, sample)
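# Quick sanity check on the split sizes (an added check, not in the original
# script): dplyr's setdiff() also removes duplicate rows, so valid can be
# slightly smaller than nrow(train) - 200 if train contains duplicates.
nrow(sample)  # expect 200
nrow(valid)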
# Look at all of the predictors plotted against one another
ggpairs(train)
# There are clearly some interesting relationships in the data. Most of the variables
# are not normally distributed, and x7 appears to have a fairly strong relationship with y.
# x4 and x1 also look strongly negatively correlated.
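# A quick numeric check of the x1/x4 relationship noted above (an added line;
# cor() simply quantifies what ggpairs shows visually).
cor(train$x1, train$x4)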
#########################
## Multicollinearity ##
#########################
# Before looking at the results, we set a threshold: a VIF > 100 will indicate severe multicollinearity
lm.model <- glm(y ~ ., data = train)
vif(lm.model)
# OUTPUT:
# x1 x2 x3 x4 x5 x6 x7
# 130.530613 12.639190 56.282357 69.466889 1.008242 1.090237 1.865263
# x1 has by far the largest VIF and exceeds our cutoff, so drop it and refit to see the effect.
lm.model <- glm(y ~ . - x1, data = train)
vif(lm.model)
# OUTPUT:
# x2 x3 x4 x5 x6 x7
# 7.107647 31.365972 30.441323 1.006367 1.090236 1.796884
# Multicollinearity improves substantially without x1.
# Normally we would consider dropping the column, but since we are interested primarily
# in predictive power (multicollinearity inflates coefficient variances, not the
# predictions themselves), we keep x1 in the model.
lm.model <- glm(y ~ ., data = train)
#####################################
## Outliers / Influential Points ##
#####################################
summary(lm.model)
ti <- rstudent(lm.model)
## Normal probability plot
qqnorm(ti)
qqline(ti)
# This is not a great shape: at the extreme theoretical quantiles, the sample
# quantiles of the residuals are much larger in magnitude than expected, so the
# distribution has heavier tails than a normal.
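# To back up the visual impression, a formal normality test on the studentized
# residuals is one option (an added check, not in the original analysis).
shapiro.test(ti)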
## Residual plot vs. fitted values
yhat <- fitted(lm.model)
plot(yhat,ti)
# This is not random scatter; the variance of the residuals appears to increase
# noticeably as yhat increases.
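# A score test for non-constant error variance backs up this impression;
# car::ncvTest() requires an lm fit, so we refit the same full model with lm()
# (an added check under that assumption).
ncvTest(lm(y ~ ., data = train))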
## Residual plots vs. explanatory variables
plot(train$x1,ti)
plot(train$x2,ti)
plot(train$x3,ti)
# Very strange behavior here: there is a large gap in x3 between roughly 160 and 210
plot(train$x4,ti)
plot(train$x5,ti)
plot(train$x6,ti)
plot(train$x7,ti)
# x7 is the only predictor whose residual plot looks well behaved
## A summary of potential leverage and/or influential points
summary(influence.measures(lm.model))
# There are no obvious violations on Cook's distance or the hat values, but several
# points are flagged on DFFITS, and many have COVRATIO values below 1 (meaning those
# points reduce the precision of the coefficient estimates).
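# Flag the DFFITS issues explicitly using the common size-adjusted cutoff
# 2*sqrt(p/n) (an added sketch; the original relied on the summary above).
p.full <- length(coef(lm.model))  # number of coefficients, including intercept
n.obs <- nrow(train)
which(abs(dffits(lm.model)) > 2 * sqrt(p.full / n.obs))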
inv_pos <- as.numeric(row.names(summary(influence.measures(lm.model))))
plot(yhat,ti)
textxy(yhat[inv_pos],ti[inv_pos],inv_pos)
# All of the influential points (except one at the bottom) are the ones that
# cluster together with large residuals.
train[inv_pos,] # These rows represent points with some of the largest observed y-values.
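# Check that claim against the overall distribution of y (an added sanity check).
summary(train$y)
sort(train$y[inv_pos])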
#######################
## Model Selection ##
#######################
## Comparative model selection
bestmod <- regsubsets(y ~ ., data = sample, nbest=10)
## The 10 best models for each number of explanatory variables in the model
summary(bestmod)
best.sum <- as.data.frame(summary(bestmod)$outmat)
best.sum$p <- as.numeric(substr(rownames(best.sum),1,1))+1
## The criterion values corresponding to each model
best.sum$rss <- summary(bestmod)$rss
best.sum$adjr2 <- summary(bestmod)$adjr2
best.sum$cp <- summary(bestmod)$cp
best.sum$bic <- summary(bestmod)$bic
## Determine "best" models
best.sum[order(best.sum$rss),]
best.sum[order(best.sum$adjr2, decreasing = T),]
best.sum[order(best.sum$cp),]
best.sum[order(best.sum$bic),]
# Removing x5 is one of the most consistently recommended changes across criteria
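# Make that concrete by pulling out the single best model under each criterion
# (an added convenience using the columns built above).
best.sum[which.min(best.sum$cp), ]
best.sum[which.min(best.sum$bic), ]
best.sum[which.max(best.sum$adjr2), ]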
## Iterative model selection
## Begin by defining the models with no variables (null) and all variables (full)
h.null <- glm(y ~ 1, data = sample)
h.full <- glm(y ~ ., data = sample)
## Forward selection
step(h.null, scope=list(lower=h.null, upper=h.full), direction="forward")
## Backward selection
step(h.full, scope=list(lower=h.null, upper=h.full), direction="backward")
## Stepwise selection
step(h.null, scope=list(lower=h.null, upper=h.full), direction="both")
# All three procedures agree: removing x5 gives the best model
###############################
## Further Model Selection ##
###############################
## Iterative model selection
## Begin by defining the models with no variables (null) and all variables (full)
h.null <- glm(y ~ 1, data = sample)
h.full <- glm(y ~ . + I(x1^2) + I(x2^2) + I(x3^2) + I(x4^2) + I(x5^2) + I(x6^2) + I(x7^2), data = sample)
## Forward selection
step(h.null, scope=list(lower=h.null, upper=h.full), direction="forward")
# Gives x2, x3, x4, x6, x7, x2^2, x3^2, x7^2
## Backward selection
step(h.full, scope=list(lower=h.null, upper=h.full), direction="backward")
# Gives x1, x2, x3, x4, x5, x6, x1^2, x2^2, x3^2, x5^2, x7^2
## Stepwise selection
step(h.null, scope=list(lower=h.null, upper=h.full), direction="both")
# Gives x2, x3, x4, x6, x7, x2^2, x3^2, x7^2
########################
## Cross-Validation ##
########################
set.seed(17)
# Candidate models, fit on the full training data. Note that these fits include
# the validation rows, so the validation MSEs below are somewhat optimistic.
# 1) All predictors
lm1 <- glm(y ~ ., data = train)
# 2) All predictors except x5 (from the iterative selection above)
lm2 <- glm(y ~ . - x5, data = train)
# 3) The quadratic model chosen by forward/stepwise selection
lm3 <- glm(y ~ x2 + x3 + x4 + x6 + x7 + I(x2^2) + I(x3^2) + I(x7^2), data = train)
# 10-fold cross-validation estimates of prediction error
cv.glm(train, lm1, K = 10)$delta[1]
cv.glm(train, lm2, K = 10)$delta[1]
cv.glm(train, lm3, K = 10)$delta[1]
# Validation-set MSEs for the same three models
mse(valid$y, predict(lm1, newdata = valid))
# OUTPUT:
# 16.32582
mse(valid$y, predict(lm2, newdata = valid))
# OUTPUT:
# 16.35248
mse(valid$y, predict(lm3, newdata = valid))
# OUTPUT:
# 15.9392
# The quadratic model (lm3) has the lowest validation MSE of the three.
# Run an initial linear model on the training split using all predictors
lm1 <- lm(y ~ ., data = sample)
summary(lm1)
# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept) 248.3845    25.8347    9.61  < 2e-16 ***
# x1           -0.2864     0.0268  -10.67  < 2e-16 ***
# x2            0.1535     0.0181    8.47  6.5e-15 ***
# x3            0.0952     0.0352    2.70   0.0075 **
# x4           -0.6970     0.0803   -8.68  1.7e-15 ***
# x5            0.1557     0.1999    0.78   0.4372
# x6           15.7673     1.6903    9.33  < 2e-16 ***
# x7            0.1029     0.0220    4.67  5.6e-06 ***
#
# Residual standard error: 3.06 on 192 degrees of freedom
# Multiple R-squared: 0.89, Adjusted R-squared: 0.886
# F-statistic: 223 on 7 and 192 DF, p-value: <2e-16
# Remove the variables shown to be insignificant in the previous model one at a
# time; x5 goes first.
lm2 <- lm(y ~ x1 + x2 + x3 + x4 + x6 + x7, data = sample)
summary(lm2)
# x3 remains significant, but much more weakly than the other predictors, so try dropping it too
lm3 <- lm(y ~ x1 + x2 + x4 + x6 + x7, data = sample)
summary(lm3)
preds.lm3 <- predict(lm3, newdata = valid)
mse(valid$y, preds.lm3)  # 15.8 -- the lowest validation MSE so far
# Once you have predicted the values of the response variable for the testing set,
# you should save them to a vector called predvect and write them into a .csv file
# using the following code:
# lm3 (y ~ x1 + x2 + x4 + x6 + x7) gave the lowest validation MSE, so we use it
# for the test-set predictions.
predvect <- predict(lm3, newdata = test)
write.table(predvect, file="teamassign07preds.csv", row.names=F, col.names=F, sep=",")
# Your annotated R code should explain the reasoning behind your choices in
# model selection and should be neatly organized.
# Your grade on this team assignment will be based on how well your model predicts
# the observed values relative to the other teams.