-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfigure1a_forecastST213-222.R
314 lines (260 loc) · 12.3 KB
/
figure1a_forecastST213-222.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#nice summary of prediction and confidence intervals in R
#https://rpubs.com/aaronsc32/regression-confidence-prediction-intervals
#nice summary ofInterpreting Log Transformations in a Linear Model
#https://library.virginia.edu/data/articles/interpreting-log-transformations-in-a-linear-model
#must load library as summarize(caseCount = n()); does not run
library(dplyr)
library(ggplot2)
library(magrittr)
library(here)
library(grid)
library(forecast) #v8.23.0
#function to add the linear regression equation and r2 value
linear = function(k) {
z <- list(xx = format(as.numeric(coef(k)[1]), digits = 3),
yy = format(as.numeric(abs(coef(k)[2])), digits = 3),
r2 = format(summary(k)$adj.r.squared, digits = 4));
if (coef(k)[2] >= 0) {
eq <- substitute(italic(hat(y)) == xx + yy %.%
italic(x)*","~~italic(r)^2~"="~r2,z)
} else {
eq <- substitute(italic(hat(y)) == xx - yy %.%
italic(x)*","~~italic(r)^2~"="~r2,z)
}
as.character(as.expression(eq));
}
#read in the data
isoLoc <- read.csv(here::here('02_Manuscript/inputFiles',
'supplementalTable2Metadata-ST213andST222.csv'),
header = T, stringsAsFactors = F)
#summarize case counts by years
isoCount <- isoLoc %>%
dplyr::filter(Source != 'Environmental' & Source != 'Unknown') %>%
dplyr::select(Year, Type, State) %>% #select these two columns
na.omit %>% #remove NA
dplyr::filter(State != "Canada") %>%
dplyr::rename(year = Year) %>%
dplyr::group_by(year) %>%
dplyr::summarize(caseCount = n()) #count summary and rename column header
#data frame which is the original count data
isoCountOG <- isoCount
#remove years which are on either side of the years with zero cases (1992-1996)
isoCount <- isoCount %>%
dplyr::slice(-c(1:2))
#remove years which are on either side of the years with zero cases (1998-2000)
isoCount <- isoCount %>%
dplyr::slice(-c(2:3))
#convert year to numeric to bind future prediction rows
isoCount$year <- as.numeric(isoCount$year)
#generate years for which you want to predict the rate of case counts
new.year <- data.frame(year=c(2021:2030))
#bind data frames, and NA will be placed for future years in the case counts
isoCount <- dplyr::bind_rows(isoCount, new.year)
# #look at historical data to see if there is a relationship
# plot(caseCount ~ year, data = isoCount[1:21,])
#dealing with the years with zero cases; we will assign the calculated rate for
#the years which were removed due to zero cases. So 1992-1996 will result in a
#rate of 0.04 (i.e., n cases = 2, n years = 5; 2/5) for each year. 1998-2000
#will each have a rate of 0.667 (i.e., n cases = 2, n years = 3, 2/3).
isoCount <- rbind(isoCount, c(1992, 2/5))
isoCount <- rbind(isoCount, c(1993, 2/5))
isoCount <- rbind(isoCount, c(1994, 2/5))
isoCount <- rbind(isoCount, c(1995, 2/5))
isoCount <- rbind(isoCount, c(1996, 2/5))
isoCount <- rbind(isoCount, c(1998, 2/3))
isoCount <- rbind(isoCount, c(1999, 2/3))
isoCount <- rbind(isoCount, c(2000, 2/3))
isoCount <- isoCount[order(isoCount$year),]
#data frame which is rates of cases per year rather than counts; for plotting
isoCountRate222 <- isoCount
#take the log of casecount column
isoCount$caseCount <- log(isoCount$caseCount)
# #rename data frame to reflect that we took the log on
isoCountLog <- isoCount
# #check isoCountRate222 by plot; this is now rates of cases per year on the Y
# p1 <- ggplot(isoCountRate222[1:29,], aes(x = year, y = caseCount, group = 1)) + #add group = 1 to aes for line
# geom_line() +
# geom_point() +
# xlab("Year - in Past") + ylab("Rates of Cases per Year") +
# theme_bw() +
# theme(panel.grid.major = element_blank(),
# panel.grid.minor = element_blank()) +
# geom_hline(yintercept = 0, linetype="dashed") +
# theme(axis.text.x = element_text(angle=45, hjust=1))
#
# p1
#
# #generate a linear model to estimate slope (or regression coefficient, as year)
# #and intercept for log values
lModel <- lm(caseCount ~ year, data = isoCountLog[1:29,])
st222 <- summary(lModel)
#this tells us that the year has a significant relationship with rate of case
#count per year. By that I mean, the value of the slope is 0.14726 with a
#p-value of 1.17e-09. The intercept of the line is -294.27833
# Call:
# lm(formula = caseCount ~ year, data = isoCountLog[1:29, ])
#
# Residuals:
# Min 1Q Median 3Q Max
# -2.48981 -0.34875 0.02398 0.41904 1.37421
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -294.27833 32.65852 -9.011 1.26e-09 ***
# year 0.14726 0.01628 9.045 1.17e-09 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.7335 on 27 degrees of freedom
# Multiple R-squared: 0.7519, Adjusted R-squared: 0.7427
# F-statistic: 81.82 on 1 and 27 DF, p-value: 1.168e-09
# ###########using tigerstats for additional check on equation of the line
# library(tigerstats)
# lmGC(caseCount ~ year, data = isoCountLog)
#
# Linear Regression
#
# Correlation coefficient r = 0.8671
#
# Equation of Regression Line:
#
# caseCount = -294.2783 + 0.1473 * year
#
# Residual Standard Error: s = 0.7335
# R^2 (unadjusted): R^2 = 0.7519
# #exponentiate the predicted value for the past and present years (1992 - 2020)
# #this is to generate the fit and prediction intervals to plot
isoExpHis <- exp(predict(lModel, list(year = isoCountRate222$year),
interval = 'prediction'))
# #convert the prediction values from the regression line equation to dataframe
isoExpHis <- as.data.frame(isoExpHis)
# #combine with timeValues (aka years) for plotting
isoExpHisTime222 <- cbind(isoCountRate222$year, isoExpHis)
# #rename columns to year and casecount
isoExpHisTime222 <- isoExpHisTime222 %>%
rename(year = `isoCountRate222$year`) %>%
rename(caseCount = fit)
# #estimated case counts with intervals for next ten years
# # futTime fit lwr upr
# 30 2021 24.7787912 5.47207831 112.203893
# 31 2022 28.6464148 6.26302424 131.025692
# 32 2023 33.1177206 7.16417340 153.092807
# 33 2024 38.2869349 8.19036862 178.977217
# 34 2025 44.2629913 9.35839666 209.353426
# 35 2026 51.1718269 10.68723763 245.017090
# 36 2027 59.1590353 12.19834612 286.907046
# 37 2028 68.3929355 13.91596777 336.131393
# 38 2029 79.0681188 15.86749591 393.998362
# 39 2030 91.4095492 18.08387319 462.052880
# #specify different height and width to include the increased margin
# pdf(here::here('02_Figures/',
# "currentAndPredictedWTimelineST222.pdf"),
# width = 9, height = 6)
#
# pdf(here::here('02_Figures',
# "currentAndPredictedWTimelineST222TESt.pdf"),
# width = 8, height = 5)
# #big plot for st222
pBig222 <- ggplot(isoCountRate222[1:29,], aes(x = year, y = caseCount,
group = 1))+ #add group = 1 to aes for line
geom_line(lwd = 1)+
geom_point(size = 3)+
theme(text=element_text(family="sans"))+
xlab("") + ylab("Frequency of ST222/213")+
theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_hline(yintercept = 0, linetype="dashed")+
theme(axis.text.x = element_text(size = 12, angle=45, hjust=1, vjust = 1))+
theme(axis.text.y = element_text(size = 12)) +
theme(axis.title.y = element_text(colour = "black", size = 16, face="bold"))+
scale_x_continuous(breaks = seq(min(isoExpHisTime222$year),
max(isoExpHisTime222$year), by = 2))+
geom_line(data = isoExpHisTime222[1:29,], aes(year, caseCount),
col = 'blue', linetype = 'dashed', lwd = 2)+
geom_ribbon(data = isoExpHisTime222[1:29,], aes(ymin = lwr, ymax = upr),
alpha = 0.2)+
geom_line(data = isoExpHisTime222[30:39,], aes(year, caseCount),
col = 'red', linetype = 'dashed', lwd = 2)+
geom_ribbon(data = isoExpHisTime222[30:39,], aes(ymin = lwr, ymax = upr),
alpha = 0.2)
#add linear equation
pBig222 <- pBig222 + annotate("text", x = 1995.5, y = 570,
label = linear(lm(caseCount ~ year,
data = isoCountLog[1:29,])),
colour="black", size = 3.5, parse=TRUE)+
theme(text=element_text(family="sans"))
#add time line information and dotted line
gline = linesGrob(gp = gpar(col = "black", lty = "dotted" ))
first213 <- textGrob("ST213 - 1992\n OH, USA",
gp=gpar(fontsize = 8, col = 'black'))
first222 <- textGrob("ST222 - 1998\n PA, USA",
gp=gpar(fontsize = 8, col = 'black'))
firstEU <- textGrob("ST222 - 2010\n Germany, EU",
gp=gpar(fontsize = 8, col = 'black'))
xTitle222 <- textGrob("Past and Future Frequency from 1992 to 2030",
gp=gpar(fontsize = 16, col = 'black', fontfamily = "sans",
fontface = "bold"))
#add inset with zoom in of st2222
pInset222 <- ggplot(isoCountRate222[1:29,],
aes(x = year, y = caseCount,
group = 1))+ #add group = 1 to aes for line
geom_line()+
geom_point()+
theme(text=element_text(family="sans"))+
xlab("") + ylab("")+
theme_bw()+
ylim(0,40)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_hline(yintercept = 0, linetype="dashed")+
theme(axis.text.x = element_text(angle=45, hjust=1))+
geom_line(data = isoExpHisTime222[1:29,], aes(year, caseCount),
col = 'blue', linetype = 'dashed', lwd = 1) +
theme(axis.text.x = element_text(size = 8, angle=45, hjust=1, vjust = 1))+
theme(axis.text.y = element_text(size = 8))
#combine big plot with inset
pBig222 <- pBig222 + ggplot2::annotation_custom(ggplotGrob(pInset222),
xmin = 1990, xmax = 2000,
ymin = 125, ymax = 535)
#increase inner plot margin and add text annotation of the time line
pBig222 <- pBig222 +
theme(plot.margin = unit(c(1, 1, 0, 1), "cm")) +
annotation_custom(first213, xmin=1992, xmax=1992, ymin = 90, ymax = 90)+
annotation_custom(gline, xmin=1992, xmax=1992, ymin= 20, ymax= 50)+
annotation_custom(first222, xmin=1998, xmax=1998, ymin = 90, ymax = 90)+
annotation_custom(gline, xmin=1998, xmax=1998, ymin= 20, ymax= 50) +
annotation_custom(firstEU, xmin=2010, xmax=2010, ymin = 90, ymax = 90)+
annotation_custom(gline, xmin=2010, xmax=2010, ymin= 20, ymax= 50)+
annotation_custom(xTitle222,xmin=2010, xmax=2010, ymin = 675, ymax = 675)
pBig222 <- pBig222 + coord_cartesian(clip="off")
# # # #don't forget to dev off
# # # dev.off()
# ##check residuals using the function checkresiduals from forecast package
# #add years with no isolates collected for time series analysis
isoCountCR <- isoCount %>%
dplyr::mutate(isoCount[,2] + 1)
#convert to time series data first
timeSeriesCount <- ts(isoCountCR$caseCount, start = 1992, end = 2020)
tseries::adf.test(timeSeriesCount)
# Augmented Dickey-Fuller Test
#
# data: tsCountLogDiff
# Dickey-Fuller = -2.3064, Lag order = 3, p-value = 0.4552
# alternative hypothesis: stationary
# Because p is greater than 0.05, we cannot reject the null hypothesis
# indicating that our data is non-stationary. Note that we take the log and
# difference to data to do linear model anlaysis
isoTime <- forecast::tslm(timeSeriesCount ~ time(timeSeriesCount))
fit <- forecast::tslm(timeSeriesCount ~ trend )
forecast::checkresiduals(fit) ## supplemental figure 1
ggplot2::ggsave(here::here("02_Manuscript/outputFigures", 'supplementalFigure1CheckresidualsST222.pdf'),
pBig222, device = 'pdf', width = 45, height = 30, units= 'cm')
#Breusch-Godfrey test for serial correlation of order up to 6
#
#data: Residuals from Linear regression model
#LM test = 3.8053, df = 6, p-value = 0.703
## Additional checking
fitFC <- forecast::forecast(fit, h=10)
fitFC$mean <- exp(fitFC$mean)
fitFC$upper <- exp(fitFC$upper)
fitFC$lower <- exp(fitFC$lower)
plot(fitFC)