-
Notifications
You must be signed in to change notification settings - Fork 1
/
lesson21_Rcode.R
203 lines (162 loc) · 5.96 KB
/
lesson21_Rcode.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# ==================================
# N736 Lesson 21 - dependent/paired data
#
# dated 11/8/2017
# Melinda Higgins, PhD
#
# ==================================
# ==================================
# we're be working with the
# helpmkh dataset
# ==================================
library(tidyverse)
library(haven)
helpdat <- haven::read_spss("helpmkh.sav")
# ============================================.
# For this lesson we'll use the helpmkh dataset
#
# In the HELP dataset there are 5 time points
# baseline and 4 follow-up time points
# at 6m, 12m, 18m and 24m
#
# for today's lesson we will be working with the PCS
# physical component score for the SF36 quality of life tool
# let's look at how these 5 PCS measurements are
# correlated across time
#
# and we'll also look at the treat group.
# ============================================.
h1 <- helpdat %>%
select(treat, pcs, pcs1, pcs2, pcs3, pcs4)
# ============================================.
# let's look at the correlations between
# these variables
# ============================================;
# look at the correlation matrix between
# the 5 pcs measurements over time
library(psych)
psych::corr.test(h1[,2:6], method="pearson")
# notice the varying sample sizes
# across these paired tests
# this is due to attrition over time
# =================================================.
# notice that most of these correlations have r>0.4 indicating
# moderate to large correlation across time
# this makes sense since particpants scores probably
# do not change a lot every 6 months and will tend to be
# similar to each other WITHIN each particpant
# more so than pcs scores BETWEEN participants
# =================================================.
# let's look at the first 2 time points and run a PAIRED t-test
# to see if the scores are significantly changing across time
# WITHIN individuals.
t.test(h1$pcs, h1$pcs1, paired=TRUE)
# another way to approach this is to compute
# the change scores and compare the difference
# scores to 0.
h1 <- h1 %>%
mutate(diff_pcs_bl_1=pcs - pcs1)
t.test(h1$diff_pcs_bl_1, mu=0)
# when we run a paired t-test, one of the assumptions
# is that the difference or change scores have a normal
# distribution - not the original scores but the difference scores
# these are good here.
qqnorm(h1$diff_pcs_bl_1)
# we can also run a paired t-test using RM-ANOVA
# repeated measures ANOVA
# compare this F-test wth the
# t-test from the paired t-test
# for 2 groups when df=1
# a t(df=1)^2 = F-test
# to do these analyses in R, we first
# have to reshape the data from WIDE
# to long
# add rowid to h1
h1 <- h1 %>%
mutate(rowid=as.numeric(rownames(h1)))
h1long <- h1 %>%
gather(key=item,
value=value,
-c(treat,diff_pcs_bl_1,rowid))
# add a time variable to long format
h1long <- h1long %>%
mutate(time=c(rep(0,453),
rep(1,453),
rep(2,453),
rep(3,453),
rep(4,453)))
h1long_bl1 <- h1long %>%
filter(time<2) %>%
select(rowid,value,time,treat)
library(car)
rm1 <- aov(value~factor(time)+Error(factor(rowid)),
data = h1long_bl1)
summary(rm1)
# compare the 2 changes from BL to 6m
# for pcs and pcs1 between the 2 treat groups
bartlett.test(h1$diff_pcs_bl_1~h1$treat)
t.test(h1$diff_pcs_bl_1~h1$treat, var.equal=TRUE)
# now let's run a RM-ANOVA
# for the changes from BL to 6m
# BETWEEN the 2 treat groups
# compare the time*treat effect to
# the t-test above for the difference scores
# look at the 2nd part of the table
rm2 <- aov(value~(factor(time)*factor(treat))+
Error(factor(rowid)/(treat)),
data = h1long_bl1)
summary(rm2)
# we can make a plot of
# pcs and pcs1 scores by group
# to get an idea of trend across time
# but this plot is cross sectional not paired
ggplot(h1long_bl1, aes(x=factor(time), y=value)) +
geom_boxplot() +
stat_summary(fun.y=mean, geom="point", shape=5, size=4) +
xlab("Time: Baseline (0) and 6m (1)") +
ylab("PCs Scores") +
facet_wrap(~treat) +
ggtitle("Usual Care = 0; HELP Clinic = 1")
# from the cookbook for R
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## groupvars: a vector containing names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
h1long_bl1_nomiss <- h1long_bl1[!complete.cases(h1long_bl1),]
h1se <- summarySE(h1long_bl1,
measurevar="value", groupvars=c("time","treat"))
ggplot(h1se, aes(x=time, y=value, colour=treat)) +
geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.1) +
geom_line() +
geom_point()