-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathValidation_stats.R
252 lines (222 loc) · 10 KB
/
Validation_stats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# Purpose: general function to compute validation statistics, plot results (and convergence for NextGP).
#
# Author: Renzo Bonifazi
# Date: 07-08-2023
# TO DO: add documentation
############################################################
# 1) Load packages ---------------
if (!require("pacman", character.only = T)) {
stop('Please install the pacman package, using install.packages("pacman"),
or change the way packages are loaded in this Rscript', call. = F)
}
pacman::p_load(boot, parallel, ggplot2, patchwork, dplyr, tidylog)
# 2) Define functions ------------
# function to return prediction results (including bootstrapping for SE)
compute_prediction_results <- function(
data_x_axis,
data_y_axis,
VG_label, # VG_label for result (e.g. "validation" and "training")
bootstrap = T,
boot_samples = 10000L,
parallel = "no", # provide "multicore" (on Windows) or "snow" (on Linux)
ncpus = 1L) {
# cbind dataset
data <- as.data.frame(cbind(data_x_axis, data_y_axis))
# define core function for bootstrapping
core_calcs <- function(data, indices, ...) {
stats <- list() # list to store results
sampled_data <- data[indices, ] # subset data based on sampled_ID
# correlation
stats$correlation <- cor(x = sampled_data[,1], y = sampled_data[,2])
# slope and intercept
reg <- lm(sampled_data[,2] ~ sampled_data[,1], data = sampled_data)
stats$slope <- as.numeric(coef(reg)[2])
stats$intercept <- as.numeric(coef(reg)[1])
# mean difference as mean_x - mean_y
stats$mean_diff <- mean(sampled_data[,1]) - mean(sampled_data[,2])
# stats$size_samples <- nrow(sampled_data) # if you want to check size of sample used = nrow(data analysed)
return(unlist(stats))
}
# without bootstrapping
if (bootstrap == F) {
results <- as.data.frame(core_calcs(data = data))
colnames(results) <- VG_label # add a VG_label for Validation Group
# with bootstrapping
} else if (bootstrap == T) {
# print clustering info
cat(
"\nRunning bootsrapping, using:",
boot_samples, " samples.\n",
"Other options are:\n parallel = ", parallel, "\n",
"n. of CPUs:", ncpus, "\n"
)
# make a cluster and export in each node needed objects
cl <- makeCluster(ncpus)
clusterExport(
cl = cl,
varlist = c("data"),
envir = environment(core_calcs)
)
# run bootstrap
b <- boot(
data = data,
# VG_label = VG_label,
statistic = core_calcs,
R = boot_samples,
parallel = parallel,
ncpus = ncpus,
cl = cl # use the created cluster
)
# close cluster
stopCluster(cl)
# convert to data.frame and compute SE
results <- data.frame(
value = b$t0,
SE = apply(b$t, MARGIN = 2, sd)
)
# append n. samples to results
results <- rbind(results, data.frame(value = boot_samples, SE = NA, row.names = "boot_samples"))
colnames(results) <- paste0(VG_label, "_", colnames(results))
}
return(results)
}
# function to make scatterplots predictions
plot_GP <- function(data, xcol, ycol, title, subtitle, slope, intercept ) {
# [NOT WORKING ] SE_slope, SE_intercept) {
p <- ggplot(data, aes(x = !!sym(xcol), y = !!sym(ycol), color = Bron2)) +
geom_point(aes(color = Bron2, shape = Bron2))+
geom_abline(intercept = intercept, slope = slope, color = "blue")+
geom_abline(slope = 1, color = "gray") +
# [NOT WORKING ] # Add shaded regions for SE of slope and intercept
# geom_ribbon(aes(ymin = intercept - SE_intercept, ymax = intercept + SE_intercept),
# fill = "lightblue", alpha = 0.5) +
# geom_ribbon(aes(ymin = slope - SE_slope, ymax = slope + SE_slope),
# fill = "lightgreen", alpha = 0.5) +
# [geom_smooth can be used I think that not the same SE as in boostrapping are visualized here]
# geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
theme_bw() +
labs(
x = element_text(paste0("GEBV_", xcol)), y = ycol,
title = title,
subtitle = subtitle
) +
theme(aspect.ratio = 1)
return(p)
}
# function to plot convergence of parameters
plot_conv <- function(file, x_axis) {
data_p <- read.table(paste0("output/", file), header = T) # read file
data_p <- cbind(x_axis_info, data_p) # attach iterations for x-axis plotting
p <- data_p %>%
ggplot(aes(x= x_axis_info, y= .data[[colnames(data_p)[2]]]) )+
geom_line()+
xlab("iterations")+
theme_bw()+
theme(axis.text.x=element_text(angle=+90))+
labs(
title = file
)
return(p)
}
# 2) read data -------------------------------------
# ___2.1) read predictions
# predictions on training
training_pred <- read.table("training_predictions.csv", sep = ",", header= T)
head(training_pred)
# predictions on validation
validation_pred <- read.table("validation_predictions.csv", sep = ",", header= T)
head(validation_pred)
# ___2.2) read and add info on DRP
training_set <- read.table("training_set.txt", header = T) %>% rename(mean_DRP_orig = mean_DRP)
validation_set <- read.table("validation_set.txt", header = T) %>% rename(mean_DRP_orig = mean_DRP)
# merge DRP info with predictions
validation_pred_ext <- cbind(validation_pred, validation_set)
training_pred_ext <- cbind(training_pred, training_set)
# Implement an error check if mean_DRP in input files do not match against the Julia mean_DRP output files
if( !(all(validation_pred_ext$mean_DRP == validation_pred_ext$mean_DRP_orig)==T &&
all(training_pred_ext$mean_DRP == training_pred_ext$mean_DRP_orig)==T )
) {stop("ERROR: order of mean_DRP in input files and output files do not match: there is likely a mistmatch or re-ordering!")}
# 3) compute validation metrics -----------------
results_list <- list() # list to store results
for(col_to_validate in colnames(validation_pred)) {
cat("Now doing:", col_to_validate, "\n")
if(col_to_validate != "mean_DRP") {
results <- cbind(
# validation group
compute_prediction_results(
data_x_axis = validation_pred_ext[col_to_validate],
data_y_axis = validation_pred_ext$mean_DRP,
VG_label = paste0("validation_", col_to_validate),
bootstrap = T, boot_samples = 10000, parallel = "snow", ncpus = 10),
# training group
compute_prediction_results(
data_x_axis = training_pred_ext[col_to_validate],
data_y_axis = training_pred_ext$mean_DRP,
VG_label = paste0("training_", col_to_validate),
bootstrap = T, boot_samples = 10000, parallel = "snow", ncpus = 10)
)
results_list[[col_to_validate]] <- results
rm(results)
}
}
cat("\n\n Prediction results: \n\n"); names(results_list)
lapply(results_list, head) %>% as.data.frame
# write results to disk
write.csv(x = lapply(results_list, head) %>% as.data.frame,
file = "prediction_results.txt", quote = F)
# 4) Make plots ---------------------------------
# __4.1) make training and validation plots -----
pdf("prediction_plots.pdf", height = 9, width = 14)
for(col_to_validate in colnames(validation_pred)) {
cat("Now plotting:", col_to_validate, "\n")
if(col_to_validate != "mean_DRP") {
# validation
p_val <- plot_GP(data = validation_pred_ext, title="validation set",
subtitle=col_to_validate,
xcol = col_to_validate,
ycol = "mean_DRP",
slope = results_list[[col_to_validate]]["slope", paste0("validation_", col_to_validate, "_value")],
intercept = results_list[[col_to_validate]]["intercept", paste0("validation_", col_to_validate, "_value")]
)
# SE_slope = results["slope", "validation_SE"], SE_intercept = results["intercept", "validation_SE"])
# training
p_tra <- plot_GP(data = training_pred_ext, title="training set",
subtitle=col_to_validate,
xcol = col_to_validate,
ycol = "mean_DRP",
slope = results_list[[col_to_validate]]["slope", paste0("training_", col_to_validate, "_value")],
intercept = results_list[[col_to_validate]]["intercept", paste0("training_", col_to_validate, "_value")]
)
p_comb <- p_val + p_tra
print(p_comb)
}
}
dev.off()
# __4.2) make convergence plots
# __4.2.1) get info on runLMEM
# Define the grep command to match lines starting with "runLMEM" and not starting with "#"
grep_command <- "grep '^runLMEM' *.jl | grep -v '^#'"
# Use system() to execute the grep command and capture the output
julia_settings <- system(grep_command, intern = TRUE)
julia_numbers <- as.numeric(unlist(
strsplit(
# Extract runLMEM numbers from the output using regular expressions
gsub(".*runLMEM\\(f,pheno_training,([0-9]+),([0-9]+),([0-9]+).*",
"\\1 \\2 \\3",
julia_settings),
" ") # Split the extracted numbers into a numeric vector
))
x_axis_info <- seq(from=julia_numbers[2]+julia_numbers[3], to=julia_numbers[1], by=julia_numbers[3])
# __4.2.2) read all files to plot (all that start with b and var, but exclude betaM - too many effects)
files_to_plot <- list.files("output/") %>%
as.data.frame %>%
filter(grepl("b|var|pi", .)) %>%
filter(!grepl("^betaM", .)) # exclude betaM (too many, i.e. = # SNPs)
# __4.2.3) write output in a pdf, each page is a parameter
pdf("convergence.pdf")
for(f in 1:nrow(files_to_plot)) {
file = files_to_plot[f, ]
p <- plot_conv(file = file, x_axis = x_axis_info)
print(p)
}
dev.off()