-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path06-correlation.Rmd
250 lines (238 loc) · 13.7 KB
/
06-correlation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
---
title: "Correlation analysis"
output: html_document
---
```{r, load-data}
# load image and table
load('processed_data/aggregating.rda')
aggregated_dt <- fread('processed_data/aggregated_dt_filtered.csv.gz', stringsAsFactors=TRUE)
# remove manual_ids, i.e., filter IDs in keep_rows
aggregated_dt <- aggregated_dt[ID %in% keep_rows]
# check if there are still manual_ids in the dt
stopifnot(!any(manual_ids %in% aggregated_dt$ID))
aggregated_dt[, `Event Type`:=factor(`Event Type`, levels=to_analyze)]
```
```{r, generate-cor-dt}
# if cor_dt exists, read it, otherwise produce it:
corr_file <- 'processed_data/correlation.csv.gz'
if (file.exists(corr_file)) {
all_cor <- fread(corr_file, stringsAsFactors = TRUE)
} else {
# try multiple correlation methods:
cor_dt <-
rbindlist(pbmclapply(setNames(cor_methods, cor_methods), function(cor_method)
# correlate all the features with PSI, first for Low and High Variability Events separately
rbind(aggregated_dt[, lapply(.SD, function(y) {
# only use full pairs
rmv <- is.na(y) | is.na(PSI)
res <- NA
# only compute correlation if there are enough events
if ((length(rmv) - sum(rmv)) >= minimum_events)
res <- suppressWarnings(stats::cor(PSI, y, use = 'pairwise.complete.obs', method = cor_method))
return(as.numeric(res))
}), by = .(`Event Type`, IHEC, Variability), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]],
# correlate all the features with PSI, now for all events
aggregated_dt[, lapply(.SD, function(y) {
rmv <- is.na(y) | is.na(PSI)
res <- NA
if ((length(rmv) - sum(rmv)) >= minimum_events)
res <-
suppressWarnings(stats::cor(PSI, y, use = 'pairwise.complete.obs', method = cor_method))
return(as.numeric(res))
}), by = .(`Event Type`, IHEC), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]][, Variability := 'All'])),
idcol = 'Correlation Coefficient')
# set some information
cor_dt[, `Correlation Coefficient` := as.factor(`Correlation Coefficient`)]
cor_dt[, `Partial Correlation` := FALSE]
# repeat but now with partial correlation
pcor_dt <-
rbindlist(pbmclapply(setNames(cor_methods, cor_methods), function(cor_method)
rbind(aggregated_dt[, lapply(.SD, function(y) {
rmv <- is.na(y) | is.na(PSI) | is.na(gene_expression)
res <- NA
if ((length(rmv) - sum(rmv)) >= minimum_events)
res <- tryCatch(
suppressWarnings(
ppcor::pcor.test(PSI[!rmv], y[!rmv], gene_expression[!rmv], method = cor_method)$estimate
),
error = function(e) {
if (identical(y, gene_expression) &
startsWith(e$message, prefix = 'system is computationally singular'))
return(suppressWarnings(
stats::cor(y, PSI, use = 'na.or.complete', method = cor_method)
))
return(e)
}
)
return(as.numeric(res))
}), by = .(`Event Type`, IHEC, Variability), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]],
aggregated_dt[, lapply(.SD, function(y) {
rmv <- is.na(y) | is.na(PSI) | is.na(gene_expression)
res <- NA
if ((length(rmv) - sum(rmv)) >= minimum_events)
res <- tryCatch(
suppressWarnings(
ppcor::pcor.test(PSI[!rmv], y[!rmv], gene_expression[!rmv], method = cor_method)$estimate
),
error = function(e) {
if (identical(y, gene_expression) &
startsWith(e$message, prefix = 'system is computationally singular'))
return(suppressWarnings(
stats::cor(y, PSI, use = 'na.or.complete', method = cor_method)
))
return(e)
}
)
return(as.numeric(res))
}), by = .(`Event Type`, IHEC), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]][, Variability := 'All'])),
idcol = 'Correlation Coefficient')
# set some information
pcor_dt[, `Correlation Coefficient` := as.factor(`Correlation Coefficient`)]
pcor_dt[, `Partial Correlation` := TRUE]
# bind correlation with partial correlation
all_cor <- rbind(cor_dt, pcor_dt)
# set variability as factor
all_cor[, Variability := factor(Variability,
levels = c('Low Variability', 'All', 'High Variability'))]
# write table
fwrite(all_cor, corr_file)
}
```
```{r, nicer-names}
# this chunk just changes some names in the dt so the plots look nicer
melt_cor_dt <- melt(all_cor, id.vars = c('IHEC', 'Event Type', 'Variability', 'Partial Correlation', 'Correlation Coefficient'), variable.name = 'feature', value.name = 'Correlation w/ PSI')
melt_cor_dt[, `Event Type`:=factor(`Event Type`, levels=c('SE', 'RI'))]
melt_cor_dt[`Event Type` == 'SE', feature := gsub('other_region', 'Intron', feature, fixed=TRUE)]
melt_cor_dt[`Event Type` == 'RI', feature := gsub('other_region', 'Exon', feature, fixed=TRUE)]
melt_cor_dt[`Event Type` == 'SE', feature := gsub('event_name', 'SE', feature, fixed=TRUE)]
melt_cor_dt[`Event Type` == 'RI', feature := gsub('event_name', 'RI', feature, fixed=TRUE)]
melt_cor_dt[, feature:=sub(';', '\n', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub(',', '\n', feature, fixed = TRUE)]
melt_cor_dt[, feature:=gsub('_', ' ', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('distance ', 'Distance\n', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('upstream Intron', 'Intron\nUpstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('upstream Exon', 'Exon\nUpstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('downstream Intron', 'Intron\nDownstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('downstream Exon', 'Exon\nDownstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('gene expression', 'Gene\nExpression', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('width', 'Width', feature, fixed = TRUE)]
aggregate_median_by <- c('feature', 'Event Type', 'Variability', 'Partial Correlation', 'Correlation Coefficient')
melt_cor_dt[, median_cor:=median(`Correlation w/ PSI`, na.rm = TRUE), by = aggregate_median_by]
melt_cor_dt[, mean_cor:=mean(`Correlation w/ PSI`, na.rm = TRUE), by = aggregate_median_by]
melt_cor_dt[, sd_cor:=sd(`Correlation w/ PSI`, na.rm = TRUE), by = aggregate_median_by]
melt_cor_dt[, relevant:=any(abs(median_cor) > .1), by = aggregate_median_by]
melt_cor_dt[, Variability:=sub(' Variability', '', Variability, fixed = TRUE)]
melt_cor_dt[, Variability:=factor(Variability, levels = c('Low', 'All', 'High'))]
```
```{r, top10-overall}
top_n <- 10
# get the top n correlations with PSI from the feature vector
tmp_cor_dt <- melt_cor_dt[`Variability` == 'All' & `Correlation Coefficient` == 'pearson' & `Partial Correlation` == 'TRUE']
tmp_cor_dt[order(-abs(median_cor)), .(feature=unique(feature)[1:top_n]), by=.(`Event Type`)]
# annotate epigenetic vs intrinsic features
tmp_cor_dt[, 'Feature Type' := ifelse(Reduce(`|`, lapply(c(histone_marks, 'DNAm'), function(x) startsWith(feature, x))), 'Epigenetic', 'Intrinsic')]
tmp_cor_dt[feature %in% c('max promoter', 'summed enhancer'), `Feature Type` := 'Epigenetic']
plot_dt <- merge(tmp_cor_dt, tmp_cor_dt[order(-abs(median_cor)), .(feature=unique(feature)[1:top_n]), by=.(`Event Type`)])
# Create a vector of colors for ggplot, defaulting to dark grey
plot_colors <- setNames(rep("dark grey", plot_dt[, uniqueN(feature)]), plot_dt[, unique(feature)])
for(name in names(mark_hex_colors)) {
inds <- startsWith(names(plot_colors), name)
plot_colors[inds] <- mark_hex_colors[name]
}
# plot
ggplot(plot_dt,
aes(y = `Correlation w/ PSI`,
x = tidytext::reorder_within(feature, -median_cor, `Event Type`),
color = feature)) + #`Feature Type`)) +
geom_hline(yintercept = 0, color = "light grey", linetype = "dashed") +
geom_boxplot() +
labs(x = NULL, #'Aggregated Signal or Statistic',
title = NULL#, paste('Top', top_n, 'Globally Correlated Features')
) +
tidytext::scale_x_reordered() +
facet_wrap(. ~ `Event Type`, scales="free", nrow=2) +
scale_color_manual(values = plot_colors) +
theme_bw(base_size=12) + theme(legend.position = 'none', strip.background = element_rect(fill = 'white'))
ggsave(filename = file.path(plot_dir, paste0('top', top_n, '_overall.png')), width = 10, height = 5)
ggsave(filename = file.path(plot_dir, paste0('top', top_n, '_overall.svg')), width = 10, height = 5)
```
```{r, comparison-plots}
# make plots comparing different measures and variability
for (color_by in c('Correlation Coefficient', 'Partial Correlation', 'Variability')) {
# aggregate the medians by the relevant columns
find_relevant_by <- aggregate_median_by[aggregate_median_by != color_by]
# remove column from earlier iteration
melt_cor_dt[, plot_relevant:=NULL]
# set which columns are relevant
melt_cor_dt[, plot_relevant:=any(relevant), by = mget(find_relevant_by)]
# get the subset of the table that is relevant depending on what to compare
not_color_by <- TRUE
if (color_by == 'Partial Correlation') {
not_color_by <- melt_cor_dt[, `Correlation Coefficient` == 'pearson' & `Variability` == 'All']
} else if (color_by == 'Correlation Coefficient') {
not_color_by <- melt_cor_dt[, `Partial Correlation` == 'TRUE' & `Variability` == 'All']
} else if (color_by == 'Variability') {
not_color_by <- melt_cor_dt[, `Partial Correlation` == 'TRUE' & `Correlation Coefficient` == 'pearson']
}
# get the number of levels we need to color_by
color_levels <- melt_cor_dt[not_color_by, uniqueN(get(color_by))]
tmp_cor_dt <- melt_cor_dt[not_color_by & !is.na(`Correlation w/ PSI`), if(uniqueN(get(color_by)) == color_levels) .SD, by=.(`Event Type`, feature)]
if (color_by == 'Variability') {
# for variablity make another plot sorted by the greatest differences
median_dt <- dcast(tmp_cor_dt[, .(median_cor=unique(median_cor)), by=.(`Event Type`, feature, `Variability`)], `Event Type` + feature ~ `Variability`, value.var = 'median_cor')
median_dt[, diff_col:=abs(`Low`-`High`)]
sorted_idx <- median_dt[, order(-diff_col)]
p <- ggplot(merge(tmp_cor_dt, median_dt[sorted_idx, .(feature=feature[1:top_n], diff_col=diff_col[1:top_n], all_median=All[1:top_n]), by=.(`Event Type`)]),
aes(y = `Correlation w/ PSI`,
x = tidytext::reorder_within(feature, -diff_col, `Event Type`),
fill = get(color_by))) +
theme_bw() + geom_hline(yintercept = 0, color = "darkgrey", linetype = "dashed") +
geom_boxplot() +
labs(x = 'Aggregated Signal or Statistic') +
tidytext::scale_x_reordered() +
facet_wrap(. ~ `Event Type`, scales="free", nrow=2) +
theme(legend.position = 'bottom') +
scale_fill_manual(values = variability_colors[1:color_levels], name = color_by)
ggsave(filename = file.path(plot_dir, paste0('max diff ', color_by, '.pdf')), p, width = 10, height = 7)
print(p)
}
# always make one plot sorted by the median
p <- ggplot(merge(tmp_cor_dt, tmp_cor_dt[order(-abs(median_cor)), .(feature=unique(feature)[1:top_n]), by=.(`Event Type`)]),
aes(y = `Correlation w/ PSI`,
x = tidytext::reorder_within(feature, -median_cor, `Event Type`),
color = get(color_by))) +
theme_bw() + geom_hline(yintercept = 0, color = "darkgrey", linetype = "dashed") +
geom_boxplot() +
labs(x = 'Aggregated Signal or Statistic') +
tidytext::scale_x_reordered() +
facet_wrap(. ~ `Event Type`, scales="free", nrow=2) +
theme(legend.position = 'bottom', strip.background = element_rect(fill = 'white')) +
scale_color_manual(values = variability_colors, name = color_by)
if (color_levels == 2){
p <- p + scale_color_manual(values = unname(variability_colors[c(1,3)]), name = color_by)}
if (color_by == "Variability")
ggsave(filename = file.path(plot_dir, paste0('top', top_n, color_by, '.pdf')), p, width = 7, height = 4.5)
print(p)
}
```
```{r, cor-heatmap}
# make heatmap with the correlation per feature
heatmap_dt <- melt_cor_dt[`Partial Correlation` == TRUE & `Correlation Coefficient` == 'pearson' & `Variability` == 'All' & grepl(paste(c(histone_marks, "DNAm"), collapse = "|"), x = feature)]
heatmap_dt[, mark:=tstrsplit(x = feature, split = "\n", fixed = TRUE, keep = 1)]
heatmap_dt[, region:=sub(pattern = ".*?\n", replacement = "", x = feature)]
ggplot(heatmap_dt, aes(x = factor(region, levels = c("Intron\nUpstream", "SE", "Intron\nDownstream", "Exon\nUpstream", "RI", "Exon\nDownstream")),
y = factor(mark, levels = rev(c("H3K9me3", "H3K27me3", "H3K4me3", "H3K4me1", "H3K27ac", "H3K36me3", "DNAm"))),
fill = mean_cor)) +
geom_tile() +
ggh4x::facet_grid2( ~ `Event Type`, scales = 'free') +
theme_bw() +
theme(strip.background = element_rect(fill = 'white'), legend.position = "bottom") +
scale_fill_gradient2(
low = scales::muted("blue"),
mid = "white",
high = scales::muted("red")) +
# add text into the tile with the value of the mean and sd_cor in parantheses
geom_text(aes(label = paste0(round(mean_cor, 2), " (\u00B1", round(sd_cor, 2), ")")), size=3) +
labs(y = 'Mark', x = 'Region', fill = 'Partial Correlation')
ggsave(filename = file.path(plot_dir, "epigenetic_cor.pdf"), width = 7, height = 4.5)
```